In [1]:
import pandas as pd
# Show every column when a wide frame is displayed. Use the canonical option
# key: the dotted spelling only worked because pandas treats the key as a
# regular expression, where "." happens to match the underscore.
pd.set_option("display.max_columns", None)

## Loading the raw data into a DataFrame

### Shape of the data

In [2]:
# Read the raw CSV into `data`; the trailing expression displays (rows, columns).
# NOTE(review): pandas >= 2.0 parses the literal text "None" as a missing value
# by default, and Severity appears to carry a "None" category (a Severity_None
# column is produced further down) - confirm the parse on newer pandas versions.
raw_data_path = "../Datasets/Raw-Data.csv"
data = pd.read_csv(raw_data_path)
data.shape

(16, 7)

### Sample Data values

In [3]:
# Preview the first rows of the raw table to see what each column holds.
data.head()

Unnamed: 0,Country,Age,Gender,Symptoms,Experiencing_Symptoms,Severity,Contact
0,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Yes
1,Italy,10-19,Female,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing","Pains,Nasal-Congestion,Runny-Nose",Moderate,No
2,Iran,20-24,Transgender,"Fever,Tiredness,Dry-Cough","Pains,Nasal-Congestion",Severe,Dont-Know
3,Republic of Korean,25-59,,"Fever,Tiredness",Pains,,
4,France,60+,,Fever,"Nasal-Congestion,Runny-Nose,Diarrhea",,


### Finding the count of missing values in the columns

In [4]:
# Print the number of missing (NaN) entries per column, one line per column.
# Each label carries its own tab padding so the counts line up in the output.
print("Column wise missing values count..\n")
missing_report_rows = (
    ("country: \t\t", "Country"),
    ("Age: \t\t\t", "Age"),
    ("Gender: \t\t", "Gender"),
    ("Symptoms: \t\t", "Symptoms"),
    ("Experiencing_Symptoms: \t", "Experiencing_Symptoms"),
    ("Severity: \t\t", "Severity"),
    ("Contact: \t\t", "Contact"),
)
for report_label, column_name in missing_report_rows:
    print(report_label, data[column_name].isna().sum())

Column wise missing values count..

country: 		 6
Age: 			 11
Gender: 		 13
Symptoms: 		 0
Experiencing_Symptoms: 	 5
Severity: 		 12
Contact: 		 13


### Total number of possible combinations

In [5]:
# Count the distinct non-null values in every column; the product of these
# counts is the number of rows a full cartesian product of them would have.
country, age, gender, symptoms, esymptoms, severity, contact = (
    len(data[column_name].dropna().unique())
    for column_name in (
        "Country",
        "Age",
        "Gender",
        "Symptoms",
        "Experiencing_Symptoms",
        "Severity",
        "Contact",
    )
)

total_combinations = country * age * gender * symptoms * esymptoms * severity * contact
print("Possible Number of Total Combination: ", total_combinations)

Possible Number of Total Combination:  316800


### Dropping the missing / null values and generating every combination of the remaining values

In [6]:
import itertools

# For each column keep only its distinct non-null values, in first-seen order.
columns = [
    data[column_name].dropna().unique().tolist()
    for column_name in (
        "Country",
        "Age",
        "Gender",
        "Symptoms",
        "Experiencing_Symptoms",
        "Severity",
        "Contact",
    )
]

# Every combination of those values becomes one row; itertools.product varies
# the last column fastest, which fixes the row order of the result.
all_combinations = list(itertools.product(*columns))
final_data = pd.DataFrame(all_combinations, columns=data.columns)

### Data shape after handling the missing values

In [7]:
# (rows, columns) of the generated table; the row count should equal the
# combination total printed earlier.
final_data.shape

(316800, 7)

In [8]:
# Preview the first generated rows; only the right-most columns change between
# neighbouring rows because the product varies the last column fastest.
final_data.head()

Unnamed: 0,Country,Age,Gender,Symptoms,Experiencing_Symptoms,Severity,Contact
0,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Yes
1,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,No
2,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Dont-Know
3,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,Yes
4,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,No


## Symptom analysis

### Extracting the symptoms

In [9]:
# Split each comma-separated Symptoms string into its individual symptom names.
symptoms_list = final_data['Symptoms'].str.split(',')

from collections import Counter
# Tally every symptom name; the Counter keys keep first-seen order, which
# determines the order in which the indicator columns are appended below.
symptoms_counter = Counter(name for names in symptoms_list for name in names)

# One 0/1 indicator column per symptom. str.get_dummies compares whole
# comma-delimited tokens, so a symptom whose name is a substring of another
# (or contains regex metacharacters) cannot be flagged by mistake, which the
# previous str.contains() substring/regex test allowed.
symptom_flags = final_data['Symptoms'].str.get_dummies(sep=',')
for symptom in symptoms_counter:
    final_data[symptom] = symptom_flags[symptom]

final_data

Unnamed: 0,Country,Age,Gender,Symptoms,Experiencing_Symptoms,Severity,Contact,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton
0,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Yes,1,1,1,1,1,0
1,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,No,1,1,1,1,1,0
2,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Dont-Know,1,1,1,1,1,0
3,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,Yes,1,1,1,1,1,0
4,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,No,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
316795,Other,60+,Transgender,None_Sympton,None_Experiencing,Severe,No,0,0,0,0,0,1
316796,Other,60+,Transgender,None_Sympton,None_Experiencing,Severe,Dont-Know,0,0,0,0,0,1
316797,Other,60+,Transgender,None_Sympton,None_Experiencing,,Yes,0,0,0,0,0,1
316798,Other,60+,Transgender,None_Sympton,None_Experiencing,,No,0,0,0,0,0,1


### Extracting the experienced symptoms

In [10]:
# Split each comma-separated Experiencing_Symptoms string into its individual names.
esymptoms_list = final_data['Experiencing_Symptoms'].str.split(',')

# Counter is already imported by the symptoms cell above. Its keys keep
# first-seen order, which determines the order of the new indicator columns.
esymptoms_counter = Counter(name for names in esymptoms_list for name in names)

# One 0/1 indicator column per experienced symptom. str.get_dummies compares
# whole comma-delimited tokens, so a name that is a substring of another (or
# contains regex metacharacters) cannot be flagged by mistake, which the
# previous str.contains() substring/regex test allowed.
esymptom_flags = final_data['Experiencing_Symptoms'].str.get_dummies(sep=',')
for esymptom in esymptoms_counter:
    final_data[esymptom] = esymptom_flags[esymptom]

final_data

Unnamed: 0,Country,Age,Gender,Symptoms,Experiencing_Symptoms,Severity,Contact,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,None_Experiencing
0,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Yes,1,1,1,1,1,0,1,1,1,1,0
1,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,No,1,1,1,1,1,0,1,1,1,1,0
2,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Mild,Dont-Know,1,1,1,1,1,0,1,1,1,1,0
3,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,Yes,1,1,1,1,1,0,1,1,1,1,0
4,China,0-9,Male,"Fever,Tiredness,Dry-Cough,Difficulty-in-Breath...","Pains,Nasal-Congestion,Runny-Nose,Diarrhea",Moderate,No,1,1,1,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316795,Other,60+,Transgender,None_Sympton,None_Experiencing,Severe,No,0,0,0,0,0,1,0,0,0,0,1
316796,Other,60+,Transgender,None_Sympton,None_Experiencing,Severe,Dont-Know,0,0,0,0,0,1,0,0,0,0,1
316797,Other,60+,Transgender,None_Sympton,None_Experiencing,,Yes,0,0,0,0,0,1,0,0,0,0,1
316798,Other,60+,Transgender,None_Sympton,None_Experiencing,,No,0,0,0,0,0,1,0,0,0,0,1


### Dropping the raw columns that were normalized and one-hot encoding the remaining categorical features

In [11]:
# The raw multi-valued text columns are redundant now that every symptom has
# its own indicator column. errors='ignore' keeps this cell re-runnable after
# final_data has already been replaced by the encoded frame.
final_data = final_data.drop(columns=['Symptoms', 'Experiencing_Symptoms'], errors='ignore')

# One-hot encode the remaining categorical columns, leaving Country as text.
# dtype is pinned to uint8 because pandas >= 2.0 would otherwise emit bool
# columns, which would no longer match the 0/1 symptom indicators or the
# numeric summaries and CSV produced further down.
dummies = pd.get_dummies(final_data.drop(columns='Country'), dtype='uint8')
dummies['Country'] = final_data['Country']
final_data = dummies
final_data.head()

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,None_Experiencing,Age_0-9,Age_10-19,Age_20-24,Age_25-59,Age_60+,Gender_Female,Gender_Male,Gender_Transgender,Severity_Mild,Severity_Moderate,Severity_None,Severity_Severe,Contact_Dont-Know,Contact_No,Contact_Yes,Country
0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,China
1,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,China
2,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,China
3,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,China
4,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,China


## Statistics

In [12]:
# Summary statistics for the indicator columns; the mean of a 0/1 column is
# the share of rows in which that indicator is set.
final_data.describe()

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,None_Experiencing,Age_0-9,Age_10-19,Age_20-24,Age_25-59,Age_60+,Gender_Female,Gender_Male,Gender_Transgender,Severity_Mild,Severity_Moderate,Severity_None,Severity_Severe,Contact_Dont-Know,Contact_No,Contact_Yes
count,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0,316800.0
mean,0.3125,0.5,0.5625,0.5,0.3125,0.0625,0.363636,0.545455,0.545455,0.363636,0.090909,0.2,0.2,0.2,0.2,0.2,0.333333,0.333333,0.333333,0.25,0.25,0.25,0.25,0.333333,0.333333,0.333333
std,0.463513,0.500001,0.496079,0.500001,0.463513,0.242062,0.481046,0.49793,0.49793,0.481046,0.28748,0.400001,0.400001,0.400001,0.400001,0.400001,0.471405,0.471405,0.471405,0.433013,0.433013,0.433013,0.433013,0.471405,0.471405,0.471405
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.5,1.0,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.25,0.25,0.25,0.25,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Number of generated rows per country.
country_counts = final_data["Country"].value_counts()
print("Country: ", country_counts)

Country:  Iran                  31680
China                 31680
France                31680
Republic of Korean    31680
Other                 31680
Italy                 31680
Other-EUR             31680
Spain                 31680
Germany               31680
UAE                   31680
Name: Country, dtype: int64


In [13]:
# Distribution of the 0/1 indicator for every age band.
for age_band in ("0-9", "10-19", "20-24", "25-59", "60+"):
    print("Age " + age_band + ": \n", final_data["Age_" + age_band].value_counts())

Age 0-9: 
 0    253440
1     63360
Name: Age_0-9, dtype: int64
Age 10-19: 
 0    253440
1     63360
Name: Age_10-19, dtype: int64
Age 20-24: 
 0    253440
1     63360
Name: Age_20-24, dtype: int64
Age 25-59: 
 0    253440
1     63360
Name: Age_25-59, dtype: int64
Age 60+: 
 0    253440
1     63360
Name: Age_60+, dtype: int64


In [22]:
# Distribution of the 0/1 indicator for each of these symptom columns.
for indicator_name in (
    "Dry-Cough",
    "Difficulty-in-Breathing",
    "Sore-Throat",
    "Nasal-Congestion",
    "Runny-Nose",
):
    print(indicator_name + " count: \n", final_data[indicator_name].value_counts())

Dry-Cough count: 
 1    178200
0    138600
Name: Dry-Cough, dtype: int64
Difficulty-in-Breathing count: 
 0    158400
1    158400
Name: Difficulty-in-Breathing, dtype: int64
Sore-Throat count: 
 0    217800
1     99000
Name: Sore-Throat, dtype: int64
Nasal-Congestion count: 
 1    172800
0    144000
Name: Nasal-Congestion, dtype: int64
Runny-Nose count: 
 1    172800
0    144000
Name: Runny-Nose, dtype: int64


In [23]:
# Distribution of the 0/1 indicator for each of these symptom columns.
for indicator_name in ("Fever", "Tiredness", "Pains", "Diarrhea"):
    print(indicator_name + " count: \n", final_data[indicator_name].value_counts())

Fever count: 
 0    217800
1     99000
Name: Fever, dtype: int64
Tiredness count: 
 0    158400
1    158400
Name: Tiredness, dtype: int64
Pains count: 
 0    201600
1    115200
Name: Pains, dtype: int64
Diarrhea count: 
 0    201600
1    115200
Name: Diarrhea, dtype: int64


In [24]:
# Distribution of the two "nothing reported" indicators; each entry pairs the
# label that is printed with the column that is counted.
for report_label, indicator_name in (
    ("None_Experiencing", "None_Experiencing"),
    ("No symptoms", "None_Sympton"),
):
    print(report_label + " count: \n", final_data[indicator_name].value_counts())

None_Experiencing count: 
 0    288000
1     28800
Name: None_Experiencing, dtype: int64
No symptoms count: 
 0    297000
1     19800
Name: None_Sympton, dtype: int64


## Storing the data

In [15]:
# Persist the encoded table to the working directory; the row index is left
# out and the header row is written (to_csv's default).
cleaned_data_path = 'Cleaned-Data.csv'
final_data.to_csv(cleaned_data_path, index=False)