In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the toy COVID dataset from a CSV file in the working directory.
df = pd.read_csv('covid_toy.csv')

In [4]:
# Preview the first five rows (five is head()'s default).
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
# Count how many rows fall into each 'cough' category.
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [6]:
# Number of missing values in each column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; 'has_covid' is the target and every
# other column is a feature. random_state is pinned so the split is the same
# on every run (Restart Kernel -> Run All stays reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the training feature frame.
X_train

Unnamed: 0,age,gender,fever,cough,city
24,13,Female,100.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
35,82,Female,102.0,Strong,Bangalore
2,42,Male,101.0,Mild,Delhi
47,18,Female,104.0,Mild,Bangalore
...,...,...,...,...,...
22,71,Female,98.0,Strong,Kolkata
60,24,Female,102.0,Strong,Bangalore
72,83,Female,101.0,Mild,Kolkata
9,64,Female,101.0,Mild,Delhi


In [9]:
# Display the test feature frame.
X_test

Unnamed: 0,age,gender,fever,cough,city
95,12,Female,104.0,Mild,Bangalore
69,73,Female,103.0,Mild,Delhi
41,82,Male,,Mild,Kolkata
27,33,Female,102.0,Strong,Delhi
51,11,Female,100.0,Strong,Kolkata
63,10,Male,100.0,Mild,Bangalore
77,8,Female,101.0,Mild,Kolkata
18,64,Female,98.0,Mild,Bangalore
13,64,Male,102.0,Mild,Bangalore
83,17,Female,104.0,Mild,Kolkata


In [10]:
# Display the training target ('has_covid') series.
y_train

24     No
97     No
35     No
2      No
47     No
     ... 
22    Yes
60    Yes
72     No
9      No
70     No
Name: has_covid, Length: 80, dtype: object

In [11]:
# Display the test target ('has_covid') series.
y_test

95     No
69     No
41    Yes
27     No
51    Yes
63     No
77     No
18    Yes
13    Yes
83     No
73    Yes
79    Yes
88     No
21    Yes
19    Yes
50    Yes
54    Yes
29    Yes
59    Yes
7     Yes
Name: has_covid, dtype: object

# 1. Aam Zindagi (the ordinary way: transform each column by hand)

In [12]:
# Impute missing 'fever' values with the column mean (SimpleImputer's default
# strategy). The imputer is fitted on the training data only.
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# Test data: reuse the statistic learned from the training set. Refitting on
# the test set would impute with the test-set mean, leaking test information
# and making train and test preprocessing inconsistent.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [13]:
# Ordinal-encode 'cough' with an explicit category order: Mild -> 0, Strong -> 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# Test data: apply the already-fitted encoder rather than refitting it, so the
# test set is always encoded with exactly the mapping used for training.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [14]:
# One-hot encode 'gender' and 'city', dropping the first category of each
# feature. sparse_output=False returns a dense ndarray; it replaces the
# `sparse` argument, which was deprecated in scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder(drop='first', sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# Test data: transform only. Refitting on the test set would re-derive the
# categories from the test rows, so the output columns could differ from the
# training columns in number or in meaning.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape



(80, 4)

In [15]:
# Keep whatever is left (the 'age' column) after removing the columns that are
# handled by the imputer and the encoders, as a 2-D NumPy array.
encoded_elsewhere = ['gender', 'fever', 'cough', 'city']
X_train_age = X_train.drop(columns=encoded_elsewhere).to_numpy()

# Same extraction for the test data.
X_test_age = X_test.drop(columns=encoded_elsewhere).to_numpy()

X_train_age.shape

(80, 1)

In [16]:
# Stitch the separately transformed pieces back together column-wise, in the
# order: age, fever, gender/city one-hot columns, cough.
train_parts = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
X_train_transformed = np.concatenate(train_parts, axis=1)

# Same column order for the test data.
test_parts = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)
X_test_transformed = np.concatenate(test_parts, axis=1)

X_train_transformed.shape

(80, 7)

# 2. Mentos Zindagi (the smart way: one ColumnTransformer does it all)

In [17]:
from sklearn.compose import ColumnTransformer

In [18]:
# One ColumnTransformer replaces the manual per-column steps:
#   tnf1 - impute missing 'fever' values (SimpleImputer default strategy: mean)
#   tnf2 - ordinal-encode 'cough' as Mild -> 0, Strong -> 1
#   tnf3 - one-hot encode 'gender' and 'city', dropping the first category
# remainder='passthrough' keeps the columns not listed here unchanged.
# sparse_output=False replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
transformer = ColumnTransformer(transformers=[
    ('tnf1', SimpleImputer(), ['fever']),
    ('tnf2', OrdinalEncoder(categories=[['Mild','Strong']]), ['cough']),
    ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender','city'])
], remainder='passthrough')

In [19]:
# Fit the transformer on the training features and check the output shape.
train_encoded = transformer.fit_transform(X_train)
train_encoded.shape



(80, 7)

In [20]:
# Apply the already-fitted transformer to the test features (no refit) and
# check the output shape.
test_encoded = transformer.transform(X_test)
test_encoded.shape

(20, 7)