In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv("~/Data sets/covid_toy.csv")
df.shape

(100, 6)

In [4]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [80]:
df['cough'].value_counts()


cough
Mild      62
Strong    38
Name: count, dtype: int64

In [81]:
y_test

77     No
8      No
98     No
13    Yes
86    Yes
62    Yes
22    Yes
93    Yes
17     No
41    Yes
64     No
31     No
55    Yes
52    Yes
37     No
32    Yes
14     No
89     No
29    Yes
39     No
Name: has_covid, dtype: object

# fever ---> SimpleImputer
# cough ---> ordinalEncoder
# gender,city ---> OHE
# has_covid ---> labelEncoder

In [82]:
#train_test_split
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),
                                                         df['has_covid'],
                                                         test_size=0.2)

In [83]:
#SimpleImputer
trf1 = ColumnTransformer([('impute_fever',SimpleImputer(),[2])],remainder='passthrough')

In [84]:
#OrdinalEncoder
trf2 = ColumnTransformer([('Encode_cough',OrdinalEncoder(categories=[['Mild','Strong']]),[3])],remainder='passthrough')

In [85]:
#OnehotEncoding
trf3 = ColumnTransformer([('Encode_gender_city',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,4])],remainder='passthrough')

In [86]:
trf4 = ColumnTransformer([('scale',MinMaxScaler(),slice(0,10))])

In [87]:
trf5 = DecisionTreeClassifier()

In [88]:
#LabelEncoder
le = LabelEncoder()
y_train_hascovid = le.fit_transform(y_train)
y_test_hascovid = le.transform(y_test)

## PIPELINE

In [91]:
pipes = Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4),('trf5',trf5)])

In [92]:
# yadi hum kisi model ko train nahi kar rahe hote to fit_transform call karte,prantu yaha ek model "Decisiontreeclassifier" train ho raha hai
# isliye hum kewal Fit karenge taki predict kar paye
pipes.fit(x_train,y_train)

In [94]:
y_pred = pipes.predict(x_test)

In [95]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.45

In [96]:
y_pred

array(['No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No'],
      dtype=object)

In [97]:
y_test

75    Yes
78    Yes
69     No
25     No
81     No
42    Yes
34    Yes
60    Yes
66     No
47     No
20     No
82    Yes
21    Yes
26    Yes
17     No
27     No
22    Yes
19    Yes
94    Yes
71     No
Name: has_covid, dtype: object