  ## Importing Dependencies

In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw test records from the CSV in the working directory and preview
# the first rows.
# NOTE(review): the symptom columns are reported as dtype `object` by the
# `covid_data.info()` call further down, presumably because of the "None"
# placeholder strings counted later. Consider passing explicit dtypes (or
# `low_memory=False`) to `read_csv` so each column is parsed consistently
# -- TODO confirm.
covid_data = pd.read_csv("corona_tested_individuals_ver_006.english.csv")
covid_data.head()

  covid_data = pd.read_csv("corona_tested_individuals_ver_006.english.csv")


Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-04-30,0,0,0,0,0,negative,,female,Other
1,2020-04-30,1,0,0,0,0,negative,,female,Other
2,2020-04-30,0,1,0,0,0,negative,,male,Other
3,2020-04-30,1,0,0,0,0,negative,,female,Other
4,2020-04-30,1,0,0,0,0,negative,,male,Other


In [3]:
# Count missing (NaN) entries per column.
# Note: this only detects real NaN values; the literal string "None" that the
# symptom columns contain (counted in a later cell) is not reported here.
covid_data.isnull().sum()

test_date              0
cough                  0
fever                  0
sore_throat            0
shortness_of_breath    0
head_ache              0
corona_result          0
age_60_and_above       0
gender                 0
test_indication        0
dtype: int64

In [4]:
# Count rows that are exact copies (all columns equal) of an earlier row.
covid_data.duplicated().sum()

272068

In [5]:
# Keep only the first occurrence of each distinct row.
# NOTE(review): every column is a low-cardinality field, so identical rows may
# be different individuals rather than repeated records. This step removes
# 272,068 rows and leaves 6,780 -- confirm that is intended.
covid_data = covid_data.drop_duplicates()

In [6]:
# Display the de-duplicated frame (long frames are shown truncated).
covid_data

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-04-30,0,0,0,0,0,negative,,female,Other
1,2020-04-30,1,0,0,0,0,negative,,female,Other
2,2020-04-30,0,1,0,0,0,negative,,male,Other
4,2020-04-30,1,0,0,0,0,negative,,male,Other
6,2020-04-30,1,1,0,0,0,negative,,male,Abroad
...,...,...,...,...,...,...,...,...,...,...
278830,2020-03-11,0,1,0,0,0,positive,,,Contact with confirmed
278831,2020-03-11,1,1,1,0,1,positive,,,Abroad
278833,2020-03-11,0,1,1,0,0,positive,,,Abroad
278834,2020-03-11,1,0,1,0,1,positive,,,Abroad


In [7]:
# List the column names currently present in the frame.
covid_data.columns

Index(['test_date', 'cough', 'fever', 'sore_throat', 'shortness_of_breath',
       'head_ache', 'corona_result', 'age_60_and_above', 'gender',
       'test_indication'],
      dtype='object')

In [8]:
# Drop the categorical columns age_60_and_above, gender and test_indication;
# they are not used as model features below.
covid_data = covid_data.drop(columns=["age_60_and_above", "gender", "test_indication"])

In [9]:
# Summarise the row count, dtypes and non-null counts of the remaining columns.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6780 entries, 0 to 278835
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   test_date            6780 non-null   object
 1   cough                6780 non-null   object
 2   fever                6780 non-null   object
 3   sore_throat          6780 non-null   object
 4   shortness_of_breath  6780 non-null   object
 5   head_ache            6780 non-null   object
 6   corona_result        6780 non-null   object
dtypes: object(7)
memory usage: 423.8+ KB


In [10]:
# The test date is not used as a feature: remove it, then preview the result.
covid_data = covid_data.drop(columns=["test_date"])
covid_data.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result
0,0,0,0,0,0,negative
1,1,0,0,0,0,negative
2,0,1,0,0,0,negative
4,1,0,0,0,0,negative
6,1,1,0,0,0,negative


In [11]:
# Encode the test outcome as integers: negative -> 0, positive -> 1, other -> 2.
#
# The encoded column is assigned back to the frame rather than calling
# `replace(..., inplace=True)` on the selected column. That chained in-place
# call operates on an intermediate object and is not guaranteed to update
# `covid_data` (pandas warns about it, and with copy-on-write it is a no-op).
#
# `astype("int64")` gives the label column a proper integer dtype and raises
# immediately if a label outside `mapping` is still present as a string.
mapping = {"negative": 0, "positive": 1, "other": 2}
covid_data["corona_result"] = (
    covid_data["corona_result"].replace(mapping).astype("int64")
)

In [12]:
# Inspect the frame after encoding the result column.
covid_data


Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,0,1,0,0,0,0
4,1,0,0,0,0,0
6,1,1,0,0,0,0
...,...,...,...,...,...,...
278830,0,1,0,0,0,1
278831,1,1,1,0,1,1
278833,0,1,1,0,0,1
278834,1,0,1,0,1,1


In [13]:
# Count the literal string "None" in each symptom column. `isnull()` does not
# detect these because they are strings rather than NaN.
# A single loop replaces five copy-pasted print statements, so the printed
# label always matches the column actually inspected.
for column in ["cough", "fever", "sore_throat", "shortness_of_breath", "head_ache"]:
    none_count = (covid_data[column] == "None").sum()
    print("no. of None values in", column, none_count)


no. of None values in cough 52
no. of None values in fever 52
no. of None values in sore_throat 1
no. of None values in shortness_Of_breath 1
no. of None values in head_ache 1


In [14]:
# Rebind `mapping` (it previously held the corona_result encoding): the "None"
# placeholder found in the symptom columns is to be replaced with 0.
# NOTE(review): this treats an unrecorded symptom as "symptom absent" --
# confirm that this imputation is acceptable.
mapping ={"None": 0}

In [15]:
# Replace the "None" placeholder in all symptom columns in one pass and assign
# the result back to the frame. Calling `replace(..., inplace=True)` on a
# selected column acts on an intermediate object and is not guaranteed to
# update `covid_data` (a no-op under pandas copy-on-write).
# `pd.to_numeric` then turns the object-dtype columns into real numeric
# columns and raises if any non-numeric string is left behind.
symptom_columns = ["cough", "fever", "sore_throat", "shortness_of_breath", "head_ache"]
covid_data[symptom_columns] = (
    covid_data[symptom_columns]
    .replace(mapping)
    .apply(pd.to_numeric)
)

### Train/Test Split, Model Training and Evaluation

In [17]:
# Separate the label from the model inputs (every remaining column except the label).
y = covid_data["corona_result"]
X = covid_data.drop(columns=["corona_result"])

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit a baseline logistic-regression classifier (default settings) on the
# training split.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [25]:
# Predict the encoded result for every held-out row and print the label array.
pred = model.predict(X_test)
print(pred)

[1 0 1 ... 0 1 1]


In [26]:
# Accuracy of the baseline: the fraction of held-out rows whose predicted label
# equals the true label.
accuracy_score(y_test, pred)

0.616519174041298

In [29]:
# function for training the model
def model_trainer(data, models):
    """Fit each model on the training split and print its test accuracy.

    Parameters
    ----------
    data : tuple
        ``(X_train, X_test, y_train, y_test)``, in the order returned by
        ``train_test_split``.
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.

    Raises
    ------
    ValueError
        If ``data`` does not unpack into exactly four elements.
    """
    # Use the splits handed in by the caller. Previously `data` was ignored and
    # the function silently read the notebook-level globals of the same names.
    X_train, X_test, y_train, y_test = data
    for model in models:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        print(f'model: {model}, accuracy: {accuracy}')

In [31]:
# Bundle the four splits into the single tuple that is passed to model_trainer.
data = X_train, X_test, y_train, y_test

# Candidate classifiers. The tree-based models are randomised, so they are
# seeded (same seed as the train/test split) to make the reported accuracies
# reproducible across Restart & Run All.
svc = SVC()
forest = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

models = [svc, forest, dt]

In [32]:
# Fit every candidate model and print its accuracy on the held-out test split.
# (model_trainer only prints the scores; it does not select or return a best model.)
model_trainer(data=data, models=models)

model: SVC(), accuracy: 0.616519174041298
model: RandomForestClassifier(), accuracy: 0.616519174041298
model: DecisionTreeClassifier(), accuracy: 0.616519174041298
