In [2]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('data1.csv')
data.tail(10)

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
2742586,2020-03-11,0,0,0,0,0,negative,,male,Other
2742587,2020-03-11,0,0,0,0,0,negative,,male,Other
2742588,2020-03-11,0,0,0,0,0,negative,,female,Contact with confirmed
2742589,2020-03-11,0,0,0,0,0,negative,,male,Other
2742590,2020-03-11,0,0,0,0,0,negative,,male,Other
2742591,2020-03-11,0,0,0,0,0,negative,,female,Other
2742592,2020-03-11,0,0,0,0,0,negative,,female,Other
2742593,2020-03-11,0,0,0,0,0,other,,male,Other
2742594,2020-03-11,0,0,0,0,0,negative,,female,Other
2742595,2020-03-11,0,1,0,0,0,negative,,male,Other


In [4]:
data.info()
data.drop(['test_date'], axis = 1, inplace = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2742596 entries, 0 to 2742595
Data columns (total 10 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   test_date            object
 1   cough                int64 
 2   fever                int64 
 3   sore_throat          int64 
 4   shortness_of_breath  int64 
 5   head_ache            int64 
 6   corona_result        object
 7   age_60_and_above     object
 8   gender               object
 9   test_indication      object
dtypes: int64(5), object(5)
memory usage: 209.2+ MB


In [60]:
is_result = data.corona_result.map(lambda a: 0 if a == 'positive' else 1)
data['corona_result_flag'] = is_result
data.drop(['corona_result'], axis = 1 , inplace = True)
data.tail(10)


Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result_flag
2742586,0,0,0,0,0,,male,Other,1
2742587,0,0,0,0,0,,male,Other,1
2742588,0,0,0,0,0,,female,Contact with confirmed,1
2742589,0,0,0,0,0,,male,Other,1
2742590,0,0,0,0,0,,male,Other,1
2742591,0,0,0,0,0,,female,Other,1
2742592,0,0,0,0,0,,female,Other,1
2742593,0,0,0,0,0,,male,Other,1
2742594,0,0,0,0,0,,female,Other,1
2742595,0,1,0,0,0,,male,Other,1


In [61]:
categorical_columns = data.select_dtypes('object').columns
categorical_columns

Index(['age_60_and_above', 'gender', 'test_indication'], dtype='object')

In [62]:
data = pd.get_dummies(data, columns = categorical_columns)
data.tail()
# age_encoder = LabelEncoder()
# gender_encoder = LabelEncoder()
# test_ind_encoder = LabelEncoder()

# a = age_encoder.fit_transform(data['age_60_and_above'])
# b = gender_encoder.fit_transform(data['gender'])
# c = test_ind_encoder.fit_transform(data['test_indication'])

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result_flag,age_60_and_above_No,age_60_and_above_Yes,gender_female,gender_male,test_indication_Abroad,test_indication_Contact with confirmed,test_indication_Other
2742591,0,0,0,0,0,1,0,0,1,0,0,0,1
2742592,0,0,0,0,0,1,0,0,1,0,0,0,1
2742593,0,0,0,0,0,1,0,0,0,1,0,0,1
2742594,0,0,0,0,0,1,0,0,1,0,0,0,1
2742595,0,1,0,0,0,1,0,0,0,1,0,0,1


In [10]:
X = data.drop("corona_result_flag", axis = 1 )
y = data.corona_result_flag

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [25]:
chi_selector = SelectKBest(score_func=chi2, k=3)
features_for_training = chi_selector.fit_transform(X,y)

chi_selector = SelectKBest(score_func=chi2, k=5)
features_for_training = chi_selector.fit_transform(X,y)
X.columns[chi_selector.get_support(indices=True)]
features = X.columns[chi_selector.get_support(indices=True)].tolist()
features

['cough',
 'fever',
 'sore_throat',
 'head_ache',
 'test_indication_Contact with confirmed']

In [27]:
y.value_counts(normalize=True)

1    0.919429
0    0.080571
Name: corona_result_flag, dtype: float64

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [29]:
y_train.value_counts(normalize=True)

1    0.919429
0    0.080571
Name: corona_result_flag, dtype: float64

In [30]:
y_test.value_counts(normalize=True)

1    0.919428
0    0.080572
Name: corona_result_flag, dtype: float64

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=45)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, confusion_matrix, recall_score, precision_score)
# Create the Model Object
nb_classifier = GaussianNB()

# Fit Model to Data
nb_classifier.fit(X_train , y_train)

y_preds = nb_classifier.predict(X_test)
acc_score = accuracy_score(y_test, y_preds)
print("Accuracy Score = {}".format(acc_score))

confusion_matrix(y_test, y_preds)

Accuracy Score = 0.9176292569095019


array([[ 29084,  15292],
       [ 29890, 474254]], dtype=int64)

In [42]:
import pickle
filename = "model.pkl"
model = pickle.dump(nb_classifier, open(filename, "wb"))

In [43]:
# model = pickle.load(open('nb_model.pkl','rb'))


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state = 0)

clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression()

In [35]:
y_pred = clf.predict(X_test)
from sklearn import metrics
print("Mean squared error: %.2f" % np.mean((clf.predict(X_train) - y_train) ** 2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 

Mean squared error: 0.07
Root Mean Squared Error: 0.26520693595736705


In [36]:
acc_score = accuracy_score(y_test, y_preds)
print("Accuracy Score = {}".format(acc_score))


Accuracy Score = 0.8309359731641508


In [37]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [38]:
chi_selector = SelectKBest(score_func=chi2, k=3)
features_for_training = chi_selector.fit_transform(X,y)

chi_selector = SelectKBest(score_func=chi2, k=3)
features_for_training = chi_selector.fit_transform(X,y)
X.columns[chi_selector.get_support(indices=True)]
features = X.columns[chi_selector.get_support(indices=True)].tolist()
features

['fever', 'head_ache', 'test_indication_Contact with confirmed']

In [39]:
X_train

Unnamed: 0,cough,fever,sore_throat,head_ache,test_indication_Contact with confirmed
777537,0,1,0,1,1
1489317,0,0,0,0,0
1058998,0,0,0,0,0
2480565,0,0,0,0,0
1770874,0,0,0,0,0
...,...,...,...,...,...
2249467,0,0,0,0,0
963395,0,0,0,0,0
2215104,0,0,0,0,0
1484405,0,0,0,0,0


In [55]:
def predict_covid(cough: int,
                 fever: int,
                 sore_throat: int,
                 short_breath: int,
                 headache: int,
                 under_60: int,
                 over_60: int,
                 is_female: int,
                 is_male: int,
                 gone_abroad: int,
                 confirmed_contact:int,
                 other_indication: int):
    """
    Predict if Patient is covid positive or not
    """
    global result
    data = [[cough, fever, sore_throat, short_breath, headache, 
            under_60, over_60, is_female, is_male, gone_abroad, confirmed_contact, other_indication]]
    
    result = nb_classifier.predict_proba(data)[0][1]
    message = "The probabilty of having Covid-19 is {:.2f}"
    return message.format(result)

predict_covid(
    cough=1,
    fever=1,
    sore_throat=1,
    short_breath=1,
    headache=1,
    under_60=1,
    over_60=1,
    is_female=1,
    is_male=1,
    gone_abroad=0,
    confirmed_contact=1,
    other_indication=1
)

'The probabilty of having Covid-19 is 0.00'