In [2]:
from joblib import parallel_backend
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
import pandas as pd
import numpy as np
import pickle
from timeit import default_timer as timer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import cupy as cp

In [3]:
# Load the raw COVID-19 test records and preview the first five rows.
# NOTE(review): the recorded output echoes this read_csv line, which looks like
# the tail of a pandas warning (possibly mixed column dtypes) - TODO confirm.
csv_path = 'covid.csv'
df = pd.read_csv(csv_path)
print(df.head(5))

  df = pd.read_csv('covid.csv')


    test_date  cough  fever  sore_throat  shortness_of_breath  head_ache   
0  2020-11-12      0      0            0                    0          0  \
1  2020-11-12      0      1            0                    0          0   
2  2020-11-12      0      0            0                    0          0   
3  2020-11-12      0      0            0                    0          0   
4  2020-11-12      0      1            0                    0          0   

  corona_result age_60_and_above  gender         test_indication  
0      negative               No    male                   Other  
1      negative               No    male                   Other  
2      negative              Yes  female                   Other  
3      negative               No    male                   Other  
4      negative               No    male  Contact with confirmed  


In [4]:
# Keep only rows with a definitive outcome ('positive' or 'negative').
# `.copy()` makes the filtered result an independent frame, so later cells can
# recode or assign columns without SettingWithCopyWarning or writes that land
# on a temporary object instead of `df`.
df = df[df['corona_result'].isin(['positive', 'negative'])].copy()

In [5]:
# Symptom prevalence among confirmed cases: for each feature, the share of
# COVID-19-positive individuals for whom that feature equals 1.
# `test_indication` is reduced to a binary flag: 1 = contact with a confirmed
# case, 0 = any other indication ('Other', 'Abroad').
features = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'test_indication']

# Assign the recoded columns back to the frame rather than calling
# `df[col].replace(..., inplace=True)`: the chained in-place form mutates a
# temporary Series and is not guaranteed to update `df` itself.
df = df.assign(
  test_indication=df['test_indication'].replace({'Contact with confirmed': 1, 'Other': 0, 'Abroad': 0}),
  corona_result=df['corona_result'].replace({'positive': 1, 'negative': 0}),
)

# The positive-case subset is the same for every feature, so build it once.
positive_cases = df[df['corona_result'] == 1]
n_positive = len(positive_cases)

# Values are printed as fractions in [0, 1], not multiplied by 100.
print("Percentage of COVID-19 Patients that had:")
for feature in features:
  n_with_feature = int((positive_cases[feature] == 1).sum())
  # Guard against a frame with no positive cases instead of dividing by zero.
  share = n_with_feature / n_positive if n_positive else float('nan')
  print(f'{feature}: {share}')

Percentage of COVID-19 Patients that had:
cough: 0.19237017762190292
fever: 0.22831994569521438
sore_throat: 0.09267111664215409
shortness_of_breath: 0.03375042425613757
head_ache: 0.18797601538635592
test_indication: 0.4450503450616586


In [6]:
# Design matrix: the columns named in `features`.
X = df.loc[:, features]
# Target vector: the `corona_result` column.
y = df.loc[:, 'corona_result']

# Search space for hyperparameter tuning. Each numeric grid is an evenly spaced
# range cast to integers:
#   min_samples_split -> [2, 5, 9, 12, 16, 20]
#   min_samples_leaf  -> [1, 5, 10]
#   max_depth         -> [5, 10, 15, 20]
parameters = dict(
    min_samples_split=np.linspace(2, 20, num=6, dtype=int),
    min_samples_leaf=np.linspace(1, 10, num=3, dtype=int),
    max_depth=np.linspace(5, 20, num=4, dtype=int),
    max_features=['sqrt', 'log2', None],
    criterion=['gini', 'entropy'],
)

In [19]:
# Testing different classifiers
# don't run this cell for hyperparameter tuning
def model_eval(model):
  """Fit ``model`` on the training split and print its test-set metrics.

  Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
  so those must be defined before this function is called.

  Prints the wall-clock fit time, the accuracy (fraction of test rows whose
  predicted label equals the true label), the classification report and the
  confusion matrix. Returns nothing.

  Args:
    model: a classifier exposing ``fit(X, y)`` and ``predict(X)``.
  """
  # Only the fit is timed; prediction and reporting are excluded.
  start = timer()
  model.fit(X_train, y_train)
  end = timer()

  # Predict once and reuse the result for every metric. Calling model.score()
  # as well would run a second full prediction pass over the test set just to
  # obtain the same accuracy figure.
  y_pred = model.predict(X_test)
  acc = float(np.mean(np.asarray(y_pred) == np.asarray(y_test)))

  print(f"Model evaluation for {type(model).__name__}")
  print(f"Training time: {end-start}")
  print(f'Model accuracy: {acc}')
  print("Classification report:")
  print(classification_report(y_test, y_pred))
  print("Confusion matrix:")
  print(confusion_matrix(y_test, y_pred))

# One seed shared by the train/test split and by every estimator that accepts
# `random_state`, so repeated runs of this cell produce the same numbers.
RANDOM_STATE = 42

# list of models tested (the naive Bayes estimators take no seed)
model_list = [
  RandomForestClassifier(random_state=RANDOM_STATE),
  DecisionTreeClassifier(random_state=RANDOM_STATE),
  XGBClassifier(random_state=RANDOM_STATE),
  GradientBoostingClassifier(random_state=RANDOM_STATE),
  HistGradientBoostingClassifier(random_state=RANDOM_STATE),
  LGBMClassifier(random_state=RANDOM_STATE),
  SGDClassifier(random_state=RANDOM_STATE),
  GaussianNB(),
  BernoulliNB(),
  CategoricalNB(),
]
# Hold out 30% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

# The models are evaluated one after another. The joblib backend selected here
# is only picked up by estimators that parallelise internally through joblib;
# it does not run the loop iterations concurrently.
with parallel_backend('multiprocessing', n_jobs=5):
  for model in model_list:
    model_eval(model)

Model evaluation for RandomForestClassifier
Training time: 11.670600999999806
Model accuracy: 0.9353305841212023
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    743951
           1       0.62      0.56      0.59     66463

    accuracy                           0.94    810414
   macro avg       0.79      0.77      0.78    810414
weighted avg       0.93      0.94      0.93    810414

Confusion matrix:
[[720661  23290]
 [ 29119  37344]]
Model evaluation for DecisionTreeClassifier
Training time: 0.4017521999994642
Model accuracy: 0.9353305841212023
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    743951
           1       0.62      0.56      0.59     66463

    accuracy                           0.94    810414
   macro avg       0.79      0.77      0.78    810414
weighted avg       0.93      0.94      0.93    810414

Confusion matrix:
[[7

In [None]:
def model_eval(model):
    """CuPy-array variant: fit ``model`` and print its test-set metrics.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Features and labels are handed to the model as CuPy arrays; the true labels
    and the predictions are copied back to NumPy before the classification
    report and confusion matrix are computed. Returns nothing.

    Defining this function rebinds the name ``model_eval``, replacing any
    earlier definition in the session.
    """
    # The timed region covers the array conversion and the fit.
    start = timer()
    train_features, train_labels = cp.asarray(X_train), cp.asarray(y_train)
    model.fit(train_features, train_labels)
    end = timer()

    test_features = cp.asarray(X_test)
    test_labels = cp.asarray(y_test)
    accuracy = model.score(test_features, test_labels)
    predictions = model.predict(test_features)

    print(f"Model evaluation for {type(model).__name__}")
    print(f"Training time: {end-start}")
    print(f'Model accuracy: {accuracy}')
    print("Classification report:")
    # The reporting functions receive NumPy copies of labels and predictions.
    labels_host, predictions_host = cp.asnumpy(y_test), cp.asnumpy(predictions)
    print(classification_report(labels_host, predictions_host))
    print("Confusion matrix:")
    print(confusion_matrix(labels_host, predictions_host))

# NOTE(review): this cell has no execution count and looks unable to run as
# written. To my knowledge CuPy is an array library and exposes no estimator
# namespaces such as `cp.random_forest`, `cp.tree`, `cp.xgboost`,
# `cp.ensemble`, `cp.experimental`, `cp.lightgbm`, `cp.linear_model` or
# `cp.naive_bayes`, so building this list would fail on attribute lookup.
# TODO confirm, and replace with estimators from a GPU-capable ML library.
# list of models tested
model_list = [cp.random_forest.RandomForestClassifier(), cp.tree.DecisionTreeClassifier(),
              cp.xgboost.XGBClassifier(), cp.ensemble.GradientBoostingClassifier(),
              cp.experimental.histogram_gradient_boosting.HistGradientBoostingClassifier(),
              cp.lightgbm.LGBMClassifier(), cp.linear_model.SGDClassifier(),
              cp.naive_bayes.GaussianNB(), cp.naive_bayes.BernoulliNB(),
              cp.naive_bayes.CategoricalNB()]
# Convert X and y to CuPy arrays, then hold out 30% as the test split (seed 42).
# NOTE(review): verify that train_test_split accepts CuPy arrays and that X's
# dtypes can be converted by cp.asarray - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(cp.asarray(X), cp.asarray(y), test_size=0.3, random_state=42)

# Using joblib to parallelize the processing
# (the loop itself evaluates the models one after another inside the context)
with parallel_backend('multiprocessing', n_jobs=5):
  for model in model_list:
    model_eval(model)