In [None]:
# Mount Google Drive at /content/gdrive (requires the Google Colab runtime).
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import pandas as pd

# Location of the tested-individuals CSV on the mounted Drive.
csv_path = '/content/gdrive/MyDrive/data/corona_tested_individuals_ver_0083.english.csv'
data = pd.read_csv(csv_path)

In [None]:
#Import required libraries
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from matplotlib.collections import PathCollection
from statsmodels.graphics.gofplots import qqplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from yellowbrick.classifier import PrecisionRecallCurve, ROCAUC, ConfusionMatrix
from yellowbrick.style import set_palette
from yellowbrick.model_selection import LearningCurve, FeatureImportances
from yellowbrick.contrib.wrapper import wrap

# --- Libraries Settings ---

# Use seaborn's 'whitegrid' style and render matplotlib figures at 100 dpi.
sns.set_style('whitegrid')
plt.rcParams['figure.dpi']=100

In [None]:
# Preview the first rows of the freshly loaded dataset.
data.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,11/12/2020,0,0,0,0,0,negative,No,male,Other
1,11/12/2020,0,1,0,0,0,negative,No,male,Other
2,11/12/2020,0,0,0,0,0,negative,Yes,female,Other
3,11/12/2020,0,0,0,0,0,negative,No,male,Other
4,11/12/2020,0,1,0,0,0,negative,No,male,Contact with confirmed


In [None]:
#Feature Selection

import scipy.stats as stats
from scipy.stats import chi2_contingency

# Data Modeling

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model Evaluation & saving the model

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score, accuracy_score, precision_score, f1_score
import pickle

In [None]:
# Report the number of distinct (non-null) values held by every column.
for col_name, n_unique in data.nunique().items():
    print(f"Number of unique classes in '{col_name}': {n_unique}")

Number of unique classes in 'test_date': 64
Number of unique classes in 'cough': 2
Number of unique classes in 'fever': 2
Number of unique classes in 'sore_throat': 2
Number of unique classes in 'shortness_of_breath': 2
Number of unique classes in 'head_ache': 2
Number of unique classes in 'corona_result': 3
Number of unique classes in 'age_60_and_above': 2
Number of unique classes in 'gender': 2
Number of unique classes in 'test_indication': 3


In [None]:
# Keep only rows whose test result is not 'other', leaving a two-class target.
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed in later cells do not raise SettingWithCopyWarning.
data = data.loc[data['corona_result'] != 'other'].copy()
data['corona_result'].unique()

array(['negative', 'positive'], dtype=object)

In [None]:
# Label Encoding

# Each two-valued text column is replaced by 0/1 codes. Series.map turns any
# value absent from its mapping (including missing values) into NaN.
binary_codes = {
    'corona_result': {'negative': 0, 'positive': 1},
    'gender': {'female': 0, 'male': 1},
    'age_60_and_above': {'No': 0, 'Yes': 1},
}
for col_name, codes in binary_codes.items():
    data[col_name] = data[col_name].map(codes)

In [None]:
# List the distinct categories of the remaining text column before one-hot encoding it.
data['test_indication'].unique()

array(['Other', 'Contact with confirmed', 'Abroad'], dtype=object)

In [None]:
# Preview the frame after label encoding.
data.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,11/12/2020,0,0,0,0,0,0,0.0,1.0,Other
1,11/12/2020,0,1,0,0,0,0,0.0,1.0,Other
2,11/12/2020,0,0,0,0,0,0,1.0,0.0,Other
3,11/12/2020,0,0,0,0,0,0,0.0,1.0,Other
4,11/12/2020,0,1,0,0,0,0,0.0,1.0,Contact with confirmed


In [None]:
# One Hot Encoding

# Rename the categories first so the generated dummy columns get snake_case suffixes.
indication_labels = {
    'Abroad': 'abroad',
    'Contact with confirmed': 'contact_with_covid_positive_patient',
    'Other': 'other',
}
data['test_indication'] = data['test_indication'].map(indication_labels)
data = pd.get_dummies(data, columns=['test_indication'])


In [None]:
# Renaming the encoded feature names & dropping test_indication_other & test_date feature

renamed_dummies = {
    'test_indication_abroad': 'abroad',
    'test_indication_contact_with_covid_positive_patient': 'contact_with_covid_positive_patient',
}
data = (
    data
    .rename(columns=renamed_dummies)
    .drop(columns=['test_date', 'test_indication_other'])
)

In [None]:
# Count missing values per column.
data.isnull().sum()

cough                                      0
fever                                      0
sore_throat                                0
shortness_of_breath                        0
head_ache                                  0
corona_result                              0
age_60_and_above                       50988
gender                                  2601
abroad                                     0
contact_with_covid_positive_patient        0
dtype: int64

In [None]:
# Preview the fully encoded frame (after one-hot encoding and column drops).
data.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,abroad,contact_with_covid_positive_patient
0,0,0,0,0,0,0,0.0,1.0,0,0
1,0,1,0,0,0,0,0.0,1.0,0,0
2,0,0,0,0,0,0,1.0,0.0,0,0
3,0,0,0,0,0,0,0.0,1.0,0,0
4,0,1,0,0,0,0,0.0,1.0,0,1


In [None]:
# Independent copy of the encoded frame; later cells impute its missing values
# and oversample it with SMOTE.
df2 = data.copy()
df2.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,abroad,contact_with_covid_positive_patient
0,0,0,0,0,0,0,0.0,1.0,0,0
1,0,1,0,0,0,0,0.0,1.0,0,0
2,0,0,0,0,0,0,1.0,0.0,0,0
3,0,0,0,0,0,0,0.0,1.0,0,0
4,0,1,0,0,0,0,0.0,1.0,0,1


In [None]:
# Independent copy of the encoded frame; later cells drop its rows with missing
# values and undersample it to build the data the classifiers are trained on.
df1 = data.copy()
df1.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,abroad,contact_with_covid_positive_patient
0,0,0,0,0,0,0,0.0,1.0,0,0
1,0,1,0,0,0,0,0.0,1.0,0,0
2,0,0,0,0,0,0,1.0,0.0,0,0
3,0,0,0,0,0,0,0.0,1.0,0,0
4,0,1,0,0,0,0,0.0,1.0,0,1


In [None]:
# Separate the predictors from the target column.
target_col = 'corona_result'
x = data.drop(columns=[target_col])  # Independent Features
y = data[target_col]  # Target Feature

In [None]:
import numpy as np

# Check for NaN values in y
nan_indices = y.isna()
nan_count = nan_indices.sum()
print("Number of NaN values in y:", nan_count)


Number of NaN values in y: 0


In [None]:
# Show how many samples fall in each target class before any resampling.
print(data['corona_result'].value_counts())


0    925290
1    105434
Name: corona_result, dtype: int64


In [None]:
#Undersampling Covid Negative Cases (Setting n(+ve cases) = 0.6 * n(-ve cases))
# NOTE(review): the resampling is applied before the train/test split in a later
# cell, so the held-out test set is rebalanced as well.

minority_to_majority = 0.6
under = RandomUnderSampler(sampling_strategy=minority_to_majority, random_state=42)
os_x, os_y = under.fit_resample(x, y)

In [None]:
# Class counts after undersampling (positives should be 0.6x the negatives).

os_y.value_counts()

0    175723
1    105434
Name: corona_result, dtype: int64

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Identify columns with null values
null_cols = os_x.columns[os_x.isnull().any()]

# Impute null values with the most frequent value of each column.
# The columns with gaps are 0/1 indicators; a mean would insert fractional
# values that are not valid categories, whereas the mode keeps them binary.
imputer = SimpleImputer(strategy='most_frequent')
os_x[null_cols] = imputer.fit_transform(os_x[null_cols])

In [None]:
# Check for NaN values in os_x (per-column counts).
# DataFrame.isna().sum() is used instead of np.sum(np.isnan(...)): passing a
# DataFrame to np.sum relies on pandas' deprecated axis=None reduction behaviour.
nan_indices = os_x.isna()
nan_count = nan_indices.sum()
print("Number of NaN values in os_x:", nan_count)

Number of NaN values in os_x: cough                                  0
fever                                  0
sore_throat                            0
shortness_of_breath                    0
head_ache                              0
age_60_and_above                       0
gender                                 0
abroad                                 0
contact_with_covid_positive_patient    0
dtype: int64


In [None]:
# Setting Train:Test ratio as 70:30

x_train, x_test, y_train, y_test = train_test_split(
    os_x,
    os_y,
    test_size=0.3,
    random_state=42,
)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
# Class balance of each partition, each preceded by a blank line.
for labels in (y_train, y_test):
    print()
    print(labels.value_counts())

(196809, 9) (84348, 9) (196809,) (84348,)

0    123235
1     73574
Name: corona_result, dtype: int64

0    52488
1    31860
Name: corona_result, dtype: int64


In [None]:

# Preview df2, the copy of the encoded frame taken before any resampling.
df2.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,abroad,contact_with_covid_positive_patient
0,0,0,0,0,0,0,0.0,1.0,0,0
1,0,1,0,0,0,0,0.0,1.0,0,0
2,0,0,0,0,0,0,1.0,0.0,0,0
3,0,0,0,0,0,0,0.0,1.0,0,0
4,0,1,0,0,0,0,0.0,1.0,0,1


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Identify columns with null values
null_cols = df2.columns[df2.isnull().any()]

# Impute null values with the most frequent value of each column.
# The columns with gaps are 0/1 indicators; a mean would insert fractional
# values that are not valid categories, whereas the mode keeps them binary.
imputer = SimpleImputer(strategy='most_frequent')
df2[null_cols] = imputer.fit_transform(df2[null_cols])

In [None]:
# Rebuild x / y from the imputed df2 frame (this rebinds the earlier x and y).
target_col = 'corona_result'
x = df2.drop(columns=[target_col])  # Independent Features
y = df2[target_col]  # Target Feature

In [None]:
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE object.
# A fixed random_state makes the synthetic samples reproducible across runs,
# matching the seeded samplers and splits used elsewhere in this notebook.
smote = SMOTE(random_state=42)

# Resample the data
# NOTE(review): oversampling is applied before the train/test split in a later
# cell, so synthetic points derived from test rows can reach the training set.
X_resampled, y_resampled = smote.fit_resample(x, y)

# Check the class distribution after resampling
print(f"Class distribution after SMOTE: {np.bincount(y_resampled)}")

Class distribution after SMOTE: [925290 925290]


In [None]:
# Collect the predictor column names as a plain Python list
feature_names = X_resampled.columns.tolist()

# Show them
print(feature_names)

['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'age_60_and_above', 'gender', 'abroad', 'contact_with_covid_positive_patient']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the SMOTE-resampled data for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=41,
)

In [None]:

# Preview df1, the copy of the encoded frame that still contains missing values.
df1.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,abroad,contact_with_covid_positive_patient
0,0,0,0,0,0,0,0.0,1.0,0,0
1,0,1,0,0,0,0,0.0,1.0,0,0
2,0,0,0,0,0,0,1.0,0.0,0,0
3,0,0,0,0,0,0,0.0,1.0,0,0
4,0,1,0,0,0,0,0.0,1.0,0,1


In [None]:

# Discard every row that has at least one missing value.
df1 = df1.dropna()


In [None]:
# Confirm that no missing values remain in df1.
df1.isnull().sum()

cough                                  0
fever                                  0
sore_throat                            0
shortness_of_breath                    0
head_ache                              0
corona_result                          0
age_60_and_above                       0
gender                                 0
abroad                                 0
contact_with_covid_positive_patient    0
dtype: int64

In [None]:
# Separate predictors and target for the complete-case (no missing values) frame.
target_col = 'corona_result'
X = df1.drop(columns=[target_col])  # Independent Features
Y = df1[target_col]  # Target Feature

In [None]:
#Undersampling Covid Negative Cases (Setting n(+ve cases) = 0.6 * n(-ve cases))
# NOTE(review): the resampling is applied before the train/test split in a later
# cell, so the held-out test set is rebalanced as well.

minority_to_majority = 0.6
under = RandomUnderSampler(sampling_strategy=minority_to_majority, random_state=42)
os_X, os_Y = under.fit_resample(X, Y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the undersampled complete-case data; the classifiers below
# are fitted on train_x / train_y and scored on test_x / test_y.
train_x, test_x, train_y, test_y = train_test_split(
    os_X,
    os_Y,
    test_size=0.2,
    random_state=41,
)

In [None]:
import time

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, confusion_matrix

# --- Applying LSVM ---
LSVMclassifier = LinearSVC(max_iter=1000, C=5)
LSVMclassifier.fit(train_x, train_y)

# Time the prediction call itself. The timer has to wrap predict(); starting
# and stopping it back to back measures only the clock overhead.
start_time = time.perf_counter()
y_pred_LSVM = LSVMclassifier.predict(test_x)
p_time = time.perf_counter() - start_time

# Calculate evaluation metrics
ac = accuracy_score(test_y, y_pred_LSVM)
error = 1 - ac
precision = precision_score(test_y, y_pred_LSVM)
recall = recall_score(test_y, y_pred_LSVM)
f1 = f1_score(test_y, y_pred_LSVM)
# LinearSVC has no predict_proba, so the signed margin is used as the ranking score.
auc_roc = roc_auc_score(test_y, LSVMclassifier.decision_function(test_x))
gini_coeff = (2*auc_roc) - 1

# Calculate MCC
mcc = matthews_corrcoef(test_y, y_pred_LSVM)

# Print evaluation metrics
print("Accuracy of LSVM:", ac)
print("Error of LSVM:", error)
print("Precision of LSVM:", precision)
print("Recall of LSVM:", recall)
print("F1-score of LSVM:", f1)
print("AUC-ROC of LSVM:", auc_roc)
print("Gini Coefficient of LSVM:", gini_coeff)
print("MCC of LSVM:", mcc)

# Report prediction time (seconds spent inside predict on the test set)
print("Prediction Time of LSVM:", p_time)

Accuracy of LSVM: 0.7989299557787848
Error of LSVM: 0.20107004422121522
Precision of LSVM: 0.8973579809234405
Recall of LSVM: 0.5194976544175137
F1-score of LSVM: 0.6580421528272106
AUC-ROC of LSVM: 0.7698486984410144
Gini Coefficient of LSVM: 0.5396973968820289
MCC of LSVM: 0.5692707245672856
Prediction Time of LSVM: 5.0067901611328125e-05


**LR**

In [None]:
import time

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, confusion_matrix

# --- Applying Logistic Regression ---
logreg = LogisticRegression()
logreg.fit(train_x, train_y)

# Time the prediction call itself. The timer has to wrap predict(); starting
# and stopping it back to back measures only the clock overhead.
start_time = time.perf_counter()
y_pred_logreg = logreg.predict(test_x)
p_time = time.perf_counter() - start_time

# Calculate evaluation metrics
ac = accuracy_score(test_y, y_pred_logreg)
error = 1 - ac
precision = precision_score(test_y, y_pred_logreg)
recall = recall_score(test_y, y_pred_logreg)
f1 = f1_score(test_y, y_pred_logreg)
# The decision-function value is used as the continuous ranking score for AUC.
auc_roc = roc_auc_score(test_y, logreg.decision_function(test_x))
gini_coeff = (2*auc_roc) - 1

# Calculate MCC
mcc = matthews_corrcoef(test_y, y_pred_logreg)

# Print evaluation metrics
print("Accuracy of LR:", ac)
print("Error of LR:", error)
print("Precision of LR:", precision)
print("Recall of LR:", recall)
print("F1-score of LR:", f1)
print("AUC-ROC of LR:", auc_roc)
print("Gini Coefficient of LR:", gini_coeff)
print("MCC of LR:", mcc)

# Report prediction time (seconds spent inside predict on the test set)
print("Prediction Time of logreg:", p_time)

Accuracy of LR: 0.8017324525486342
Error of LR: 0.19826754745136577
Precision of LR: 0.892654903569963
Recall of LR: 0.5315187646598906
F1-score of LR: 0.6662991209531686
AUC-ROC of LR: 0.7702703808879905
Gini Coefficient of LR: 0.540540761775981
MCC of LR: 0.5744186075979908
Prediction Time of logreg: 5.173683166503906e-05


**Naive Bayes**

In [None]:
import time

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

#Instantiate Naive Bayes classifier
nb = GaussianNB()

#Fit the classifier to the training data
nb.fit(train_x, train_y)

#Predict the labels for the test data.
#The timer has to wrap predict(); starting and stopping it back to back
#measures only the clock overhead.
start_time = time.perf_counter()
y_pred_nb = nb.predict(test_x)
p_time = time.perf_counter() - start_time

#Calculate evaluation metrics
ac = accuracy_score(test_y, y_pred_nb)
error = 1 - ac
precision = precision_score(test_y, y_pred_nb)
recall = recall_score(test_y, y_pred_nb)
f1 = f1_score(test_y, y_pred_nb)
#Column 1 of predict_proba is the probability of the positive class (label 1)
auc_roc = roc_auc_score(test_y, nb.predict_proba(test_x)[:, 1])
gini_coeff = (2 * auc_roc) - 1

#Calculate MCC
mcc = matthews_corrcoef(test_y, y_pred_nb)

#Print evaluation metrics
print("Accuracy of Naive Bayes:", ac)
print("Error of Naive Bayes:", error)
print("Precision of Naive Bayes:", precision)
print("Recall of Naive Bayes:", recall)
print("F1-score of Naive Bayes:", f1)
print("AUC-ROC of Naive Bayes:", auc_roc)
print("Gini Coefficient of Naive Bayes:", gini_coeff)
print("MCC of Naive Bayes:", mcc)

#Report prediction time (seconds spent inside predict on the test set)
print("Prediction Time of Naive Bayes:", p_time)

Accuracy of Naive Bayes: 0.8030427107786937
Error of Naive Bayes: 0.1969572892213063
Precision of Naive Bayes: 0.8849317256248502
Recall of Naive Bayes: 0.5415363565285379
F1-score of Naive Bayes: 0.671901051929548
AUC-ROC of Naive Bayes: 0.7656100189580903
Gini Coefficient of Naive Bayes: 0.5312200379161807
MCC of Naive Bayes: 0.5759668305998308
Prediction Time of Naive Bayes: 3.457069396972656e-05


In [None]:
import time

import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

#Convert the data to LightGBM Dataset format
train_data = lgb.Dataset(train_x, label=train_y)

#Set the parameters for LightGBM classifier
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'boosting_type': 'gbdt',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
#Train the LightGBM classifier
lgb_classifier = lgb.train(params, train_data)

#Score the test data. With the 'binary' objective the booster's predict()
#returns continuous scores, which are kept for AUC and thresholded at 0.5 for
#the hard labels. The timer wraps predict() so that it measures the prediction
#itself rather than an empty interval.
start_time = time.perf_counter()
y_prob_lgb = lgb_classifier.predict(test_x)
p_time = time.perf_counter() - start_time
y_pred_lgb = [1 if prob >= 0.5 else 0 for prob in y_prob_lgb]

#Calculate evaluation metrics
ac = accuracy_score(test_y, y_pred_lgb)
error = 1 - ac
precision = precision_score(test_y, y_pred_lgb)
recall = recall_score(test_y, y_pred_lgb)
f1 = f1_score(test_y, y_pred_lgb)
#AUC-ROC is a ranking metric and needs the continuous scores; feeding it the
#thresholded 0/1 labels collapses the ROC curve to a single operating point.
auc_roc = roc_auc_score(test_y, y_prob_lgb)
gini_coeff = (2 * auc_roc) - 1

#Calculate MCC
mcc = matthews_corrcoef(test_y, y_pred_lgb)

#Print evaluation metrics
print("Accuracy of LightGBM:", ac)
print("Error of LightGBM:", error)
print("Precision of LightGBM:", precision)
print("Recall of LightGBM:", recall)
print("F1-score of LightGBM:", f1)
print("AUC-ROC of LightGBM:", auc_roc)
print("Gini Coefficient of LightGBM:", gini_coeff)
print("MCC of LightGBM:", mcc)
#Report prediction time (seconds spent inside predict on the test set)
print("Prediction Time of LightGBM:", p_time)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Accuracy of LightGBM: 0.8032428891193972
Error of LightGBM: 0.19675711088060277
Precision of LightGBM: 0.8862654073955498
Recall of LightGBM: 0.5410965598123534
F1-score of LightGBM: 0.6719461132350264
AUC-ROC of LightGBM: 0.7499463139479895
Gini Coefficient of LightGBM: 0.499892627895979
MCC of LightGBM: 0.5765995531495394
Prediction Time of LightGBM: 3.7670135498046875e-05


**AdaBoost**

In [None]:
import time

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

#Instantiate AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=100)

#Fit the classifier to the training data
adaboost.fit(train_x, train_y)

#Predict the labels for the test data.
#The timer has to wrap predict(); starting and stopping it back to back
#measures only the clock overhead.
start_time = time.perf_counter()
y_pred_adaboost = adaboost.predict(test_x)
p_time = time.perf_counter() - start_time

#Calculate evaluation metrics
ac = accuracy_score(test_y, y_pred_adaboost)
error = 1 - ac
precision = precision_score(test_y, y_pred_adaboost)
recall = recall_score(test_y, y_pred_adaboost)
f1 = f1_score(test_y, y_pred_adaboost)
#Column 1 of predict_proba is the probability of the positive class (label 1)
auc_roc = roc_auc_score(test_y, adaboost.predict_proba(test_x)[:, 1])
gini_coeff = (2 * auc_roc) - 1

#Calculate MCC
mcc = matthews_corrcoef(test_y, y_pred_adaboost)

#Print evaluation metrics
print("Accuracy of AdaBoost:", ac)
print("Error of AdaBoost:", error)
print("Precision of AdaBoost:", precision)
print("Recall of AdaBoost:", recall)
print("F1-score of AdaBoost:", f1)
print("AUC-ROC of AdaBoost:", auc_roc)
print("Gini Coefficient of AdaBoost:", gini_coeff)
print("MCC of AdaBoost:", mcc)

#Report prediction time (seconds spent inside predict on the test set)
print("Prediction Time of AdaBoost:", p_time)

Accuracy of AdaBoost: 0.7969827664646685
Error of AdaBoost: 0.20301723353533152
Precision of AdaBoost: 0.8964900323734878
Recall of AdaBoost: 0.5142200938232995
F1-score of AdaBoost: 0.6535618905658034
AUC-ROC of AdaBoost: 0.7697589871826923
Gini Coefficient of AdaBoost: 0.5395179743653846
MCC of AdaBoost: 0.5649950435436013
Prediction Time of AdaBoost: 4.38690185546875e-05
