In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# DATASET: https://github.com/nshomron/covidpred/tree/master/data
dataset = pd.read_csv("corona_tested_individuals_ver_006.english.csv")

## Data Type Conversion
# The symptom flags come in as text whenever a cell holds a non-numeric marker
# such as "None"; errors='coerce' turns every such cell into NaN.
symptom_columns = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache']
for symptom in symptom_columns:
    dataset[symptom] = pd.to_numeric(dataset[symptom], errors='coerce')

# An unknown age bracket is treated as "No". The unknown marker is the text
# "None", which pandas >= 2.0 already parses as NaN while reading the CSV, so
# both spellings are mapped here. Without the fillna the NaN would be
# zero-filled below and later show up as a third age category.
dataset['age_60_and_above'] = dataset['age_60_and_above'].replace({'None': 'No'}).fillna('No')

# Handling Missing Values
# Every NaN still left in the frame (above all the coerced symptom flags)
# becomes 0. Note that this applies to all columns, not only the symptom ones.
dataset = dataset.fillna(0)

## Dimension Reduction
# Dropping columns does not disturb the row index, so the index is rebuilt
# only once, after the rows with an "other" test result have been removed.
new_dataframe = dataset.drop(columns=["test_date", "gender"])
new_dataframe = new_dataframe[new_dataframe["corona_result"] != "other"].reset_index(drop=True)

def freq_encoder(df, col):
  """Attach a frequency-encoded version of one column to ``df``.

  Every value of ``df[col]`` is mapped to the fraction of the rows of ``df``
  that carry that value; the result is stored in a new column named
  ``col + "_encoded"``.

  ``df`` itself is modified, and the returned object is that same frame
  rather than a copy.
  """
  row_total = len(df)
  # Rows per distinct value, scaled to a share of the whole frame.
  share_by_value = df.groupby(col).size() / row_total
  encoded_name = col + "_encoded"
  df[encoded_name] = df[col].map(share_by_value)
  return df

# Frequency-encode both categorical columns, then discard the raw text
# columns so that only their "_encoded" counterparts remain.
# freq_encoder writes the new column into the frame it is given and hands
# that same frame back, so new_dataframe receives both encoded columns.
categorical_columns = ['age_60_and_above', 'test_indication']
freq_enc_df = new_dataframe
for categorical_column in categorical_columns:
    freq_enc_df = freq_encoder(freq_enc_df, categorical_column)
freq_enc_df = freq_enc_df.drop(columns=categorical_columns)

In [2]:
# splitting the features and the target
X_features = freq_enc_df.drop(columns=['corona_result'])
y_target = freq_enc_df['corona_result']

# Binary target: 0 for a negative test, 1 for every other result.
encoded_y_target = np.where(y_target == 'negative', 0, 1)

# selecting the best algorithm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# Candidate estimators as (label, estimator) pairs. The forest is seeded so
# that re-running the cell reproduces the same scores; 42 is the same seed
# that train_test_split receives further down in the notebook.
# NOTE(review): LinearRegression is a regressor scored here on a 0/1 target;
# a classifier baseline may have been intended - confirm.
models = [
    ('Linear_Regression', LinearRegression()),
    ('Random_Forest_Classifier', RandomForestClassifier(random_state=42)),
]

# The splitter does not depend on the candidate, so it is built once.
kfold = KFold(n_splits=10)

results = []
names = []
for name, model in models:
    # The scorer returns negated MSE; flip the sign back so the printed
    # numbers are the plain per-fold mean squared errors.
    cv_results = -cross_val_score(model, X_features, encoded_y_target, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    # Printed as "<label>: <mean over folds> (<standard deviation over folds>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Linear_Regression: 0.037002 (0.012972)
Random_Forest_Classifier: 0.034089 (0.016126)


In [5]:
# Modeling
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X_features, encoded_y_target, test_size=0.2, random_state=42)
# Seed the forest as well: without random_state its bootstrap samples and
# per-split feature choices change on every run, so neither the reported
# metrics nor the model saved later in the notebook would be reproducible.
covid_detector = RandomForestClassifier(random_state=42)
covid_detector.fit(X_train, y_train)
predictions = covid_detector.predict(X_test)

# Model Evaluation using MSE and accuracy_score
def mse(y_true, y_predicted):
    """Return the mean squared error between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Reference values (list, NumPy array, pandas Series, ...).
    y_predicted : array-like
        Predicted values, compared element by element with ``y_true``.

    Returns
    -------
    numpy.float64
        The mean of the squared element-wise differences.
    """
    # Work on float arrays: plain lists and boolean arrays do not support
    # subtraction, and unsigned integer arrays would wrap around instead of
    # producing negative differences.
    y_true = np.asarray(y_true, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    return np.mean((y_true - y_predicted) ** 2)
    
# Score the held-out predictions: the mean squared error, and the model's
# own score() on the test split expressed as a percentage with two decimals.
mse_value = mse(y_test, predictions)
accuracy_fraction = covid_detector.score(X_test, y_test)
acc_score = round(accuracy_fraction * 100, 2)

# Assemble the four report lines first, then emit them in one go.
report_lines = [
    "Model Evaluation",
    "_" * 35,
    "Mean Squared Error: " + str(mse_value),
    "Accuracy Score:     " + str(acc_score) + "%",
]
print("\n".join(report_lines))

Model Evaluation
___________________________________
Mean Squared Error: 0.03267748036077975
Accuracy Score:     96.73%


In [6]:
# Saving the model
# Persist the fitted classifier next to the notebook. The call stays the last
# expression of the cell, so the list of written file names is displayed.
import joblib
from joblib import dump
joblib.dump(value=covid_detector, filename='covid_detector.joblib')

['covid_detector.joblib']