In [3]:
#Import dependencies and other necessary items
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [5]:
# Load in the provisional data as a CSV
provisional_data_df = pd.read_csv("Resources/provisionaldb.csv")
provisional_data_df.head()

#Now we have to load in the data from the actual SQL database and NOT a CSV

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Date_Onset,State,ST_Code,County,FIPS,Age_Range,Sex,Race,Ethnicity,Case_Positive_Specimen,Current_Status,Symptom_Status,Hosp_yn,ICU_yn,Death_yn,Date_Onset_Month,Date_Onset_Year
0,128,128,8/1/2021,PA,42,WASHINGTON,42125,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
1,129,129,8/1/2021,PA,42,WASHINGTON,42125,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
2,287,287,4/1/2020,TN,47,SHELBY,47157,18 to 49 years,Female,Black,Non-Hispanic/Latino,1,Laboratory-confirmed case,Symptomatic,False,False,False,4,2020
3,378,378,10/1/2020,TN,47,DAVIDSON,47037,50 to 64 years,Male,Black,Non-Hispanic/Latino,0,Laboratory-confirmed case,Symptomatic,False,False,False,10,2020
4,470,470,2/1/2022,KS,20,RILEY,20161,0 - 17 years,Female,White,Non-Hispanic/Latino,0,Probable Case,Symptomatic,False,False,False,2,2022


# Clean the data and prepare it for the machine learning model- Preprocessing

In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [7]:
for column in provisional_data_df.columns: 
    le = preprocessing.LabelEncoder()
    test = le.fit(provisional_data_df[column])
    provisional_data_df[column] = le.transform(provisional_data_df[column])

In [8]:
provisional_data_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Date_Onset,State,ST_Code,County,FIPS,Age_Range,Sex,Race,Ethnicity,Case_Positive_Specimen,Current_Status,Symptom_Status,Hosp_yn,ICU_yn,Death_yn,Date_Onset_Month,Date_Onset_Year
0,0,0,26,14,14,178,168,0,0,5,1,6,1,1,0,0,0,7,1
1,1,1,26,14,14,178,168,0,0,5,1,6,1,1,0,0,0,7,1
2,2,2,13,15,15,159,203,1,0,2,1,6,0,1,0,0,0,3,0
3,3,3,2,15,15,47,179,2,1,2,1,5,0,1,0,0,0,9,0
4,4,4,9,5,5,144,39,0,0,5,1,5,1,1,0,0,0,1,2


In [9]:
#Drop st_code, unnamed columns, FIPS
provisional_data_df = provisional_data_df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'ST_Code', 'FIPS'], axis=1)
provisional_data_df.head()

Unnamed: 0,Date_Onset,State,County,Age_Range,Sex,Race,Ethnicity,Case_Positive_Specimen,Current_Status,Symptom_Status,Hosp_yn,ICU_yn,Death_yn,Date_Onset_Month,Date_Onset_Year
0,26,14,178,0,0,5,1,6,1,1,0,0,0,7,1
1,26,14,178,0,0,5,1,6,1,1,0,0,0,7,1
2,13,15,159,1,0,2,1,6,0,1,0,0,0,3,0
3,2,15,47,2,1,2,1,5,0,1,0,0,0,9,0
4,9,5,144,0,0,5,1,5,1,1,0,0,0,1,2


# Split the data into training and testing

In [10]:
#Create features
X = provisional_data_df.drop(['Death_yn'], axis=1)

#Create targets
y = provisional_data_df['Death_yn']

In [11]:
#Check balance of the target values
y.value_counts()

0    2808
1      67
Name: Death_yn, dtype: int64

In [12]:
#Import train test split and split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

In [13]:
#Will need to do sampling to shift the unbalanced data
#Add in the Smoeteen sampling algorithm

from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)

In [14]:
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 2037, 1: 2094})

# Use Logistic Regression Model to train and predict

In [15]:
# # Train the Logistic Regression model using the training data 
#Using the data that is now resampled

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="lbfgs", random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [16]:
#Predict using the trained data

y_pred = model.predict(X_test)

In [17]:
#Compare the prediction data vs the actual data from y_test

compare_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
print(compare_df)

      Prediction  Actual
559            0       0
2669           0       0
1345           0       0
168            0       0
2234           1       0
...          ...     ...
1492           0       0
2819           0       0
413            0       0
702            1       0
242            0       0

[719 rows x 2 columns]


# Showcase results of the machine learning model

In [18]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

acc_score = balanced_accuracy_score(y_test, y_pred)

print(f"The balanced accuracy score for the Logistic Regression Model is : {acc_score}")

The balanced accuracy score for the Logistic Regression Model is : 0.9228674375733199


In [19]:
# Display the confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual Alive", "Actual Dead"], columns = ["Pred Alive", "Pred Dead"])
print(cm)

              Pred Alive  Pred Dead
Actual Alive         635         67
Actual Dead            1         16


In [20]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.90      0.94      0.95      0.92      0.85       702
          1       0.19      0.94      0.90      0.32      0.92      0.85        17

avg / total       0.98      0.91      0.94      0.93      0.92      0.85       719



# Notes to add to ReadME

In [None]:
#Sampling data using Smoteen improved accuracy score from 71% to 89.5%!!

In [None]:
#Dropping some columns that have duplicate data, like the ST_Code compared to State

In [21]:
#After dropping those unneccesary columns, the accuracy score jumped to 92.3%!!