In [1]:
#Import dependencies and other necessary items
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler
from config import db_password

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [4]:
#Loading in data from the PostgreSQL database
from sqlalchemy import create_engine
import psycopg2

In [5]:
#This is the actual code needed to read in the data from the database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/COVID_MSU"
engine = create_engine(db_string)

cdc_analysis_df = pd.read_sql_query('''SELECT * FROM cdc_df_import''', con=engine)
cdc_analysis_df.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,date_onset,state,st_code,county,fips,age_range,sex,race,ethnicity,case_positive_specimen,current_status,symptom_status,hosp_yn,icu_yn,death_yn,date_onset_month,date_onset_year
0,0,128,128,2021-08-01,PA,42,WASHINGTON,42125.0,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
1,1,129,129,2021-08-01,PA,42,WASHINGTON,42125.0,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
2,2,287,287,2020-04-01,TN,47,SHELBY,47157.0,18 to 49 years,Female,Black,Non-Hispanic/Latino,1,Laboratory-confirmed case,Symptomatic,False,False,False,4,2020
3,3,378,378,2020-10-01,TN,47,DAVIDSON,47037.0,50 to 64 years,Male,Black,Non-Hispanic/Latino,0,Laboratory-confirmed case,Symptomatic,False,False,False,10,2020
4,4,470,470,2022-02-01,KS,20,RILEY,20161.0,0 - 17 years,Female,White,Non-Hispanic/Latino,0,Probable Case,Symptomatic,False,False,False,2,2022


# Clean the data and prepare it for the machine learning model- Preprocessing

In [6]:
cdc_analysis_df = pd.get_dummies(data=cdc_analysis_df, columns = ['age_range', 'race', 'ethnicity', \
                                                                          'current_status', 'symptom_status'])
cdc_analysis_df.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,date_onset,state,st_code,county,fips,sex,case_positive_specimen,...,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Hispanic/Latino,ethnicity_Non-Hispanic/Latino,current_status_Laboratory-confirmed case,current_status_Probable Case,symptom_status_Asymptomatic,symptom_status_Symptomatic
0,0,128,128,2021-08-01,PA,42,WASHINGTON,42125.0,Female,1,...,0,0,0,1,0,1,0,1,0,1
1,1,129,129,2021-08-01,PA,42,WASHINGTON,42125.0,Female,1,...,0,0,0,1,0,1,0,1,0,1
2,2,287,287,2020-04-01,TN,47,SHELBY,47157.0,Female,1,...,1,0,0,0,0,1,1,0,0,1
3,3,378,378,2020-10-01,TN,47,DAVIDSON,47037.0,Male,0,...,1,0,0,0,0,1,1,0,0,1
4,4,470,470,2022-02-01,KS,20,RILEY,20161.0,Female,0,...,0,0,0,1,0,1,0,1,0,1


In [7]:
#Drop st_code, unnamed columns, FIPS
cdc_analysis_df = cdc_analysis_df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'st_code', 'fips'], axis=1)
cdc_analysis_df.head()

Unnamed: 0,index,date_onset,state,county,sex,case_positive_specimen,hosp_yn,icu_yn,death_yn,date_onset_month,...,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Hispanic/Latino,ethnicity_Non-Hispanic/Latino,current_status_Laboratory-confirmed case,current_status_Probable Case,symptom_status_Asymptomatic,symptom_status_Symptomatic
0,0,2021-08-01,PA,WASHINGTON,Female,1,False,False,False,8,...,0,0,0,1,0,1,0,1,0,1
1,1,2021-08-01,PA,WASHINGTON,Female,1,False,False,False,8,...,0,0,0,1,0,1,0,1,0,1
2,2,2020-04-01,TN,SHELBY,Female,1,False,False,False,4,...,1,0,0,0,0,1,1,0,0,1
3,3,2020-10-01,TN,DAVIDSON,Male,0,False,False,False,10,...,1,0,0,0,0,1,1,0,0,1
4,4,2022-02-01,KS,RILEY,Female,0,False,False,False,2,...,0,0,0,1,0,1,0,1,0,1


In [8]:
cdc_analysis_df.columns.tolist()

['index',
 'date_onset',
 'state',
 'county',
 'sex',
 'case_positive_specimen',
 'hosp_yn',
 'icu_yn',
 'death_yn',
 'date_onset_month',
 'date_onset_year',
 'age_range_0 - 17 years',
 'age_range_18 to 49 years',
 'age_range_50 to 64 years',
 'age_range_65+ years',
 'race_American Indian/Alaska Native',
 'race_Asian',
 'race_Black',
 'race_Multiple/Other',
 'race_Native Hawaiian/Other Pacific Islander',
 'race_White',
 'ethnicity_Hispanic/Latino',
 'ethnicity_Non-Hispanic/Latino',
 'current_status_Laboratory-confirmed case',
 'current_status_Probable Case',
 'symptom_status_Asymptomatic',
 'symptom_status_Symptomatic']

In [9]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [10]:
for column in ['date_onset', 'state', 'county', 'sex', 'hosp_yn', 'icu_yn', 'death_yn', 'date_onset_year', 'date_onset_month']: 
    le = preprocessing.LabelEncoder()
    test = le.fit(cdc_analysis_df[column])
    cdc_analysis_df[column] = le.transform(cdc_analysis_df[column])

In [11]:
cdc_analysis_df.head()

Unnamed: 0,index,date_onset,state,county,sex,case_positive_specimen,hosp_yn,icu_yn,death_yn,date_onset_month,...,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Hispanic/Latino,ethnicity_Non-Hispanic/Latino,current_status_Laboratory-confirmed case,current_status_Probable Case,symptom_status_Asymptomatic,symptom_status_Symptomatic
0,0,17,14,178,0,1,0,0,0,7,...,0,0,0,1,0,1,0,1,0,1
1,1,17,14,178,0,1,0,0,0,7,...,0,0,0,1,0,1,0,1,0,1
2,2,1,15,159,0,1,0,0,0,3,...,1,0,0,0,0,1,1,0,0,1
3,3,7,15,47,1,0,0,0,0,9,...,1,0,0,0,0,1,1,0,0,1
4,4,23,5,144,0,0,0,0,0,1,...,0,0,0,1,0,1,0,1,0,1


In [12]:
#cdc_analysis_df.dtypes

# Split the data into training and testing

In [13]:
#Create features
X = cdc_analysis_df.drop(['death_yn'], axis=1)

#Create targets
y = cdc_analysis_df['death_yn']

In [14]:
#Check balance of the target values
y.value_counts()

0    2789
1      66
Name: death_yn, dtype: int64

In [15]:
#Import train test split and split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

In [16]:
#Will need to do sampling to shift the unbalanced data
#Add in the Smoeteen sampling algorithm

from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)

In [17]:
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1364, 1: 1767})

# Use Logistic Regression Model to train and predict

In [18]:
# # Train the Logistic Regression model using the training data 
#Using the data that is now resampled

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="lbfgs", random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
#Predict using the trained data

y_pred = model.predict(X_test)

In [20]:
#Compare the prediction data vs the actual data from y_test

compare_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
print(compare_df)

      Prediction  Actual
740            0       0
2286           1       0
1805           0       0
1863           0       0
1878           0       0
...          ...     ...
1765           0       0
1011           1       1
966            0       0
251            0       0
2059           1       0

[714 rows x 2 columns]


# Showcase results of the machine learning model

In [21]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

acc_score = balanced_accuracy_score(y_test, y_pred)

print(f"The balanced accuracy score for the Logistic Regression Model is : {acc_score}")

The balanced accuracy score for the Logistic Regression Model is : 0.9461979913916787


In [22]:
# Display the confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual Alive", "Actual Dead"], columns = ["Pred Alive", "Pred Dead"])
print(cm)

              Pred Alive  Pred Dead
Actual Alive         622         75
Actual Dead            0         17


In [23]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.89      1.00      0.94      0.94      0.88       697
          1       0.18      1.00      0.89      0.31      0.94      0.90        17

avg / total       0.98      0.89      1.00      0.93      0.94      0.88       714

