In [1]:
#Import dependencies and other necessary items
import os
import warnings
from collections import Counter
from getpass import getpass
from pathlib import Path
from urllib.parse import quote_plus

# Silence warnings before the third-party imports, as the original cell did.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [4]:
#Loading in data from the PostgreSQL database
from sqlalchemy import create_engine
import psycopg2

In [6]:
# #load df from csv

# loadfile = "Resources/cdc_df1.csv"
# cdc_df1 = pd.read_csv(loadfile)

# # Create and connect to a Postgres database, import data
# db_string = "postgresql://postgres:<password>@127.0.0.1:5432/COVID_MSU"  # password redacted; supply it at run time, never commit it
# engine = create_engine(db_string)
# cdc_df1.to_sql(name='cdc_df_import', con=engine, if_exists='replace')

In [None]:
# # Load in the provisional data as a CSV
# provisional_data_df = pd.read_csv("Resources/provisionaldb.csv")
# provisional_data_df.head()

# #Now we have to load in the data from the actual SQL database and NOT a CSV

In [8]:
#This is the actual code needed to read in the data from the database
# Connection settings are read from the standard libpq environment variables so the
# password is never stored in the notebook. The non-secret parts default to the
# local COVID_MSU database; if PGPASSWORD is not set, the user is prompted for it.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")
db_host = os.environ.get("PGHOST", "127.0.0.1")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "COVID_MSU")

# quote_plus escapes characters such as '@' or '/' that would otherwise break the URL.
db_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(db_string)

# Pull the whole imported CDC table into a DataFrame and preview the first rows.
cdc_analysis_df = pd.read_sql_query('''SELECT * FROM cdc_df_import''', con=engine)
cdc_analysis_df.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,date_onset,state,st_code,county,fips,age_range,sex,race,ethnicity,case_positive_specimen,current_status,symptom_status,hosp_yn,icu_yn,death_yn,date_onset_month,date_onset_year
0,0,128,128,2021-08-01,PA,42,WASHINGTON,42125.0,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
1,1,129,129,2021-08-01,PA,42,WASHINGTON,42125.0,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
2,2,287,287,2020-04-01,TN,47,SHELBY,47157.0,18 to 49 years,Female,Black,Non-Hispanic/Latino,1,Laboratory-confirmed case,Symptomatic,False,False,False,4,2020
3,3,378,378,2020-10-01,TN,47,DAVIDSON,47037.0,50 to 64 years,Male,Black,Non-Hispanic/Latino,0,Laboratory-confirmed case,Symptomatic,False,False,False,10,2020
4,4,470,470,2022-02-01,KS,20,RILEY,20161.0,0 - 17 years,Female,White,Non-Hispanic/Latino,0,Probable Case,Symptomatic,False,False,False,2,2022


# Clean the data and prepare it for the machine learning model- Preprocessing

In [None]:
# One-hot encode the multi-category columns with pandas get_dummies.
# NOTE(review): in the visible notebook `provisional_data_df` is only assigned in a
# commented-out cell (the CSV load); the frame actually read from Postgres is
# `cdc_analysis_df`, whose columns are lowercase (`age_range`, `race`, ...). On a fresh
# kernel this cell would raise NameError — confirm which frame and column names to use.
provisional_data_df = pd.get_dummies(
    provisional_data_df,
    columns=['Age_Range', 'Race', 'Ethnicity', 'Current_Status', 'Symptom_Status'],
)
provisional_data_df.head()

In [None]:
# Remove the two "Unnamed" columns together with ST_Code and FIPS
provisional_data_df = provisional_data_df.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'ST_Code', 'FIPS']
)
provisional_data_df.head()

In [None]:
# List every remaining column name as a plain Python list
provisional_data_df.columns.to_list()

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode each listed column with its own LabelEncoder, overwriting the column
# in place. Death_yn (used as the prediction target further down) is encoded here too.
# After the loop, `le` holds the encoder fitted on the last column in the list.
for column in ['Date_Onset', 'State', 'County', 'Sex', 'Hosp_yn', 'ICU_yn', 'Death_yn', 'Date_Onset_Year', 'Date_Onset_Month']:
    le = preprocessing.LabelEncoder()
    # fit_transform learns the classes and encodes in a single call, replacing the
    # separate fit/transform steps and the unused `test` variable.
    provisional_data_df[column] = le.fit_transform(provisional_data_df[column])

In [None]:
# Preview the first rows of the frame after the encoding steps above
provisional_data_df.head()

In [None]:
# Show the dtype of every column
provisional_data_df.dtypes

# Split the data into training and testing

In [None]:
# Target: the Death_yn column
y = provisional_data_df['Death_yn']

# Features: every column except the target
X = provisional_data_df.drop(columns=['Death_yn'])

In [None]:
# Count the rows in each target class to see how balanced the Death_yn values are
y.value_counts()

In [None]:
# Split features and target into training and testing sets.
# stratify=y keeps the class proportions of the target the same in both splits.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [None]:
# The target classes are unbalanced, so the training data will be resampled.
# SMOTEENN from imbalanced-learn (SMOTE over-sampling combined with ENN cleaning) is used as the sampler.

from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)

In [None]:
# Resample only the training split; X_test / y_test are not passed to the sampler.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# Show the class counts after resampling
Counter(y_resampled)

# Use Logistic Regression Model to train and predict

In [None]:
# Fit a logistic-regression classifier on the resampled training data.
# NOTE(review): the features are passed in unscaled (StandardScaler is imported in the
# first cell but not applied in the visible cells) and warnings are globally silenced
# there as well, so an lbfgs ConvergenceWarning would be hidden — consider checking
# model.n_iter_ after fitting.

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=1, solver="lbfgs")
model.fit(X_resampled, y_resampled)

In [None]:
# Predict the target for the held-out test features using the trained model

y_pred = model.predict(X_test)

In [None]:
# Put the model's predictions side by side with the actual values from y_test

compare_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
# A bare last expression uses the notebook's rich table display rather than the
# plain-text output that print() produces for a DataFrame.
compare_df

# Showcase results of the machine learning model

In [None]:
# Calculate the balanced accuracy score of the test-set predictions

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"The balanced accuracy score for the Logistic Regression Model is : {acc_score}")

In [None]:
# Build the confusion matrix as a labelled DataFrame (rows = actual, columns = predicted).
# The labels assume the matrix is 2x2 with the "alive" class first and "dead" second.

cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual Alive", "Actual Dead"], columns = ["Pred Alive", "Pred Dead"])
# A bare last expression uses the notebook's rich table display rather than the
# plain-text output that print() produces for a DataFrame.
cm

In [None]:
# Print imbalanced-learn's classification report for the test-set predictions
from imblearn.metrics import classification_report_imbalanced

imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

# Notes to add to ReadME

In [None]:
#Resampling the training data with SMOTEENN improved the accuracy score from 71% to 89.5%!!

In [None]:
#Dropping some columns that have duplicate data, like the ST_Code compared to State

In [None]:
#After dropping those unnecessary columns, the accuracy score jumped to 92.3%!!

In [None]:
#After converting the categorical variables to dummy variables with pd.get_dummies, the accuracy score jumped to 95%!!