In [1]:
#Import dependencies and other necessary items
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load in the provisional data as a CSV
provisional_data_df = pd.read_csv("Resources/provisionaldb.csv")
provisional_data_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Date_Onset,State,ST_Code,County,FIPS,Age_Range,Sex,Race,Ethnicity,Case_Positive_Specimen,Current_Status,Symptom_Status,Hosp_yn,ICU_yn,Death_yn,Date_Onset_Month,Date_Onset_Year
0,128,128,8/1/2021,PA,42,WASHINGTON,42125,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
1,129,129,8/1/2021,PA,42,WASHINGTON,42125,0 - 17 years,Female,White,Non-Hispanic/Latino,1,Probable Case,Symptomatic,False,False,False,8,2021
2,287,287,4/1/2020,TN,47,SHELBY,47157,18 to 49 years,Female,Black,Non-Hispanic/Latino,1,Laboratory-confirmed case,Symptomatic,False,False,False,4,2020
3,378,378,10/1/2020,TN,47,DAVIDSON,47037,50 to 64 years,Male,Black,Non-Hispanic/Latino,0,Laboratory-confirmed case,Symptomatic,False,False,False,10,2020
4,470,470,2/1/2022,KS,20,RILEY,20161,0 - 17 years,Female,White,Non-Hispanic/Latino,0,Probable Case,Symptomatic,False,False,False,2,2022


# Clean the data and prepare it for the machine learning model

In [21]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [35]:
for name in provisional_data_df.columns: 
    le = preprocessing.LabelEncoder()
    test = le.fit(provisional_data_df[name])
    provisional_data_df[name] = le.transform(provisional_data_df[name])

In [36]:
provisional_data_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Date_Onset,State,ST_Code,County,FIPS,Age_Range,Sex,Race,Ethnicity,Case_Positive_Specimen,Current_Status,Symptom_Status,Hosp_yn,ICU_yn,Death_yn,Date_Onset_Month,Date_Onset_Year
0,0,0,26,14,14,178,168,0,0,5,1,6,1,1,0,0,0,7,1
1,1,1,26,14,14,178,168,0,0,5,1,6,1,1,0,0,0,7,1
2,2,2,13,15,15,159,203,1,0,2,1,6,0,1,0,0,0,3,0
3,3,3,2,15,15,47,179,2,1,2,1,5,0,1,0,0,0,9,0
4,4,4,9,5,5,144,39,0,0,5,1,5,1,1,0,0,0,1,2


In [37]:
provisional_data_df.dtypes

Unnamed: 0                int64
Unnamed: 0.1              int64
Date_Onset                int32
State                     int32
ST_Code                   int64
County                    int32
FIPS                      int64
Age_Range                 int64
Sex                       int32
Race                      int32
Ethnicity                 int32
Case_Positive_Specimen    int64
Current_Status            int32
Symptom_Status            int32
Hosp_yn                   int64
ICU_yn                    int64
Death_yn                  int64
Date_Onset_Month          int64
Date_Onset_Year           int64
dtype: object

In [4]:
# # #Load the data
# url = "https://raw.githubusercontent.com/momentarypause/COVID_Death_Analysis/arutledge/segment1/database/data/test.csv"

# covid_data_df = pd.read_csv(url)
# covid_data_df.head()
# # OR 
# # covid_data_df = pd.read_json()

# Split the data into training and testing

In [6]:
#Ensure all data types are numerical for the machine learning model to process

In [38]:
#Create features
X = provisional_data_df.drop(['Death_yn'], axis=1)

#Create targets
y = provisional_data_df['Death_yn']

In [39]:
#Check balance of the target values
y.value_counts()

0    2808
1      67
Name: Death_yn, dtype: int64

In [40]:
#Will need to do sampling to shift the unbalanced data

In [41]:
#Import train test split and split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

# Use Logistic Regression Model to train and predict

In [42]:
# # Train the Logistic Regression model using the training data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="lbfgs", random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [43]:
#Predict using the trained data

y_pred = model.predict(X_test)

In [44]:
#Compare the prediction data vs the actual data from y_test

compare_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
print(compare_df)

      Prediction  Actual
559            0       0
2669           0       0
1345           0       0
168            0       0
2234           0       0
...          ...     ...
1492           0       0
2819           0       0
413            0       0
702            0       0
242            0       0

[719 rows x 2 columns]


# Showcase results of the machine learning model

In [45]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

acc_score = balanced_accuracy_score(y_test, y_pred)

print(f"The balanced accuracy score for the Logistic Regression Model is : {acc_score}")

The balanced accuracy score for the Logistic Regression Model is : 0.7317328640858052


In [47]:
# Display the confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual Alive", "Actual Dead"], columns = ["Pred Alive", "Pred Dead"])
print(cm)

              Pred Alive  Pred Dead
Actual Alive         697          5
Actual Dead            9          8


# Expected Results

In [18]:
# The machine learning model we chose for this project is a Logistic Regression Model
#because it will categorize the results into two parts: yes or no. This will effectively 
#answer the question if COVID deaths can be predicted based on several factors or underlying conditions.