### 1. Project Setup

In [None]:
%matplotlib inline

import datetime

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# NOTE: plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# drop it from this import once nothing else relies on it.
from sklearn.metrics import (
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    plot_roc_curve,
)
from sklearn.model_selection import train_test_split

from take_home.data.process_data import BaseETL, EncountersETL
from take_home.models.classification_pipelines import build_logistic_regression_pipeline, build_random_forest_pipeline
from take_home.visualizations.stat_plots import cross_tab_prop_plot
from config import FilePaths, PatientsConfig, EncountersConfig

### 2. Prepare the data for modeling and conduct EDA

#### 2.1 Prepare the patient data for modeling
data: `patients.csv`
We ingest only the columns necessary for modeling. See `config.py` for a list of the columns used.

In [None]:
# Build the patients ETL object from the configured file path, column list and
# date-column list (all defined in config.py).
patients_data = BaseETL(FilePaths.PATIENTS_DATA, PatientsConfig.COLUMNS, PatientsConfig.DATE_COLUMNS)

In [None]:
# Print the ETL object's text representation.
# NOTE(review): the original comment said this shows the dataframe shape; that depends on
# how BaseETL defines __repr__/__str__, which is not visible here. Use
# `patients_data.df.shape` if the shape itself is what is wanted — confirm.
print(patients_data)

In [None]:
# Check that the patients data was read in correctly by inspecting the .info() summary
patients_data.df.info()

#### 2.2 Prepare the encounters data for modeling and conduct EDA
data: `encounters.csv`
We ingest only the columns necessary for modeling. See `config.py` for a list of the columns used.


In [None]:
# Build the encounters ETL object from the configured file path, column list and
# date-column list (all defined in config.py).
encounters_data = EncountersETL(FilePaths.ENCOUNTERS_DATA, EncountersConfig.COLUMNS, EncountersConfig.DATE_COLUMNS)

In [None]:
# Create a new dataframe that contains patients who were admitted for COVID.
# The result is read back from `encounters_data.covid_df` in the following cell.
encounters_data.subset_dataframe()

In [None]:
# Preview the first rows of the COVID-admission subset
encounters_data.covid_df.head()

In [None]:
# Get the last admitted record for each patient.
# The result is read back from `encounters_data.last_admitted_df` in the following cell.
encounters_data.get_last_admitted_records()

In [None]:
# Preview the first rows of the last-admission records
encounters_data.last_admitted_df.head()

In [None]:
# Combine the encounters data with the patients data loaded earlier
encounters_data.merge_encounters_and_patients_data(df=patients_data.df)

In [None]:
# Add the target labels for COVID-related deaths (1 = dead, 0 = not dead)
encounters_data.label_covid_deaths()

In [None]:
# Preview the first rows of the research dataset
encounters_data.research_df.head()

In [None]:
# Save the research dataset for modeling (written without the index column).
# NOTE(review): this runs before `calculate_age_at_last_admittance()` is called further
# down. If that call is what adds the `age_admitted` feature, the saved CSV will not
# contain it — confirm, and move the save after that step if so.
path = f"{FilePaths.PROCESSED_DATA}/research.csv"
encounters_data.research_df.to_csv(path, index=False)

In [None]:
# Summarize the research dataset with .info()
encounters_data.research_df.info()

The research dataset consists of 13 columns and 1867 rows.
NOTE: Only the following columns will be used for modeling:
- target: `covid_death`
- features: `GENDER`, `RACE`, `age_admitted`

In [None]:
# Check for missing values: count of nulls per column
encounters_data.research_df.isnull().sum()

The research dataset is only missing values for DEATHDATE.

In [None]:
# Frequency of each REASONCODE value
encounters_data.research_df['REASONCODE'].value_counts()

In [None]:
# Frequency of each CODE value
encounters_data.research_df['CODE'].value_counts()

There are 3 unique codes for COVID. A 4th code exists, but its description is a death certificate; those records were excluded from this analysis.

In [None]:
# Unique values of the DESCRIPTION column
encounters_data.research_df['DESCRIPTION'].unique()

In [None]:
# Counts of each value of the target variable `covid_death`
encounters_data.research_df['covid_death'].value_counts()

In [None]:
# Plot the count of each target class; the plot shows that the dataset is imbalanced.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.countplot(x='covid_death', data=encounters_data.research_df);

In [None]:
# Get the age of the patient based on the last time they were admitted to the hospital due to COVID.
# The following cells read the result from the `age_admitted` column of `research_df`.
encounters_data.calculate_age_at_last_admittance()

In [None]:
# Histogram of patient age at last COVID admission
encounters_data.research_df['age_admitted'].hist();

In [None]:
# Summary statistics of patient age at last COVID admission
encounters_data.research_df['age_admitted'].describe()

We see that the average age for a patient being admitted to the hospital for COVID is around 53 years of age.

In [None]:
# Distribution of patient age for each outcome (covid_death), split by gender
sns.boxplot(x='covid_death', y='age_admitted', hue='GENDER', data=encounters_data.research_df);

It appears that, for both males and females, older patients are at a higher risk of dying
due to COVID complications within 30 days after being admitted to the hospital.

In [None]:
# Distribution of patient age for each outcome (covid_death), split by race
sns.boxplot(x='covid_death', y='age_admitted', hue='RACE', data=encounters_data.research_df);

In [None]:
# Number of patients in each RACE category
encounters_data.research_df['RACE'].value_counts()

We see that the data is skewed towards white patients, so it will be hard to interpret RACE as a factor in the likelihood of dying due to COVID.

In [None]:
# Get the correlation matrix between covid_death and age_admitted
encounters_data.research_df[['covid_death', 'age_admitted']].corr()

There appears to be a weak correlation between a patient's age at admission and dying from COVID within 30 days.

In [None]:
# Proportion of COVID deaths by race, plotted with the project's cross-tab helper
cross_tab_prop_plot(encounters_data.research_df, 'RACE', 'covid_death')

There is slight variation in the proportion of deaths by race. Again, the data is skewed towards white patients.

In [None]:
# Proportion of COVID deaths by gender, plotted with the project's cross-tab helper
cross_tab_prop_plot(encounters_data.research_df, 'GENDER', 'covid_death')

In this dataset, the proportion of COVID deaths is higher for men than for women.

### 3. Build models to predict if a patient will die in the next 30 days after being admitted to the hospital for covid

In [None]:
# Build the train and test datasets
#TODO: move to process_data
X = encounters_data.research_df[['GENDER', 'RACE', 'age_admitted']]
y = encounters_data.research_df['covid_death']

# Get a list of numeric and categorical features to pass to the model pipeline
numeric_features = X.select_dtypes(exclude='object').columns.to_list()
categorical_features = X.select_dtypes(include='object').columns.to_list()

# Hold out 20% of the rows for evaluation. The target is imbalanced, so the split is
# stratified on y to keep the same class proportions in the train and test sets;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#### 3.1 Build a logistic regression model using a grid search and cross validation

In [None]:
# Build the logistic regression search object from the numeric and categorical feature lists
lr_grid = build_logistic_regression_pipeline(numeric_features,categorical_features)

In [None]:
# Fit the logistic regression search on the training split
lr_grid.fit(X_train, y_train)

In [None]:
# Show the best estimator found by the search
lr_grid.best_estimator_

In [None]:
# Evaluate the best LR model on the holdout set: predict labels for the test split
lr_grid_preds = lr_grid.predict(X_test)

In [None]:
# Confusion matrix for logistic regression on the holdout set
# (scikit-learn convention: rows are true labels, columns are predicted labels)
confusion_matrix(y_test, lr_grid_preds)

In [None]:
# Classification report for Logistic Regression on the holdout set.
# print() is needed here because the report is returned as a preformatted string.
print(classification_report(y_test, lr_grid_preds))

In [None]:
# Plot the ROC curve of the fitted logistic regression search on the holdout set.
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was deprecated in
# scikit-learn 1.0 and removed in 1.2. The trailing semicolon hides the display repr.
RocCurveDisplay.from_estimator(lr_grid, X_test, y_test);

The logistic regression model has an AUC of 0.87 which indicates that the model does a decent job separating positive cases from the negative ones. However, further inspection of the confusion matrix shows that the model labeled a fair amount of patients as not dead when they did die within 30 days of being admitted. The false positives appear to be low. We could test adjusting the threshold to allow for more false positives.

In [None]:
# Get the feature names produced by the preprocessor step; despite its name, `lr_coefs`
# holds these names, which are used to label the coefficient columns in the next cell
lr_coefs = list(lr_grid.best_estimator_.named_steps['preprocessor'].get_feature_names_out())

In [None]:
# Tabulate the fitted logistic regression coefficients, one column per transformed feature
pd.DataFrame(lr_grid.best_estimator_.named_steps['lr'].coef_, columns=lr_coefs)

In [None]:
# Save the fitted logistic regression search object under a timestamped file name.
# The timestamp is formatted explicitly: the default str() of a datetime contains
# spaces and colons, which are not valid in file names on Windows and are awkward
# to handle in shells.
time_now = datetime.datetime.now()
timestamp = time_now.strftime("%Y%m%d-%H%M%S")
lr_file_name = f"{FilePaths.MODEL_DIR}/lr-model-{timestamp}.pkl"
joblib.dump(lr_grid, lr_file_name)

#### 3.2 Build a Random Forest model using a grid search and cross validation

In [None]:
# Build the random forest search object from the numeric and categorical feature lists
rf_grid = build_random_forest_pipeline(numeric_features,categorical_features)

In [None]:
# Fit the random forest search on the training split
rf_grid.fit(X_train, y_train)

In [None]:
# Show the best estimator found by the search
rf_grid.best_estimator_

In [None]:
# Evaluate the best RF model on the holdout set: predict labels for the test split
rf_grid_preds = rf_grid.predict(X_test)

In [None]:
# Confusion matrix for the random forest on the holdout set
# (scikit-learn convention: rows are true labels, columns are predicted labels)
confusion_matrix(y_test, rf_grid_preds)

In [None]:
# Classification report for Random Forest on the holdout set.
# print() is needed here because the report is returned as a preformatted string.
print(classification_report(y_test, rf_grid_preds))

In [None]:
# Plot the ROC curve of the fitted random forest search on the holdout set.
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was deprecated in
# scikit-learn 1.0 and removed in 1.2. The trailing semicolon hides the display repr.
RocCurveDisplay.from_estimator(rf_grid, X_test, y_test);

The random forest model has an AUC of 0.86 which indicates that the model does a decent job separating positive cases from the negative ones. Similarly to the logistic regression model the RF model has an equal amount of False Negatives and has a few more false positives.

In [None]:
# Plot the feature importances of the best random forest, labelled with the feature
# names emitted by the preprocessing step.
rf_feature_names = list(rf_grid.best_estimator_.named_steps['preprocessor'].get_feature_names_out())
rf_importances = rf_grid.best_estimator_.named_steps['rf'].feature_importances_
forest_importances = pd.Series(rf_importances, index=rf_feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
# `feature_importances_` is the impurity-based importance (mean decrease in impurity),
# not a permutation importance, so the title and axis label are worded accordingly.
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
plt.show()

From the data analysis and from examining both the random forest and logistic regression models, it appears that a patient's age is an important predictor when determining whether a patient will survive COVID after being admitted to a hospital.

In [None]:
# Save the fitted random forest search object under a timestamped file name.
# The timestamp is formatted explicitly: the default str() of a datetime contains
# spaces and colons, which are not valid in file names on Windows and are awkward
# to handle in shells.
time_now = datetime.datetime.now()
timestamp = time_now.strftime("%Y%m%d-%H%M%S")
rf_file_name = f"{FilePaths.MODEL_DIR}/rf-model-{timestamp}.pkl"
joblib.dump(rf_grid, rf_file_name)