## Model Building

In [None]:
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# glob() returns paths in arbitrary order; sort them so the partition order,
# and everything derived from it, is the same on every run.
csv_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/*.csv'))
csv_files[:5]

['/content/drive/MyDrive/CSE6242 Clean Partitions/df_partition_0.csv',
 '/content/drive/MyDrive/CSE6242 Clean Partitions/df_partition_1.csv',
 '/content/drive/MyDrive/CSE6242 Clean Partitions/df_partition_2.csv',
 '/content/drive/MyDrive/CSE6242 Clean Partitions/df_partition_3.csv',
 '/content/drive/MyDrive/CSE6242 Clean Partitions/df_partition_4.csv']

### 1.1 Train-Val-Test Splitting

In [None]:
def get_class_probs(label, files=None):
  """Compute the share of yes (1) and no (0) rows for one target column.

  Each CSV is read, rows where `label` is null are ignored, and the remaining
  rows are counted by the value of `label`. The three totals are printed.

  Args:
    label: name of the target column to count, e.g. 'icu_yn'.
    files: iterable of CSV paths to scan. Defaults to the notebook-level
      `csv_files` list.

  Returns:
    Tuple (p_yes, p_no): the fractions of non-null rows whose `label` equals
    1 and 0 respectively.

  Raises:
    ZeroDivisionError: if no row has a non-null `label`.
  """
  if files is None:
    files = csv_files

  yes_nrows = 0
  no_nrows = 0
  nrows = 0

  for file in files:
    df = pd.read_csv(file)
    target = df[label].dropna()
    nrows += len(target)
    # count on the requested label column, not on a fixed 'hosp_yn' column
    yes_nrows += int((target == 1).sum())
    no_nrows += int((target == 0).sum())

  print(f'Number of rows: {nrows}')
  print(f'Number of rows with a yes: {yes_nrows}')
  print(f'Number of rows with a no: {no_nrows}')
  # return probabilities
  return yes_nrows / nrows, no_nrows / nrows

In [None]:
labels = ['hosp_yn', 'icu_yn', 'death_yn']


def _write_chunk(label, p, parts):
  """Concatenate the buffered per-file splits and write one CSV per split."""
  for split_name, frames in parts.items():
    pd.concat(frames, axis=0).to_csv(f'/content/drive/MyDrive/CSE6242 Clean Partitions/{label}/{split_name}_p{p}.csv', index=False)


# For every target label: undersample each source partition, split it 70/15/15
# into train/val/test (stratified on the label) and write the collected splits
# out in chunks that hold at least 1,000,000 training rows.
for label in labels:
  p = 0  # index of the output chunk currently being filled
  # Per-file splits are buffered in lists and concatenated once per chunk:
  # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
  # every call.
  parts = {'train': [], 'val': [], 'test': []}
  train_rows = 0  # training rows buffered for the current chunk

  for file in csv_files:
    df = pd.read_csv(file)
    # keep only the current target column and the rows where it is known
    df = df.drop([l for l in labels if l!=label], axis=1)
    df = df.loc[df[label].notnull(), :]

    # Undersample the negatives down to the number of positives.
    # NOTE(review): when positives outnumber negatives nothing is resampled,
    # so that partition stays unbalanced; confirm this is intended.
    df_y = df.loc[df[label]==1, :]
    df_n = df.loc[df[label]==0, :]
    if df_n.shape[0]>df_y.shape[0]:
      df_n = df_n.sample(n=len(df_y), random_state=45)
    df = pd.concat([df_y, df_n], axis=0).sample(frac=1, random_state=45)
    if df.empty:
      # nothing usable for this label here; train_test_split raises on an empty frame
      continue

    # rescale: month 1 -> 0 and month 12 -> 1; year 2020 -> 0, +0.5 per later year
    df['month'] = (df['month'] - 1)/11.0
    df['year'] = (df['year'] - 2020)/2.0

    # 70% train, then the remaining 30% is halved into val and test
    train_tmp, other_tmp = train_test_split(df, test_size=0.3, random_state=45, stratify=df[label])
    val_tmp, test_tmp = train_test_split(other_tmp, test_size=0.5, random_state=45, stratify=other_tmp[label])

    parts['train'].append(train_tmp)
    parts['val'].append(val_tmp)
    parts['test'].append(test_tmp)
    train_rows += train_tmp.shape[0]

    if train_rows>=1000000:
      _write_chunk(label, p, parts)
      parts = {'train': [], 'val': [], 'test': []}
      train_rows = 0
      p +=1

  # Flush the remainder. Skipped when the buffer is empty (the last file just
  # triggered a flush) so that no empty CSV is written.
  if parts['train']:
    _write_chunk(label, p, parts)

### 1.2 Hospitalization Model

In [None]:
# glob() gives no ordering guarantee; sort so the row order of the
# concatenated frames is the same on every run.
train_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/hosp_yn/train*.csv'))
val_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/hosp_yn/val*.csv'))
test_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/hosp_yn/test*.csv'))

# Stack the chunks of each split and drop every row that has a missing value.
train_df = pd.concat([pd.read_csv(file) for file in train_files], axis=0).dropna()
val_df = pd.concat([pd.read_csv(file) for file in val_files], axis=0).dropna()
test_df = pd.concat([pd.read_csv(file) for file in test_files], axis=0).dropna()

In [None]:
# Undo the month scaling ((month - 1) / 11) on the loaded validation frame and
# list the distinct month values it contains.
new_month = val_df.month * 11 + 1
new_month.unique()

array([ 5.,  3.,  1.,  9., 12.,  8.,  2.,  7., 11., 10.,  6.,  4.])

In [None]:
# Row and column counts of the hospitalization training frame after dropna().
train_df.shape

(2110192, 75)

In [None]:
# Each split paired with the CSV paths for its target column and its features.
hosp_split_outputs = [
  (train_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_train_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_train_features.csv'),
  (val_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_val_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_val_features.csv'),
  (test_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_test_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_test_features.csv'),
]

# Write the three target files first ...
for split_df, targets_path, features_path in hosp_split_outputs:
  split_df['hosp_yn'].to_csv(targets_path, index=False)

# ... then the three feature files (every column except the target).
for split_df, targets_path, features_path in hosp_split_outputs:
  split_df.drop(columns='hosp_yn').to_csv(features_path, index=False)

In [None]:
# Show the first rows of the hospitalization training frame.
train_df.head()

Unnamed: 0,current_status_Probable_Case,current_status_Laboratory-confirmed_case,race_White,race_Native_Hawaiian/Other_Pacific_Islander,race_Multiple/Other,race_Black,race_Asian,race_American_Indian/Alaska_Native,sex_Other,sex_Male,...,res_state_AZ,res_state_AR,res_state_AL,res_state_AK,county_metro_yn,county_SVI,county_census_2019,year,month,hosp_yn
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,4.0,618795.0,0.0,0.363636,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,4.0,618795.0,0.5,0.181818,1.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,803907.0,0.0,0.181818,1.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,2180085.0,1.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,2.0,1952778.0,0.5,0.0,1.0


In [None]:
# Relative frequency of each hosp_yn class in the test split.
test_df['hosp_yn'].value_counts(normalize=True)

1.0    0.500015
0.0    0.499985
Name: hosp_yn, dtype: float64

In [None]:
# Separate the feature columns from the hosp_yn target in every split.
X_train, y_train = train_df.drop('hosp_yn', axis=1), train_df['hosp_yn']
X_val, y_val = val_df.drop('hosp_yn', axis=1), val_df['hosp_yn']
X_test, y_test = test_df.drop('hosp_yn', axis=1), test_df['hosp_yn']

# random_state fixes the forest's bootstrap and feature sampling so the fitted
# model, its metrics and the saved predictions can be reproduced.
rf = RandomForestClassifier(max_depth=18, n_estimators=100, criterion='gini', n_jobs=-1, random_state=45).fit(X_train, y_train)

In [None]:
# Class predictions of the fitted forest on each split, in train/val/test order.
train_preds, val_preds, test_preds = (rf.predict(X) for X in (X_train, X_val, X_test))

# Accuracy and F1 per split, computed once and then reported.
train_acc, train_f1 = accuracy_score(y_train, train_preds), f1_score(y_train, train_preds)
val_acc, val_f1 = accuracy_score(y_val, val_preds), f1_score(y_val, val_preds)
test_acc, test_f1 = accuracy_score(y_test, test_preds), f1_score(y_test, test_preds)

print(f'Hosp Model Train Accuracy: {train_acc}, F1 score: {train_f1}')
print(f'Hosp Model Validation Accuracy: {val_acc}, F1 score: {val_f1}')
print(f'Hosp Model Test Accuracy: {test_acc}, F1 score: {test_f1}')

Hosp Model Train Accuracy: 0.7788049618233791, F1 score: 0.7762813052989185
Hosp Model Validation Accuracy: 0.774122496638596, F1 score: 0.7713095851151961
Hosp Model Test Accuracy: 0.7750047543861976, F1 score: 0.7724770231892484


In [None]:
# (predictions, column name, output path) for each split.
hosp_pred_outputs = [
  (train_preds, 'hosp_train_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_train_preds.csv'),
  (val_preds, 'hosp_val_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_val_preds.csv'),
  (test_preds, 'hosp_test_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/hosp_test_preds.csv'),
]

# Write every prediction array as a single named column, without the index.
for preds, column_name, out_path in hosp_pred_outputs:
  pd.Series(preds, name=column_name).to_csv(out_path, index=False)

In [None]:
# Serialize the fitted hospitalization forest to Drive.
dump(rf, '/content/drive/MyDrive/CSE6242 Clean Partitions/hosp_yn/RF_hosp.joblib')

['/content/drive/MyDrive/CSE6242 Clean Partitions/hosp_yn/RF_hosp.joblib']

### 1.3 ICU Model

In [None]:
# glob() gives no ordering guarantee; sort so the row order of the
# concatenated frames is the same on every run.
train_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/icu_yn/train*.csv'))
val_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/icu_yn/val*.csv'))
test_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/icu_yn/test*.csv'))

# Stack the chunks of each split and drop every row that has a missing value.
train_df = pd.concat([pd.read_csv(file) for file in train_files], axis=0).dropna()
val_df = pd.concat([pd.read_csv(file) for file in val_files], axis=0).dropna()
test_df = pd.concat([pd.read_csv(file) for file in test_files], axis=0).dropna()

In [None]:
# Each split paired with the CSV paths for its target column and its features.
icu_split_outputs = [
  (train_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_train_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_train_features.csv'),
  (val_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_val_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_val_features.csv'),
  (test_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_test_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_test_features.csv'),
]

# Write the three target files first ...
for split_df, targets_path, features_path in icu_split_outputs:
  split_df['icu_yn'].to_csv(targets_path, index=False)

# ... then the three feature files (every column except the target).
for split_df, targets_path, features_path in icu_split_outputs:
  split_df.drop(columns='icu_yn').to_csv(features_path, index=False)

In [None]:
# Separate the feature columns from the icu_yn target in every split.
X_train, y_train = train_df.drop('icu_yn', axis=1), train_df['icu_yn']
X_val, y_val = val_df.drop('icu_yn', axis=1), val_df['icu_yn']
X_test, y_test = test_df.drop('icu_yn', axis=1), test_df['icu_yn']

# random_state fixes the forest's bootstrap and feature sampling so the fitted
# model, its metrics and the saved predictions can be reproduced.
rf = RandomForestClassifier(max_depth=18, n_estimators=100, criterion='gini', n_jobs=-1, random_state=45)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=18, n_jobs=-1)

In [None]:
# Class predictions of the fitted forest on each split, in train/val/test order.
train_preds, val_preds, test_preds = (rf.predict(X) for X in (X_train, X_val, X_test))

# Accuracy and F1 per split, computed once and then reported.
train_acc, train_f1 = accuracy_score(y_train, train_preds), f1_score(y_train, train_preds)
val_acc, val_f1 = accuracy_score(y_val, val_preds), f1_score(y_val, val_preds)
test_acc, test_f1 = accuracy_score(y_test, test_preds), f1_score(y_test, test_preds)

print(f'ICU Model Train Accuracy: {train_acc}, F1 score: {train_f1}')
print(f'ICU Model Validation Accuracy: {val_acc}, F1 score: {val_f1}')
print(f'ICU Model Test Accuracy: {test_acc}, F1 score: {test_f1}')

ICU Model Train Accuracy: 0.8493367280242525, F1 score: 0.8585389232756453
ICU Model Validation Accuracy: 0.8278286129266521, F1 score: 0.8384177094408549
ICU Model Test Accuracy: 0.827848395147716, F1 score: 0.8391278407550036


In [None]:
# (predictions, column name, output path) for each split.
icu_pred_outputs = [
  (train_preds, 'icu_train_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_train_preds.csv'),
  (val_preds, 'icu_val_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_val_preds.csv'),
  (test_preds, 'icu_test_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/icu_test_preds.csv'),
]

# Write every prediction array as a single named column, without the index.
for preds, column_name, out_path in icu_pred_outputs:
  pd.Series(preds, name=column_name).to_csv(out_path, index=False)

In [None]:
# Serialize the fitted ICU forest to Drive.
dump(rf, '/content/drive/MyDrive/CSE6242 Clean Partitions/icu_yn/RF_icu.joblib')

['/content/drive/MyDrive/CSE6242 Clean Partitions/icu_yn/RF_icu.joblib']

### 1.4 Death Model

In [None]:
# glob() gives no ordering guarantee; sort so the row order of the
# concatenated frames is the same on every run.
train_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/death_yn/train*.csv'))
val_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/death_yn/val*.csv'))
test_files = sorted(glob('/content/drive/MyDrive/CSE6242 Clean Partitions/death_yn/test*.csv'))

# Stack the chunks of each split and drop every row that has a missing value.
train_df = pd.concat([pd.read_csv(file) for file in train_files], axis=0).dropna()
val_df = pd.concat([pd.read_csv(file) for file in val_files], axis=0).dropna()
test_df = pd.concat([pd.read_csv(file) for file in test_files], axis=0).dropna()

In [None]:
# Each split paired with the CSV paths for its target column and its features.
death_split_outputs = [
  (train_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_train_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_train_features.csv'),
  (val_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_val_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_val_features.csv'),
  (test_df,
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_test_targets.csv',
   '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_test_features.csv'),
]

# Write the three target files first ...
for split_df, targets_path, features_path in death_split_outputs:
  split_df['death_yn'].to_csv(targets_path, index=False)

# ... then the three feature files (every column except the target).
for split_df, targets_path, features_path in death_split_outputs:
  split_df.drop(columns='death_yn').to_csv(features_path, index=False)

In [None]:
# Separate the feature columns from the death_yn target in every split.
X_train, y_train = train_df.drop('death_yn', axis=1), train_df['death_yn']
X_val, y_val = val_df.drop('death_yn', axis=1), val_df['death_yn']
X_test, y_test = test_df.drop('death_yn', axis=1), test_df['death_yn']

# random_state fixes the forest's bootstrap and feature sampling so the fitted
# model, its metrics and the saved predictions can be reproduced.
rf = RandomForestClassifier(max_depth=18, n_estimators=100, criterion='gini', n_jobs=-1, random_state=45)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=18, n_jobs=-1)

In [None]:
# Class predictions of the fitted forest on each split, in train/val/test order.
train_preds, val_preds, test_preds = (rf.predict(X) for X in (X_train, X_val, X_test))

# Accuracy and F1 per split, computed once and then reported.
train_acc, train_f1 = accuracy_score(y_train, train_preds), f1_score(y_train, train_preds)
val_acc, val_f1 = accuracy_score(y_val, val_preds), f1_score(y_val, val_preds)
test_acc, test_f1 = accuracy_score(y_test, test_preds), f1_score(y_test, test_preds)

print(f'Death Model Train Accuracy: {train_acc}, F1 score: {train_f1}')
print(f'Death Model Validation Accuracy: {val_acc}, F1 score: {val_f1}')
print(f'Death Model Test Accuracy: {test_acc}, F1 score: {test_f1}')

Death Model Train Accuracy: 0.9381131701516202, F1 score: 0.9413352774561399
Death Model Validation Accuracy: 0.9370369253350223, F1 score: 0.9403574931672524
Death Model Test Accuracy: 0.9370458588355896, F1 score: 0.9402723215988406


In [None]:
# (predictions, column name, output path) for each split.
death_pred_outputs = [
  (train_preds, 'death_train_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_train_preds.csv'),
  (val_preds, 'death_val_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_val_preds.csv'),
  (test_preds, 'death_test_preds', '/content/drive/MyDrive/CSE6242 Clean Partitions/Final_dfs/death_test_preds.csv'),
]

# Write every prediction array as a single named column, without the index.
for preds, column_name, out_path in death_pred_outputs:
  pd.Series(preds, name=column_name).to_csv(out_path, index=False)

In [None]:
# Serialize the fitted death forest to Drive.
dump(rf, '/content/drive/MyDrive/CSE6242 Clean Partitions/death_yn/RF_death.joblib')

['/content/drive/MyDrive/CSE6242 Clean Partitions/death_yn/RF_death.joblib']