In [1]:
import numpy as np
import pandas as pd
from   sklearn.compose         import *
from   sklearn.experimental    import enable_iterative_imputer
from   sklearn.impute          import *
from   sklearn.linear_model    import LinearRegression 
from   sklearn.linear_model    import LogisticRegression, RidgeClassifier
from   sklearn.metrics         import mean_absolute_error
from   sklearn.model_selection import train_test_split
from   sklearn.pipeline        import Pipeline
from   sklearn.preprocessing   import *
from   sklearn.metrics         import balanced_accuracy_score
from   sklearn.inspection      import permutation_importance
from   sklearn.decomposition   import PCA
from   sklearn.dummy           import DummyClassifier
from   sklearn.ensemble        import RandomForestClassifier
from   sklearn.model_selection import RandomizedSearchCV
from   sklearn.base            import BaseEstimator, TransformerMixin

In [2]:
df = pd.read_csv('data/sub_COVID-19_Case_Surveillance_Public_Use_Data.csv')

### Assumption of Supervised Machine Learning is that each instance has a label.  We will discard instances with missing/unknown targets before beginning
(Target transformations are ok to do outside of pipeline)

In [3]:
df_clean_target = df[(df['death_yn'] != 'Unknown') & (df['death_yn'] != 'Missing')]

### Separate target from rest of DataFrame

In [4]:
df_X = df_clean_target.drop('death_yn', axis=1)
df_y = pd.DataFrame(df_clean_target['death_yn'])
X = df_X.to_numpy()
y = df_y.to_numpy()

### Split into train, validation, and test

In [5]:
X_train_pre, X_test, y_train_pre, y_test = train_test_split(X, y, train_size=0.8)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_pre, y_train_pre, train_size=0.8)

### Build Pipeline

In [6]:
# get categorical columns
categorical_columns = (df_X.dtypes == object)

In [7]:
# continuous variable preprocessing pipeline
con_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)),
                     ('scaler', StandardScaler())
                    ])

# categorical variable preprocessing pipeline
cat_pipe = Pipeline([('imputer_nan', SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)),
                     ('imputer_missing', SimpleImputer(missing_values='Missing', strategy='most_frequent', add_indicator=True)),
                     ('imputer_unknown', SimpleImputer(missing_values='Unknown', strategy='most_frequent', add_indicator=True)),
                     ('ohe'    , OneHotEncoder(handle_unknown='ignore'))
                    ])

# combine preprocessing together
preprocessing = ColumnTransformer([('categorical', cat_pipe, categorical_columns),
                                   ('continuous' , con_pipe, ~categorical_columns)
                                  ])

### Simple attempt to get baseline

In [8]:
pipe = Pipeline([('prep' , preprocessing),
                 ('lg' , LogisticRegression(solver='liblinear'))
                ])

In [9]:
pipe.fit(X_train, y_train.ravel())
y_pred = pipe.predict(X_validate)
balanced_accuracy_score(y_validate.ravel(), y_pred)

0.6945317725752509

### Hyperparameter Tuning and Cross Validation
get hyperparams for LR, RF, Ridge

In [10]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [12]:
RidgeClassifier().get_params()

{'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [13]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [14]:
pipe = Pipeline([
                 ('prep', preprocessing),
                 ('clf', DummyEstimator())
                ])

search_space = [
                 # LogisticRegression
                 {'clf' : [LogisticRegression(solver='liblinear')],
                  'clf__penalty': ['l1', 'l2'],
                  'clf__C' : np.logspace(0, 4, 10),
                  'clf__class_weight' : [None, 'balanced']
                 },
    
                 # RidgeClassifier
                 {'clf' : [RidgeClassifier(solver='auto')],
                  'clf__tol' : [0.001, 0.01, 0.1],
                  'clf__max_iter' : [None, 1, 10, 100, 1000],
                  'clf__class_weight' : [None, 'balanced'],
                  'clf__normalize' : [False, True]
                 },
                 
                 # RandomForest
                 {'clf' : [RandomForestClassifier(n_jobs=-1)],
                  'clf__criterion' : ['gini', 'entropy'],
                  'clf__class_weight' : [None, 'balanced'],
                  'clf__max_depth' : list(range(2,11)),
                  'clf__max_features' : ['auto', 'log2', 'sqrt'],
                  'clf__n_estimators' : list(range(50, 250, 50)),
                  'clf__bootstrap' : [True, False]
                 }
    
    
               ]

In [15]:
clf_algos_rand = RandomizedSearchCV(estimator=pipe,
                                    param_distributions=search_space,
                                    scoring='balanced_accuracy',
                                    n_iter=100,
                                    cv=5,
                                    n_jobs=-1)

### Fit and get best model
(uncomment to run, takes a few minutes)

In [16]:
# fit it and see results
# best_model = clf_algos_rand.fit(X_train, y_train.ravel())
# best_model.best_estimator_

In [17]:
clf_pipe = Pipeline([(
                       'clf', RandomForestClassifier(class_weight='balanced',
                                                     max_depth=10,
                                                     max_features='sqrt',
                                                     bootstrap=False,
                                                     n_estimators = 150,
                                                     n_jobs=-1,
                                                     criterion='entropy')
)])

In [18]:
# create end-to-end pipeline

pipe = Pipeline([
                 ('prep' , preprocessing),
                 ('rf', clf_pipe )
                ])


### Fit model, get Balanced Accuracy Score

In [19]:
pipe.fit(X_train, y_train.ravel())
y_pred = pipe.predict(X_validate)
bal = (balanced_accuracy_score(y_validate.ravel(), y_pred))
bal

0.9042140468227424

### Fit against test set for final evaluation

In [20]:
pipe = Pipeline([
                 ('prep' , preprocessing),
                 ('rf', clf_pipe )
                ])



pipe.fit(X_train_pre, y_train_pre.ravel())
y_pred = pipe.predict(X_test)
bal = (balanced_accuracy_score(y_test.ravel(), y_pred))
bal

0.9055451505016723