# Setup

In [1]:
# TODO: add annotations describing usage of different modules

from operator import mod
from os import getcwd
from os.path import exists, join

import joblib
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, KBinsDiscretizer
from sklearn.svm import SVR
import pandas as pd
import numpy as np
# from ydata_profiling import ProfileReport

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression, LinearRegression
import warnings
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import  GradientBoostingClassifier
# import xgboost as xgb
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold

from sklearn.metrics import recall_score

from sklearn import tree
from sklearn.decomposition import PCA, SparsePCA

from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import json
import pickle
from IPython.display import Image
import warnings

from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

import altair as alt
import random
import warnings

warnings.filterwarnings('ignore')

# set seaborn whitegrid theme
sns.set(style="whitegrid")

from sklearn.inspection import permutation_importance
from random import sample
from itertools import combinations

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
def get_model_data(original_df, 
                   columns, 
                   test_size_to_use=0.2,
                   drop_null_rows=False,
                   null_imputer_strategy='median', # mean, median, most_frequent
                   use_value_scaler=True,
                   use_smote=False,
                   return_indices=False):
    """
    Function to build feature & indicator matrices for both train & test.
    """
    
    # add target column (MDD)
    cols_to_use = columns.copy()
    cols_to_use.insert(0, 'MDD')
    #cols_to_use.insert(0, 'SEQN')
    
    df_to_use = original_df[cols_to_use]
    
    if drop_null_rows:
        df_to_use.dropna(inplace=True)
    
    # Create test & train data
    x = df_to_use.iloc[:,1:].values
    y = df_to_use['MDD'].values
    indices = np.arange(y.shape[0])
    
    if not drop_null_rows:
        # SimpleImputer() = fill in missing values
        # note imputer may drop columns if no values exist for it
        imputer = SimpleImputer(strategy=null_imputer_strategy)  
        x = imputer.fit_transform(x)

    # RobustScaler() = scale features to remove outliers
    if use_value_scaler:
        trans = RobustScaler()
        x = trans.fit_transform(x)

    x_train, x_test, y_train, y_test, idx_train, idx_test = train_test_split(
        x, 
        y, 
        indices,
        test_size=test_size_to_use, 
        random_state=42
    ) 
    
    # Technique to de-risk from positive class imbalance
    if use_smote:
        sm = SMOTE(random_state=42)
        x_train, y_train = sm.fit_resample(x_train, y_train)
    
    if return_indices:
        return x_train, x_test, y_train, y_test, idx_train, idx_test
    else:
        return x_train, x_test, y_train, y_test
    
    
def get_performance_df(label_actual, label_pred, model_name):
    """
    Function to calculate performance metrics for model.
    Includes precision, recal, F1, & support.
    """
    # create classification report
    result_table = classification_report(label_actual, label_pred, output_dict=True)
    result_table = pd.DataFrame.from_dict(result_table)

    # store for later
    accuracies = result_table['accuracy'][0]
    
    column_key = {
        '0':'Depressed (No)',
         '1':'Depressed (Yes)',
         'accuracy':'accuracy',
         'macro avg':'Macro Avg',
         'weighted avg':'Weighted Avg'
    }

    # rename grouping
    result_table.columns = [column_key.get(key) for key in result_table.columns]

    # create dataframe with 1 row per grouping
    result_table.drop(labels = 'accuracy', axis = 1, inplace=True)
    result_table = result_table.transpose()
    result_table['accuracy'] = [accuracies for i in range(result_table.shape[0])]
    result_table = result_table.reset_index()
    result_table.rename(columns = {'index':'grouping'},inplace=True)
    result_table['model'] = model_name
    result_table = result_table[['model','grouping','precision','recall','f1-score','support','accuracy']]
    return result_table    

In [3]:
cdc_survey = pd.read_csv('../../../data/cdc_nhanes_survey_responses_clean.csv')
print(cdc_survey.shape)

# filter to pregnant moms
cdc_survey_pmom = cdc_survey[cdc_survey['has_been_pregnant'] == 1].reset_index()
print(cdc_survey_pmom.shape)

(35706, 863)
(7741, 864)


In [4]:
dep_screener_cols = [
    'little_interest_in_doing_things',
    'feeling_down_depressed_hopeless',
    'trouble_falling_or_staying_asleep',
    'feeling_tired_or_having_little_energy',
    'poor_appetitie_or_overeating',
    'feeling_bad_about_yourself',
    'trouble_concentrating',
    'moving_or_speaking_to_slowly_or_fast',
    'thoughts_you_would_be_better_off_dead',
    'difficult_doing_daytoday_tasks'
]

cdc_survey_pmom['num_dep_screener_0'] = (cdc_survey_pmom[dep_screener_cols]==0).sum(axis=1)
cdc_survey_pmom = cdc_survey_pmom[cdc_survey_pmom['num_dep_screener_0'] >= 9]
cdc_survey_pmom.shape

(3347, 865)

# GBC Classifier

## Current Model

In [5]:
gb_model_features = [
    'times_with_12plus_alc',
    'seen_mental_health_professional',
    'count_days_seen_doctor_12mo',
    'count_lost_10plus_pounds',
    'arthritis',
    'horomones_not_bc',
    'is_usa_born',
    'times_with_8plus_alc',
    'time_since_last_healthcare',
    'duration_last_healthcare_visit',
    'work_schedule',
    'age_in_years'
]

In [16]:
gb_x_train, gb_x_test, gb_y_train, gb_y_test = get_model_data(
    original_df = cdc_survey_pmom,
    columns = gb_model_features
)

rus = RandomUnderSampler(
    random_state=42, 
    sampling_strategy=0.12,
    replacement=False
)
gb_x_train_rus, gb_y_train_rus = rus.fit_resample(gb_x_train,gb_y_train)

print(gb_x_train.shape)
print(gb_x_test.shape)
print(gb_y_train.shape)
print(gb_y_test.shape)
print(gb_x_train_rus.shape)
print(gb_y_train_rus.shape)

gb = GradientBoostingClassifier(random_state=42)
gb.fit(gb_x_train_rus, gb_y_train_rus)
gb_pred = gb.predict(gb_x_test)
gb_score = get_performance_df(gb_y_test, gb_pred,'Gradient Boosting Classifier')
gb_score

(2677, 12)
(670, 12)
(2677,)
(670,)
(849, 12)
(849,)


Unnamed: 0,model,grouping,precision,recall,f1-score,support,accuracy
0,Gradient Boosting Classifier,Depressed (No),0.967239,0.96875,0.967994,640.0,0.938806
1,Gradient Boosting Classifier,Depressed (Yes),0.310345,0.3,0.305085,30.0,0.938806
2,Gradient Boosting Classifier,Macro Avg,0.638792,0.634375,0.636539,670.0,0.938806
3,Gradient Boosting Classifier,Weighted Avg,0.937826,0.938806,0.938311,670.0,0.938806


## Test Hyperparameters

In [18]:
# https://www.kaggle.com/code/hatone/gradientboostingclassifier-with-gridsearchcv/script

gb_param_grid = {
    "loss":["log_loss","exponential","deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[2,3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse", "mae","squared_error"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 30, 70, 100, 200]
}
#gb_param_grid

{'loss': ['log_loss', 'exponential', 'deviance'],
 'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
 'min_samples_split': array([0.1       , 0.13636364, 0.17272727, 0.20909091, 0.24545455,
        0.28181818, 0.31818182, 0.35454545, 0.39090909, 0.42727273,
        0.46363636, 0.5       ]),
 'min_samples_leaf': array([0.1       , 0.13636364, 0.17272727, 0.20909091, 0.24545455,
        0.28181818, 0.31818182, 0.35454545, 0.39090909, 0.42727273,
        0.46363636, 0.5       ]),
 'max_depth': [2, 3, 5, 8],
 'max_features': ['log2', 'sqrt'],
 'criterion': ['friedman_mse', 'mae', 'squared_error'],
 'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
 'n_estimators': [10, 30, 70, 100, 200]}

In [None]:
gb_grid_search = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid=gb_param_grid
)
gb_grid_search.fit(gb_x_train_rus, gb_y_train_rus)

In [None]:
print(gb_grid_search.best_estimator_)

In [None]:
print(gb_x_train_rus.shape)

gb_hyper = GradientBoostingClassifier(
    n_estimators=,
    max_features=,
    max_depth=,
    min_samples_split=,
    min_samples_leaf=,
    bootstrap=,
    random_state=42
)
gb_hyper.fit(gb_x_train_rus, gb_y_train_rus)
gb_hyper_pred = gb_hyper.predict(gb_x_test)
gb_hyper_score = get_performance_df(gb_y_test, gb_hyper_pred,'Gradient Boosting Classifier (Hyper)')
gb_hyper_score

# RF Classifier

## Current Model

In [13]:
def create_bin_lookup(
        feature,
        n_bins,
        encode,
        strategy,
        df_to_use):
        # make a new column with _bin suffix
        new_column_name = feature + '_bin'

        # get non-null values per column
        feature_values = df_to_use[feature].dropna()

        # reshape to be 1 column
        feature_values = feature_values.to_numpy().reshape([feature_values.shape[0],1])

        # create bins using estimator
        est = KBinsDiscretizer(
            n_bins=n_bins,
            encode=encode, 
            strategy=strategy, 
            subsample=None
        )
        est.fit(feature_values)
        feature_values_bin = pd.DataFrame(est.transform(feature_values))

        # dataframe with binned values
        feature_values_bin.columns = [new_column_name]

        # get original
        feature_values = pd.DataFrame(feature_values)
        feature_values.columns = ['original']

        # merge bin & non-binned values together to make a lookup
        feature_values = feature_values.merge(feature_values_bin, left_index=True, right_index=True)
        feature_value_bin_lookup = feature_values.groupby(['original',new_column_name]).count().reset_index()

        return feature_value_bin_lookup, new_column_name
    
feature_value_bin_lookup, new_column_name = create_bin_lookup(
    'count_days_seen_doctor_12mo',
    n_bins=10,
    encode='ordinal',
    strategy='uniform',
    df_to_use=cdc_survey_pmom
)  

# prevent creating a column if already exists
# happens if you run this block multiple times
if new_column_name in cdc_survey_pmom.columns:
    cdc_survey_pmom.drop(columns=new_column_name,inplace=True)

# add bin column in a way that doesn't drop nulls
cdc_survey_pmom = cdc_survey_pmom.merge(
    feature_value_bin_lookup, 
    left_on='count_days_seen_doctor_12mo', 
    right_on='original', 
    how = 'left'
)

# drop column called "original" as was only used to join
cdc_survey_pmom.drop(columns=['original'], inplace=True)  


rf_model_features = [
    'count_days_seen_doctor_12mo_bin',
    'times_with_12plus_alc',
    'seen_mental_health_professional',
    'count_lost_10plus_pounds',
    'arthritis',
    'horomones_not_bc',
    'is_usa_born',
    'times_with_8plus_alc',
    'time_since_last_healthcare',
    'duration_last_healthcare_visit',
    'work_schedule'
]

In [14]:
rf_x_train, rf_x_test, rf_y_train, rf_y_test = get_model_data(
    original_df = cdc_survey_pmom,
    columns = rf_model_features
)

rus = RandomUnderSampler(
    random_state=42, 
    sampling_strategy=0.12,
    replacement=False
)
rf_x_train_rus, rf_y_train_rus = rus.fit_resample(rf_x_train,rf_y_train)

print(rf_x_train.shape)
print(rf_x_test.shape)
print(rf_y_train.shape)
print(rf_y_test.shape)
print(rf_x_train_rus.shape)
print(rf_y_train_rus.shape)

rf = RandomForestClassifier(random_state=42)
rf.fit(rf_x_train_rus, rf_y_train_rus)
rf_pred = rf.predict(rf_x_test)
rf_score = get_performance_df(rf_y_test, rf_pred,'Random Forest Classifier')
rf_score

(2677, 11)
(670, 11)
(2677,)
(670,)
(849, 11)
(849,)


Unnamed: 0,model,grouping,precision,recall,f1-score,support,accuracy
0,Random Forest Classifier,Depressed (No),0.966154,0.98125,0.973643,640.0,0.949254
1,Random Forest Classifier,Depressed (Yes),0.4,0.266667,0.32,30.0,0.949254
2,Random Forest Classifier,Macro Avg,0.683077,0.623958,0.646822,670.0,0.949254
3,Random Forest Classifier,Weighted Avg,0.940804,0.949254,0.944376,670.0,0.949254


## Test Hyperparams

In [37]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Number of trees in random forest
n_estimators = list(range(10,110, 10))
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = list(range(10,120, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

rf_param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_param_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['auto', 'sqrt', 'log2', None],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [None]:
rf_grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid=rf_param_grid
)
rf_grid_search.fit(rf_x_train_rus, rf_y_train_rus)

In [17]:
print(rf_grid_search.best_estimator_)

RandomForestClassifier(max_depth=3, max_leaf_nodes=9, n_estimators=25)


In [None]:
print(rf_x_train_rus.shape)

rf_hyper = RandomForestClassifier(
    n_estimators=,
    max_features=,
    max_depth=,
    min_samples_split=,
    min_samples_leaf=,
    bootstrap=,
    random_state=42
)
rf_hyper.fit(rf_x_train_rus, rf_y_train_rus)
rf_hyper_pred = rf_hyper.predict(rf_x_test)
rf_hyper_score = get_performance_df(rf_y_test, rf_hyper_pred,'Random Forest Classifier')
rf_hyper_score