# Modeling

In [None]:
from operator import mod
from os import getcwd
from os.path import exists, join

import joblib
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
import pandas as pd

In [None]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression, LinearRegression
import warnings
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import  GradientBoostingClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC 
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import recall_score

from sklearn import tree
from sklearn.decomposition import PCA, SparsePCA

from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import json
import pickle
from IPython.display import Image
import warnings

In [None]:
# Load the cleaned survey responses from disk.
df_cdc_clean = pd.read_csv('../data/cdc_nhanes_survey_responses_clean.csv')
# Keep only the rows whose 'has_been_pregnant' flag equals 1 ("moms").
df_cdc_clean = df_cdc_clean.loc[df_cdc_clean['has_been_pregnant'].eq(1)]
# Last expression of the cell: renders the filtered frame in the notebook.
df_cdc_clean

In [None]:
# Keep the 'SEQN' column plus the final 38 columns of the frame.
cols_to_keep = ['SEQN'] + df_cdc_clean.columns[-38:].tolist()
cols_to_keep

In [None]:
# Narrow the frame down to the selected columns, in the order listed.
df_cdc_clean = df_cdc_clean.loc[:, cols_to_keep]
df_cdc_clean

In [None]:
# Target vector: the 'MDD' column.
y = df_cdc_clean['MDD'].values

# Feature matrix: every column from position 2 onward.
# NOTE(review): this positional slice presumably skips 'SEQN' (position 0)
# and 'MDD' (position 1) -- confirm the column order, otherwise the target
# leaks into the features.
X = df_cdc_clean.iloc[:, 2:].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def plot_confusion_matrix(y_test, pred_labels):
    """
    Draw the confusion matrix of true classes versus predicted classes.

    The matrix is computed from ``y_test`` (true labels) and ``pred_labels``
    (predicted labels), rendered on a 5x5-inch figure with the viridis
    colormap, and shown immediately. Returns None.
    """
    matrix = confusion_matrix(y_test, pred_labels)
    figure, axes = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(
        include_values=True,
        cmap='viridis',
        ax=axes,
        xticks_rotation='horizontal',
    )
    # Turn the axes grid off before showing the figure.
    plt.grid(False)
    plt.show()

def baseline_models(
    X_train, 
    y_train, 
    X_test, 
    y_test,
    show_confusion_matrix=True,
    show_score_dataframe=True):
    """
    Fit five baseline classifiers and compare their macro-averaged recall.

    Every classifier is wrapped in the same preprocessing pipeline
    (``SimpleImputer`` -> ``RobustScaler`` -> classifier), fitted on the
    training split and evaluated on the test split. Meant as a helper for
    quickly testing different modeling pipelines.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.
    show_confusion_matrix : bool, default True
        If True, plot one confusion matrix per model.
    show_score_dataframe : bool, default True
        If True, display the styled score table (relies on the notebook's
        ``display`` function).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Model'`` and ``'Recall'`` (macro-averaged recall on the
        test split), sorted from best to worst.
    """
    # (score-table label, confusion-matrix heading, classifier)
    candidates = [
        ('Knn', '\nK-Nearest Neighbors Confusion Matrix', KNeighborsClassifier()),
        ('Logistic Regression', 'Logistic Regression Confusion Matrix', LogisticRegression()),
        ('Bernoulli Naive Bayes', 'Bernoulli Naive Bayes Confusion Matrix', BernoulliNB()),
        ('Gaussian Naive Bayes', 'Gaussian Naive Bayes Confusion Matrix', GaussianNB()),
        ('Random Forest', 'Random Forest Confusion Matrix', RandomForestClassifier(random_state=0)),
    ]

    model_names = []
    recalls = []
    predictions = []
    for model_name, heading, classifier in candidates:
        pipeline = make_pipeline(SimpleImputer(), RobustScaler(), classifier)
        pipeline.fit(X_train, y_train)
        pred_labels = pipeline.predict(X_test)

        model_names.append(model_name)
        # Every model is scored with the same metric so the 'Recall' column
        # is comparable across rows (Gaussian NB previously reported
        # accuracy via ``.score``).
        recalls.append(recall_score(y_test, pred_labels, average='macro'))
        predictions.append((heading, pred_labels))

    # make dataframe with scores
    scores = pd.DataFrame({'Model': model_names, 'Recall': recalls})
    scores = scores.sort_values('Recall', ascending=False)

    if show_score_dataframe:
        styler = scores.style.set_table_attributes('style="font-size: 17px"')
        # Styler.hide_index() was deprecated in pandas 1.4 and removed in
        # 2.0; prefer Styler.hide(axis='index') when it is available.
        if hasattr(styler, 'hide'):
            styler = styler.hide(axis='index')
        else:
            styler = styler.hide_index()
        display(styler)

    if show_confusion_matrix:
        for heading, pred_labels in predictions:
            print(heading)
            plot_confusion_matrix(y_test, pred_labels)

    return scores



In [None]:

# Run the baseline comparison on the train/test split; despite the variable
# name, the returned table holds recall scores (its column is 'Recall').
baseline_accuracies = baseline_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Transform pipeline
The support-vector model below performs terribly; the baseline models above still need a grid search of their own.

In [None]:

# Model Pipeline
# The baselines treat MDD as a class label, so grid-search a support-vector
# *classifier* and select on macro-averaged recall, the same metric the
# baselines report. (The previous SVR regressor was scored by R^2, which is
# not comparable with those results.)
processing_pipeline = make_pipeline(SimpleImputer(), RobustScaler(), SVC())

# Keys follow make_pipeline's naming: lowercase class name + "__" + parameter.
params = {
    "simpleimputer__strategy": ["mean", "median"],
    "robustscaler__quantile_range": [(25.0, 75.0), (30.0, 70.0)],
    "svc__C": [0.1, 1.0],
    "svc__gamma": ["auto", 0.1],
}

grid = GridSearchCV(
    processing_pipeline,
    param_grid=params,
    scoring="recall_macro",
    n_jobs=-1,
    cv=5,
    verbose=3,
)

model_filename = "model_pipeline.pkl"
model_path = join(getcwd(), model_filename)
print(model_path)
grid.fit(X_train, y_train)

# grid.predict delegates to the best pipeline refitted on the full training split.
train_recall = recall_score(y_train, grid.predict(X_train), average="macro")
test_recall = recall_score(y_test, grid.predict(X_test), average="macro")

print(f"Train Macro Recall : {train_recall:.3f}")
print(f"Test Macro Recall : {test_recall:.3f}")
print(f"Best Macro Recall Through Grid Search : {grid.best_score_:.3f}")
print(f"Best Parameters : {grid.best_params_}")
