In [None]:
# Build a profiling report titled "Profiling Report" for the df_cdc_clean dataframe.
# NOTE(review): ProfileReport is presumably pandas-profiling / ydata-profiling — confirm the import.
ProfileReport(df_cdc_clean, title="Profiling Report")

In [None]:
def plot_confusion_matrix(y_test, pred_labels):
    """
    Draw the confusion matrix for the given true and predicted class labels.

    The matrix is rendered on a 5x5-inch figure with cell values shown, the
    'viridis' colour map and horizontal x tick labels. The figure is shown
    immediately and nothing is returned.
    """
    matrix = confusion_matrix(y_test, pred_labels)
    _, axes = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(
        include_values=True,
        cmap='viridis',
        ax=axes,
        xticks_rotation='horizontal',
    )
    # Grid lines would be drawn over the matrix cells, so switch them off.
    plt.grid(False)
    plt.show()

def get_performance_df(label_actual, label_pred, index_label):
    """
    Summarise a model's macro-averaged classification metrics as a one-row table.

    Takes the 'macro avg' entry of the classification report (precision,
    recall, F1 and support) computed from the actual and predicted labels, and
    returns it as a DataFrame whose single row is labelled ``index_label``.
    """
    macro_avg = classification_report(
        label_actual, label_pred, output_dict=True
    )['macro avg']
    return pd.DataFrame(macro_avg, index=[index_label])

def baseline_models(
    X_train,
    y_train,
    X_test,
    y_test,
    do_smote=True,
    show_confusion_matrix=False,
    show_score_dataframe=False):
    """
    Train five baseline classifiers and compare their macro-averaged metrics.

    Meant as a helper for quickly testing different modeling pipelines. Each
    classifier (K-Nearest Neighbors, Logistic Regression, Bernoulli Naive
    Bayes, Gaussian Naive Bayes, Random Forest) is wrapped in a pipeline of
    SimpleImputer -> RobustScaler -> classifier, fitted on the training data
    and evaluated on the test data.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    do_smote : if truthy, impute the features and oversample the training set
        with SMOTE (random_state=42) before fitting the models.
    show_confusion_matrix : if truthy, plot a confusion matrix per model.
    show_score_dataframe : if truthy, display the styled score table.

    Returns
    -------
    DataFrame with one row per model holding the macro-average metrics from
    get_performance_df, sorted by 'recall' in descending order.
    """
    if do_smote:
        # SMOTE cannot handle missing values, so impute before oversampling.
        # The imputer is fitted on the training data only and merely applied
        # to the test data; re-fitting it on the test set would impute test
        # rows with test-set statistics instead of the training ones.
        smote_imputer = SimpleImputer()
        X_train = smote_imputer.fit_transform(X_train)
        X_test = smote_imputer.transform(X_test)

        X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # (row label in the score table, confusion-matrix title, estimator)
    classifiers = [
        ('Knn', 'K-Nearest Neighbors', KNeighborsClassifier()),
        ('Logistic Regression', 'Logistic Regression', LogisticRegression()),
        ('Bernoulli Naive Bayes', 'Bernoulli Naive Bayes', BernoulliNB()),
        ('Gaussian Naive Bayes', 'Gaussian Naive Bayes', GaussianNB()),
        ('Random Forest', 'Random Forest', RandomForestClassifier(random_state=0)),
    ]

    score_frames = []
    predictions = []
    for index_label, title, classifier in classifiers:
        # SimpleImputer() fills in missing values; RobustScaler() scales the
        # features using statistics that are robust to outliers.
        model = make_pipeline(SimpleImputer(), RobustScaler(), classifier)
        model.fit(X_train, y_train)
        pred_labels = model.predict(X_test)
        score_frames.append(get_performance_df(y_test, pred_labels, index_label))
        predictions.append((title, pred_labels))

    # One table with all models, best macro recall first.
    scores = pd.concat(score_frames).sort_values(by='recall', ascending=False)

    if show_score_dataframe:
        styler = scores.style.set_table_attributes('style="font-size: 17px"')
        # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
        # prefer Styler.hide(axis='index') and fall back on older versions.
        if hasattr(styler, 'hide'):
            styler = styler.hide(axis='index')
        else:
            styler = styler.hide_index()
        display(styler)

    if show_confusion_matrix:
        # Blank line separating the plots from any preceding output.
        print()
        for title, pred_labels in predictions:
            print(f'{title} Confusion Matrix')
            plot_confusion_matrix(y_test, pred_labels)

    return scores

## Hyperparameter grid search
SVR is terrible, but we still need to do some grid search on the above baselines

In [None]:

# Where a pickled model would live; in this cell the path is only printed.
model_filename = "model_pipeline.pkl"
model_path = join(getcwd(), model_filename)

# Model pipeline: impute missing values -> robust scaling -> support vector regressor
processing_pipeline = make_pipeline(SimpleImputer(), RobustScaler(), SVR())

# Search space; the "<step>__<param>" keys address the pipeline steps by name.
params = dict(
    simpleimputer__strategy=["mean", "median"],
    robustscaler__quantile_range=[(25.0, 75.0), (30.0, 70.0)],
    svr__C=[0.1, 1.0],
    svr__gamma=["auto", 0.1],
)

# Exhaustive 5-fold cross-validated search using all available cores.
grid = GridSearchCV(
    processing_pipeline,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

print(model_path)
grid.fit(X_train, y_train)

# Report how the best pipeline found by the search performs.
print(f"Train R^2 Score : {grid.best_estimator_.score(X_train, y_train):.3f}")
print(f"Test R^2 Score : {grid.best_estimator_.score(X_test, y_test):.3f}")
print(f"Best R^2 Score Through Grid Search : {grid.best_score_:.3f}")
print(f"Best Parameters : {grid.best_params_}")
