# Linear Models

Check the performance of SVM and logistic regression on our engineered data sets before moving to more complex models.

In [12]:
%reload_ext autoreload
%autoreload 2
import sys, os
import pandas as pd
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path

warnings.filterwarnings('ignore')
DATA_DIR = Path('../src/shared_util/data')
METRICS_DIR = DATA_DIR / 'metrics'
CSV_PATH = METRICS_DIR / 'metrics.csv'
LOG = True



from shared_util.metrics import metrics_db
from shared_util.dataio import load_csv



if LOG:
    metrics_db.read_in(CSV_PATH)

METRICS_DB_ID = 0 # update this to not overwrite previous metrics for these models
METRIC_NOTES = "first pass" # global metric notes for this run


In [13]:
df = load_csv('linear_frame')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, stratify=df['target'])

In [15]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
def make_cv():
    return StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42
    )



## Baseline Logistic Regression

In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler


cv = make_cv()

def get_baseline(sampling_tech) -> float:
    sgd = SGDClassifier(random_state=42)

    pipe = Pipeline([
        ('sampler', sampling_tech),
        ('model', sgd)
    ])

    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv = cv,
        scoring=make_scorer(roc_auc_score)
    )

    print('roc_auc score per fold: ', scores)
    print('mean roc_auc score: ', scores.mean())
    return scores.mean()


print('Baseline SGD Classifier')

roc_auc = get_baseline(RandomUnderSampler(random_state=42))


if LOG:
    metrics_db.log_metric(
        metric_id=METRICS_DB_ID,
        model='sgd_base_undsampl',
        pipeline_notes='under_sample',
        notes=METRIC_NOTES,
        roc_auc=roc_auc
    )



Baseline SGD Classifier
roc_auc score per fold:  [0.55657225 0.51644052 0.52606752 0.57999353 0.53200291 0.53814874
 0.51572614 0.59288614 0.55202988 0.52148174]
mean roc_auc score:  0.5431349375257891


In [17]:
# try SMOTE
from imblearn.over_sampling import SMOTE

print('Baseline ROC_AUC using SMOTE')

roc_auc = get_baseline(SMOTE(random_state=42))

if LOG:
    metrics_db.log_metric(
        metric_id=METRICS_DB_ID,
        model='sgd_base_smote',
        pipeline_notes='SMOTE',
        notes=METRIC_NOTES,
        roc_auc=roc_auc
    )


Baseline ROC_AUC using SMOTE
roc_auc score per fold:  [0.50299684 0.52230439 0.50397785 0.50499412 0.50345801 0.50636255
 0.50990553 0.50285577 0.55401781 0.50812549]
mean roc_auc score:  0.5118998353043852


Interestingly, baseline SGD using under sampling performed worse than baseline random forest using the same, but when using SMOTE, the linear model slightly outperformed the baseline random forest model.

In [19]:
def make_base_sgd(use_smote=False):
    return Pipeline([
        ('sampler',SMOTE(random_state=42) if use_smote else RandomUnderSampler(random_state=42)),
        ('model', SGDClassifier(random_state=42))
    ])

In [20]:
# hyperparam search for SGD
from sklearn.model_selection import RandomizedSearchCV



sgd_param_grid = {
    "model__loss": ["log_loss", "modified_huber", "hinge", "squared_hinge"],
    "model__penalty": ["l2", "l1", "elasticnet"],
    "model__alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    "model__l1_ratio": [0.1, 0.5, 0.9],
    "model__learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
    "model__eta0": [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1],
    "model__max_iter": [100, 500, 1000],
    "model__tol": [1e-3, 1e-4],
    "model__class_weight": [None, "balanced"],
    "model__average": [False, True, 5, 10],
    "model__shuffle": [True],
}

rs = RandomizedSearchCV(
    estimator=make_base_sgd(),
    param_distributions=sgd_param_grid,
    n_iter=30,
    scoring='roc_auc',
    cv=make_cv(),
    n_jobs=-1,
    random_state=42,
    refit=True
)

rs.fit(X_train, y_train)

print(f'Best ROC_AUC (cv): {rs.best_score_}')
print(f'best params: {rs.best_params_}')

res_df = pd.DataFrame(
    rs.cv_results_
).sort_values('rank_test_score').reset_index(drop=True)
res_df[['rank_test_score','mean_test_score','std_test_score','params']].head(10)

Best ROC_AUC (cv): 0.6557625384715677
best params: {'model__tol': 0.001, 'model__shuffle': True, 'model__penalty': 'l2', 'model__max_iter': 100, 'model__loss': 'hinge', 'model__learning_rate': 'invscaling', 'model__l1_ratio': 0.1, 'model__eta0': 0.1, 'model__class_weight': None, 'model__average': 5, 'model__alpha': 0.0001}


Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
0,1,0.655763,0.007474,"{'model__tol': 0.001, 'model__shuffle': True, ..."
1,2,0.655136,0.007946,"{'model__tol': 0.001, 'model__shuffle': True, ..."
2,3,0.654583,0.008272,"{'model__tol': 0.001, 'model__shuffle': True, ..."
3,4,0.654243,0.008371,"{'model__tol': 0.0001, 'model__shuffle': True,..."
4,5,0.653617,0.008124,"{'model__tol': 0.001, 'model__shuffle': True, ..."
5,6,0.653479,0.007097,"{'model__tol': 0.0001, 'model__shuffle': True,..."
6,7,0.653108,0.009318,"{'model__tol': 0.0001, 'model__shuffle': True,..."
7,8,0.652343,0.008005,"{'model__tol': 0.001, 'model__shuffle': True, ..."
8,9,0.65198,0.008667,"{'model__tol': 0.001, 'model__shuffle': True, ..."
9,10,0.649293,0.011812,"{'model__tol': 0.001, 'model__shuffle': True, ..."


In [25]:
y_proba = rs.best_estimator_.decision_function(X_test) #type: ignore
y_proba

array([-0.8049055 ,  0.32642561, -0.89981366, ..., -0.78391024,
        0.79366051, -0.77968065], shape=(20019,))

In [None]:

from shared_util.metrics.printing import print_metrics

y_pred = rs.best_estimator_.predict(X_test) #type: ignore
# roc auc
print('Held Out Test Metrics (SGD)')
#print('ROC AUC:', roc_auc_score(y_test, y_proba))
print_metrics(y_test, y_pred, y_proba, 
              METRICS_DB_ID, 
              METRIC_NOTES,
              model='rf_held_out_test',
              hyperparam_notes="tuned_best_roc_auc",
              pipeline_notes='under_sample',
              log=LOG
              
)



Held Out Test Metrics (Random Forest)
ROC_AUC:     0.653194381302023
Accuracy:      0.6735601178880064
F1 score:      0.27072871331324627
Precision:     0.1813153961136024
Recall:        0.5341259357111404

Confusion matrix:

TN: 12271, FP: 5477, FN: 1058, TP: 1213

Classification report:
               precision    recall  f1-score   support

           0       0.92      0.69      0.79     17748
           1       0.18      0.53      0.27      2271

    accuracy                           0.67     20019
   macro avg       0.55      0.61      0.53     20019
weighted avg       0.84      0.67      0.73     20019

