## Modeling - SVM - Parameter selection

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pipeline import *

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import LinearSVC

from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('display.precision', 3)

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) # for reproducibility
rng = 42

### General functions

In [None]:
f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')
roc_auc_macro_scorer = make_scorer(roc_auc_score, average='macro')

scoring_dict = {
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_macro': 'f1_macro',
    'recall_0': recall_class_0_scorer,
    'precision_0': precision_class_0_scorer,
    'AP_macro': average_precision_score_macro,
    'roc_auc_macro': roc_auc_macro_scorer,
}

In [None]:
def hypermodel(X_train, y_train, model, params={}, scoring_dict=scoring_dict, prepA=preprocessing_oh_target, 
               prepB=preprocessing_oh, sampA=RandomUnderSampler(random_state=rng), sampB=SMOTE(random_state=rng)):
    
    pipeAA = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeBB = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeAB = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeBA = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeA0 = imbPipeline([
        ('preprocessing', prepA),
        ('classifier', model)
    ])
    
    pipeB0 = imbPipeline([
        ('preprocessing', prepB),
        ('classifier', model)
    ])

    pipes = [pipeAA, pipeAB, pipeA0, pipeBA, pipeBB, pipeB0]
    grids = []
    for pipe in pipes:
        # grid search
        grid = GridSearchCV(pipe, params, cv=5, scoring=scoring_dict, refit='f1_class_0', n_jobs=-1, verbose=1)
        grid.fit(X_train, y_train)
        grids.append(grid)

    # get max index of grids using f1_class_0 as measure
    max_index = np.argmax([grid.best_score_ for grid in grids])
    # get best grid
    best_grid = grids[max_index]


    return best_grid

Get the data

In [None]:
X_train, X_test, y_train, y_test = get_train_test(balanced=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=rng)
X_train.shape, X_val.shape, X_test.shape

set()


((44310, 33), (11078, 33), (13848, 33))

### SVM 

In [7]:
model = LinearSVC()

params = {
    'classifier__C': [0.01, 0.05, 0.1, 0.2],
    'classifier__class_weight': ['balanced'],
    'classifier__max_iter': [1000, 2000, 3000, 4000, 5000],
    'classifier__dual': [False],
    'classifier__penalty': ['l1', 'l2']
}

init_time = time()
best_linSVC = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=time()-init_time))

Fitting 5 folds for each of 40 candidates, totalling 200 fits




Fitting 5 folds for each of 40 candidates, totalling 200 fits




Fitting 5 folds for each of 40 candidates, totalling 200 fits




Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits




Fitting 5 folds for each of 40 candidates, totalling 200 fits




0:30:24.821046


In [8]:
best_linSVC.best_estimator_

In [9]:
best_linSVC.best_params_

{'classifier__C': 0.05,
 'classifier__class_weight': 'balanced',
 'classifier__dual': False,
 'classifier__max_iter': 1000,
 'classifier__penalty': 'l1'}

In [10]:
scoring_cols = ['param_classifier__C', 'param_classifier__class_weight', 'param_classifier__max_iter',
                'param_classifier__dual', 'param_classifier__penalty', 
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro', 
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro', 'mean_test_roc_auc_macro']

pd.DataFrame(best_linSVC.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__C,param_classifier__class_weight,param_classifier__max_iter,param_classifier__dual,param_classifier__penalty,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
10,0.05,balanced,1000,False,l1,0.51,0.801,0.655,0.724,0.394,0.878,0.719
12,0.05,balanced,2000,False,l1,0.51,0.801,0.655,0.724,0.394,0.878,0.719
16,0.05,balanced,4000,False,l1,0.51,0.801,0.655,0.724,0.394,0.878,0.719
18,0.05,balanced,5000,False,l1,0.51,0.801,0.655,0.723,0.394,0.877,0.719
14,0.05,balanced,3000,False,l1,0.51,0.801,0.655,0.723,0.394,0.877,0.719
