In [138]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib as plt

%matplotlib inline

In [215]:
# Load the dataset from a CSV file into a DataFrame.
# `target` is the name of the column that is later split off as the label.
target = 'quality_binned'
url = '../data/cookies_target_binned.csv'  # relative local path, not a remote URL
df = pd.read_csv(url)

In [216]:
# Separate the full DataFrame into the label vector (y) and the feature matrix (X).
# The train/test split has not been made yet at this point.
y = df[target]
X = df.drop(columns=target)

In [217]:
# Split features and labels into train and test sets (15% held out for test).
# The fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [218]:
# Standardize the feature sets.
# The scaler is fitted on the training features only and the same fitted scaler
# is then applied to both splits, so the test features never influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [219]:
# Parameter grid for the SVC grid search: polynomial kernel only,
# combining three gamma values with three C values (9 candidates in total).
params_grid = [
    {
        'kernel': ['poly'],
        'gamma': [1e-2, 1e-3, 1e-4],
        'C': [10, 100, 1000],
    },
]

In [133]:
# Build the grid search: an SVC estimator evaluated over params_grid with
# 5-fold cross-validation. No `scoring` is given, so candidates are ranked with
# the estimator's own default score. verbose=3 logs every individual fit.
GS = GridSearchCV(
    estimator=SVC(),
    param_grid=params_grid,
    cv=5,
    verbose=3,
)

In [134]:
# Run the grid search on the scaled training data.
# The fitted search object is the cell's last expression, so its repr is displayed.
GS.fit(X=X_train_scaled, y=y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.766, total=   0.2s
[CV] C=10, gamma=0.01, kernel=poly ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.750, total=   0.2s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.745, total=   0.2s
[CV] C=10, gamma=0.01, kernel=poly ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.751, total=   0.2s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.744, total=   0.2s
[CV] C=10, gamma=0.001, kernel=poly ..................................
[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.714, total=   0.2s
[CV] C=10, gamma=0.001, kernel=poly ..................................
[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.714, total=   0.2s
[CV] C=10, gamma=0.001, kernel=poly ..................................
[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.714, total=   0.2s
[CV] C=10, gamma=0.001, kernel=poly ..................................
[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.714, total=   0.2s
[CV] C=10, gamma=0.001, kernel=poly ..................................
[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.715, total=   0.2s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    8.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001],
                          'kernel': ['poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [220]:
# Display the parameter combination that the fitted grid search selected as best.
# NOTE(review): requires GS to have been fitted already in this kernel session.
GS.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'poly'}

In [221]:
# Create the final SVC and train it on the scaled training set.
# NOTE(review): gamma=0.015 is hard-coded and is not one of the values in the
# search grid; the grid search reported gamma=0.01 as best. Confirm that this
# manual value is intentional (otherwise build the model from GS.best_params_).
SVM = SVC(C=1000, kernel='poly', gamma=0.015)
SVM.fit(X_train_scaled, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.015, kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [222]:
# Predict labels for both splits with the fitted model, so that train and test
# metrics can be compared afterwards.
y_pred = SVM.predict(X_test_scaled)
y_train_pred = SVM.predict(X_train_scaled)

In [223]:
# Score the model on the training split.
conf = confusion_matrix(y_train, y_train_pred)
accuracy = accuracy_score(y_train, y_train_pred)
# Weighted average of the per-class F1 scores (labels/pos_label left at their defaults).
f1 = f1_score(y_train, y_train_pred, average='weighted')

print('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
# Bare last expression so the notebook displays the matrix itself.
conf

TRAIN MODEL METRICS:
The F1 score is: 0.8159666342303092
The accuracy is: 0.8317938745746233
Confusion matrix:


array([[ 404,   65,    0],
       [ 104, 2741,   84],
       [   0,  439,  277]])

In [224]:
# Score the model on the held-out test split.
conf = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# Weighted average of the per-class F1 scores (labels/pos_label left at their defaults).
f1 = f1_score(y_test, y_pred, average='weighted')

print('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
# Bare last expression so the notebook displays the matrix itself.
conf

TEST MODEL METRICS:
The F1 score is: 0.7851186736574328
The accuracy is: 0.803030303030303
Confusion matrix:


array([[ 63,  18,   0],
       [ 31, 482,  14],
       [  0,  80,  38]])