In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib as plt

%matplotlib inline

In [2]:
#Read csv
url = '../data/cookies_clean.csv'
df = pd.read_csv(url, sep='\t')
target = 'quality'

In [4]:
#Separating train set into X and y
X = df.drop(target, axis=1)
y = df[target]

In [5]:
#Split DF into train and test (20% test):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Scale the Train and Test feature set 

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
#Grid search for parameter selection for a Random Forest Classifier model
params_grid = [{'kernel': ['rbf'],'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [10]:
#Model creation:
GS = GridSearchCV(SVC(), params_grid, cv=5)

In [None]:
#LAUNCH MODEL:
GS.fit(X_train_scaled, y_train)

In [None]:
#We look for the best parameters of the model:
GS.best_params_

In [21]:
#With the best parameters, we create the final model with the train dataset:
SVM = SVC(kernel = 'linear', gamma=1e-3, C=1)
SVM.fit(X_train_scaled, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [22]:
#Prediction of the test dataset:
y_train_pred = SVM.predict(X_train_scaled)
y_pred = SVM.predict(X_test_scaled)

In [23]:
#We check scores with train:
f1 = f1_score(y_train, y_train_pred, labels=None, pos_label=1, average='weighted')
score = SVM.score(X_train, y_train)

print ('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The model score is: ' + str(score))

  'precision', 'predicted', average, warn_for)


TRAIN MODEL METRICS:
The F1 score is: 0.49057116001613904
The model score is: 0.00241196333815726


In [24]:
#We check scores with test:
f1 = f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')
score = SVM.score(X_test, y_test)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The model score is: ' + str(score))

TEST MODEL METRICS:
The F1 score is: 0.4547577335676343
The model score is: 0.003857280617164899


  'precision', 'predicted', average, warn_for)
