### Support Vector Machines

#### Table of Contents <a name='top'></a>

- [Load Modules and Set Notebook Properties](#modules)
- [Define Path and Load Data](#load)
- [Inspect Data](#inspect)
- [Prepare and Clean Data](#prepare)
- [Scale Values](#scale)
- [Fit Model](#fit)
- [Hyperparamter Tuning](#hyperparameter)

[go to end](#end)

#### Load Modules and Set Notebook Properties <a name='modules'></a>

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

#### Define Path and Load Data  <a name='load'></a> 

In [3]:
INPUT_PATH = 'raw_data_source'
OUTPUT_PATH = 'outputs'

Insert note on the data source

In [4]:
train_data = pd.read_csv(os.path.join(INPUT_PATH, 'titanic_train_data.csv'))
test_data =  pd.read_csv(os.path.join(INPUT_PATH, 'titanic_test_data.csv'))

#### Inspect Data <a name='inspect'></a> 

In [5]:
print(f'Shape of training data : {train_data.shape}')
print(f'Shape of testing data : {test_data.shape}')

Shape of training data : (712, 25)
Shape of testing data : (179, 25)


In [6]:
train_data.sample(5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
589,1,29.0,26.0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
29,1,14.0,11.2417,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
196,0,29.699118,7.8958,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
400,0,29.0,7.0458,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
412,0,60.0,26.55,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [7]:
test_data.sample(5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
63,0,26.0,16.1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
73,0,61.0,6.2375,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
81,0,29.699118,7.75,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
178,0,29.699118,39.6,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
139,0,51.0,7.0542,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


#### Prepare and Clean Data <a name='prepare'></a> 

In [22]:
# seperate the independent and target variable on training, and test data
X_train = train_data.drop(columns=['Survived'],axis=1)
y_train = train_data['Survived']
X_test = test_data.drop(columns=['Survived'],axis=1)
y_test = test_data['Survived']

#### Scale Values <a name='scale'></a> 


Insert explanation on why the fitting of the scaler should only be done on the training set. 

In [23]:
def scale_values(X_train, X_test, scaler='standard'):
    
    scaler_dict = {'standard': StandardScaler(), 
                    'minmax': MinMaxScaler(), 
                    'normal': Normalizer()}
    if scaler not in scaler_dict.keys():
        raise ValueError("Enter a valid value for scaler! Choose between 'standard', 'minmax', 'normal'.")
    else:
        scl = scaler_dict[scaler]
        X_train = scl.fit_transform(X_train)
        X_test = scl.transform(X_test) 
        return X_train, X_test

#### Fit a Model <a name='fit'></a> 

In [24]:
def svc_plain(X_train, y_train):
    clf = SVC()
    clf.fit(X_train, y_train)
    return clf

with Standard scaling

In [25]:
X_train_, X_test_ = scale_values(X_train, X_test, scaler='standard')
clf = svc_plain(X_train_, y_train)
y_pred = clf.predict(X_test_)
acc = accuracy_score(y_test, y_pred)
print('Accuracy score on train dataset : {:0.5f}'.format(acc))

Accuracy score on train dataset : 0.80447


with Normal scaling

In [26]:
X_train_, X_test_ = scale_values(X_train, X_test, scaler='normal')
clf = svc_plain(X_train_, y_train)
y_pred = clf.predict(X_test_)
acc = accuracy_score(y_test, y_pred)
print('Accuracy score on train dataset : {:0.5f}'.format(acc))

Accuracy score on train dataset : 0.77095


with MinMax scaling

In [27]:
X_train_, X_test_ = scale_values(X_train, X_test, scaler='minmax')
clf = svc_plain(X_train_, y_train)
y_pred = clf.predict(X_test_)
acc = accuracy_score(y_test, y_pred)
print('Accuracy score on train dataset : {:0.5f}'.format(acc))

Accuracy score on train dataset : 0.82123


No scaling

In [28]:
clf = svc_plain(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy score on train dataset : {:0.3f}'.format(acc))

Accuracy score on train dataset : 0.726


#### Hyperparameter Optimization Using GridSearch CV

[link](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

In [30]:
def svc_with_hyperparameter_tuning(X_train, y_train):
    
    # defining parameter range
    param_grid = {'C': [0.1, 1, 10, 100, 1000], 
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf', 'poly', 'linear']} 
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
    grid.fit(X_train, y_train)
    
    return grid

In [36]:
X_train_, X_test_ = scale_values(X_train, X_test, scaler='standard')
grid = svc_with_hyperparameter_tuning(X_train, y_train)
y_pred = grid.predict(X_test)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.608 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.608 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.608 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.608 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.606 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.629 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       117
           1       0.75      0.69      0.72        62

    accuracy                           0.82       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.82      0.81       179




[go to top](#top)

--end--
<a name='end'></a> 