### 1. Package imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

%matplotlib inline

from utility import * # custom functions that are used throughout this script

### 2. Read-in data

In [2]:
# Locate the prepared dataset relative to the directory the notebook is run from.
path_input_file = os.path.join(os.getcwd(), "01_data", "data_prepared.csv")

# Read the CSV and promote PassengerId from a regular column to the row index.
data = pd.read_csv(path_input_file).set_index("PassengerId", drop=True)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,TrainTest,FamilyAllSurvived,FamilyAllDied,Fsize,...,"AgeBin_(64.0, 72.0]","AgeBin_(72.0, 80.0]",Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_None,Deck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3,22,1,0,1.98,Train,0,1,2,...,0,0,0,0,0,0,0,0,1,0
2,1.0,1,38,1,0,4.27,Train,0,0,2,...,0,0,0,1,0,0,0,0,0,0
3,1.0,3,26,0,0,2.07,Train,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1.0,1,35,1,0,3.97,Train,0,0,2,...,0,0,0,1,0,0,0,0,0,0
5,0.0,3,35,0,0,2.09,Train,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### 3. Data prep

#### Split data

In [3]:
# Partition the combined frame into the two frames used by the rest of the notebook.
# NOTE(review): split_data is imported from utility.py (not visible here); presumably it
# splits on the "TrainTest" column and removes it — confirm against the helper.
train, test = split_data(data)

#### Apply StandardScaler

In [4]:
# Numeric features that get standardised; every other column is left untouched.
columns_to_scale = ["Age", "Fare", "Pclass", "SibSp", "Parch", "Fsize"]

# Cast to float before fitting so the columns already carry the dtype of the scaled values.
train[columns_to_scale] = train[columns_to_scale].astype(float)

# Fit on the training partition only; the same fitted scaler is reused for `test` later.
# Caution: `train` is overwritten in place further down, so re-running this cell after
# the transform cell would fit the scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(train[columns_to_scale])

In [5]:
# Caution: both frames are overwritten in place, so running this cell twice
# applies the scaling twice. Restart and run all cells to get a clean state.

# Test partition: cast to float first (mirrors what was done for `train`), then scale
# with the statistics learned from the training partition.
test[columns_to_scale] = test[columns_to_scale].astype(float)
test[columns_to_scale] = scaler.transform(test[columns_to_scale])

# Training partition: already cast to float in the cell that fitted the scaler.
train[columns_to_scale] = scaler.transform(train[columns_to_scale])

### 4. Grid Search

In [6]:
def run_grid_search(X, y, parameter_grid=None, cv=10, n_jobs=-1, verbose=2,
                    probability=True, random_state=None):
    """Run a cross-validated grid search over SVC hyper-parameters.

    All keyword arguments default to the values that were previously
    hard-coded, so ``run_grid_search(X, y)`` behaves as before.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target labels passed to ``GridSearchCV.fit``.
    parameter_grid : dict or list of dict, optional
        Grid handed to ``GridSearchCV`` as ``param_grid``. When ``None`` the
        default grid (linear / rbf / poly kernels) defined below is used.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``GridSearchCV``.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``.
    verbose : int, default 2
        Forwarded to ``GridSearchCV``.
    probability : bool, default True
        Forwarded to ``SVC``. Per the scikit-learn documentation, enabling it
        slows down ``fit`` because of an additional internal cross-validation;
        pass ``False`` for a faster search when probabilities are not needed.
    random_state : int or None, default None
        Forwarded to ``SVC``; set it to make repeated runs reproducible.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if parameter_grid is None:
        # Built per call so callers can never mutate a shared default.
        # Note: only the linear kernel searches over C; rbf and poly use SVC's default C.
        parameter_grid = [
            {"C": [1, 10, 100, 1000], "kernel": ["linear"]},
            {"gamma": [0.001, 0.0001], "kernel": ["rbf"]},
            {"degree": [3, 5, 7], "gamma": [0.001, 0.0001], "kernel": ["poly"]},
        ]

    svc = SVC(probability=probability, random_state=random_state)
    grid_search = GridSearchCV(
        estimator=svc,
        param_grid=parameter_grid,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X, y)

    return grid_search

In [7]:
# Target is the "Survived" column; every remaining column of `train` is a feature.
y = train["Survived"]
X = train.drop(columns=["Survived"])

# Long-running: the captured output below reports roughly 20 minutes for 120 fits.
grid_search = run_grid_search(X, y)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 20.4min finished


In [9]:
# Display the hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'C': 1, 'kernel': 'linear'}

In [10]:
# Train the final classifier on the full training partition using the hyper-parameters
# chosen by the grid search. They are read from `grid_search.best_params_` rather than
# typed in by hand, so this cell stays correct if the grid or the data changes.
svc = SVC(probability=True, **grid_search.best_params_)
svc.fit(train.drop("Survived", axis=1), train["Survived"])

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
# Write the predictions for the test partition to a two-column CSV (PassengerId, Survived).
path_output_folder = os.path.join(os.getcwd(), "02_predictions")
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(path_output_folder, exist_ok=True)

# NOTE(review): assumes `test` holds exactly the feature columns the model was fitted on
# (i.e. no "Survived" column) — confirm against split_data in utility.py.
predictions = pd.DataFrame(
    {
        "PassengerId": test.index,
        # The target is stored as 0.0 / 1.0 in the input data, so cast the labels to int.
        "Survived": svc.predict(test).astype(int),
    },
    columns=["PassengerId", "Survived"],  # pin the column order explicitly
)
predictions.to_csv(os.path.join(path_output_folder, "predictions_svc_optimized.csv"), index=False)