# Section 2-2 - SVM with Parameter Tuning

Now we simply use Support Vector Machines with parameter tuning using GridSearch SV.

## Pandas - Extracting data

In [125]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/train.csv')

## Pandas - Cleaning data

In [4]:
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)

from scipy.stats import mode

mode_embarked = mode(df['Embarked'])[0][0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]

df = df[cols]

train_data = df.values

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


##Scikit-learn - Feature selection
Quite often feature selection could improve model performance if the input data contain very noisy/irrelevant variables, or if the variables are highly correlated.

In [22]:
# Univariate feature ranking and selection
# RFE: Recursive Feature Elimination

from sklearn.svm import SVC
from sklearn.feature_selection import RFE
estimator = SVC(kernel="linear")
selector = RFE(estimator, 5, step=1)
selector = selector.fit(train_data[0:,2:], train_data[0:,0])

In [34]:
print('Selected features using RFE: ', df.columns.values[selector.support_])
print(selector.support_ )
print(selector.ranking_ )
print(train_data.shape)

# Subset of training input with selected features only
print(selector.transform(train_data[0:,2:].shape)

('Selected features using RFE: ', array(['Survived', 'Age', 'Parch', 'Gender', 'Embarked_C'], dtype=object))
[ True False False  True False  True False  True  True]
[1 4 3 1 5 1 2 1 1]
(891, 11)
(891, 5)


## Scikit-learn - Training the model

### Grid search for parameter tuning without feature selection
First, without feature selection we build the parameter grid for SVC, noting that the default parameters are C = 1.0, and  gamma = 0.0. 

In [74]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold as SKFold

parameter_grid = [
    {'kernel':['linear'], 'C': [1., 10.]},
    {'kernel':['rbf'], 'C': [1., 10.], 'gamma': [0.1, 1.]}
]

from sklearn.cross_validation import StratifiedKFold as SKFold
# Controling the random split of the cross-validation and using stratified CV.
random_seed = 1234
scv = SKFold(y=train_data[0:,0], n_folds=5, random_state=random_seed)

grid_search = GridSearchCV(SVC(), parameter_grid, cv=scv, verbose=3)
grid_search.fit(train_data[0:,2:], train_data[0:,0])

# Sorting the score and print out the parameters with the best scores
sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.804469 -   6.1s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.810056 -   4.7s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.786517 -   6.6s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.752809 -  34.2s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.785311 -  44.4s
[CV] kernel=linear, C=10.0 ...........................................
[CV] .................. kernel=linear, C=10.0, score=0.804469 -  35.2s
[CV] kernel=linear, C=10.0 ...........................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    6.1s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  7.1min finished



0.787878787879
{'kernel': 'linear', 'C': 1.0}


In [77]:
from sklearn.cross_validation import StratifiedKFold as SKFold
# Controling the random split of the cross-validation and using stratified CV.
random_seed = 100000
scv = SKFold(y=train_data[0:,0], n_folds=5, random_state=random_seed)

# Use .transform method to get the input data with selected features
print(selector.transform(train_data[0:,2:]).shape)

grid_search_FS = GridSearchCV(SVC(), parameter_grid, cv=scv, verbose=3)
grid_search_FS.fit(selector.transform(train_data[0:,2:]), train_data[0:,0])

# print out the best scoring parameters
sorted(grid_search_FS.grid_scores_, key=lambda x: x.mean_validation_score)
print(grid_search_FS.best_score_)
print(grid_search_FS.best_params_)

(891, 5)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.804469 -   0.0s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.804469 -   0.0s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.786517 -   0.0s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.752809 -   0.0s
[CV] kernel=linear, C=1.0 ............................................
[CV] ................... kernel=linear, C=1.0, score=0.785311 -   0.0s
[CV] kernel=linear, C=10.0 ...........................................
[CV] .................. kernel=linear, C=10.0, score=0.804469 -   0.0s
[CV] kernel=linear, C=10.0 ...........................................
[CV] ...

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.7s finished



0.804713804714
{'kernel': 'rbf', 'C': 10.0, 'gamma': 0.1}


Choose (hyper-)parameters and features that gives the best performance on CV.

In [54]:
#model = SVC(kernel='linear', C=1.0, gamma=0.1)
#model = model.fit(train_data[0:,2:], train_data[0:,0])

model = SVC(kernel='rbf', C=10.0, gamma=0.1)
model = model.fit(selector.transform(train_data[0:,2:]), train_data[0:,0])

## Scikit-learn - Making predictions

In [121]:
df_test = pd.read_csv('../data/test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

df_test['Age'] = df_test['Age'].fillna(age_mean)

fare_means = df.pivot_table('Fare', index='Pclass', aggfunc='mean')
df_test['Fare'] = df_test[['Fare', 'Pclass']].apply(lambda x:
                            fare_means[x['Pclass']] if pd.isnull(x['Fare'])
                            else x['Fare'], axis=1)

df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')],
                axis=1)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

test_data = df_test.values

df_test.head(5)
# Use the same selected features for prediction
output = model.predict(selector.transform(test_data[0:,1:]))
# It's also possible to output the decision function (the continuous score) before taking the binary output.
output1 = model.decision_function(selector.transform(test_data[0:,1:]))
# Store the prediction output in a DataFrame
df_predict=pd.DataFrame(np.c_[test_data[0:,0], output1, output], columns=['Passengerid', 'Decision_score', 'Prediction'])
df_predict.head(10)


Unnamed: 0,Passengerid,Decision_score,Prediction
0,892,-1.000003,0
1,893,-0.332941,0
2,894,-1.00268,0
3,895,-1.000291,0
4,896,-0.732691,0
5,897,-1.000291,0
6,898,0.99964,1
7,899,-0.737536,0
8,900,0.898513,1
9,901,-1.000291,0


## Pandas - Preparing for submission

In [128]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]

df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])
df_result.to_csv('results/titanic_2-5.csv', index=False)