In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.svm import LinearSVC

from matplotlib import pyplot as plt
%matplotlib inline

# First SVM attempt

The Kaggle competiton is here: [Titanic](https://www.kaggle.com/c/titanic)

## Read in the data

In [2]:
df = pd.read_csv('data/train.csv')

df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


## Convert columns to categories and get codes

In [3]:
obj_cols = ['Sex', 'Ticket', 'Cabin', 'Embarked']

for i in obj_cols:
    df[i] = df[i].astype('category').values.codes

## Get new DataFrame with just numerical cols

In [4]:
df1 = pd.DataFrame()

for name, group in df.groupby(df.dtypes, axis=1):
    if name != 'object':
        for i in group.columns:
            df1[i] = df[i].copy()

df1.head(1)

Unnamed: 0,Sex,Embarked,Ticket,Cabin,PassengerId,Survived,Pclass,SibSp,Parch,Age,Fare
0,1,2,523,-1,1,0,3,1,0,22.0,7.25


## The `sklearn` SVM

In [None]:
cols = ['Sex', 'Embarked', 'Pclass', 'Fare']

X = df1[cols].values
y = df1['Survived'].values

svm_clf = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge'))
])

svm_clf.fit(X,y)

## Combine results

In [9]:
def predictions(x):
    """Use axis=1"""
    p = svm_clf.predict([x])
    return p[0]

In [10]:
df1['Predictions'] = df1[cols].apply(predictions, axis=1)

In [11]:
df1.head()

Unnamed: 0,Sex,Embarked,Ticket,Cabin,PassengerId,Survived,Pclass,SibSp,Parch,Age,Fare,Predictions
0,1,2,523,-1,1,0,3,1,0,22.0,7.25,0
1,0,0,596,81,2,1,1,1,0,38.0,71.2833,1
2,0,2,669,-1,3,1,3,0,0,26.0,7.925,1
3,0,2,49,55,4,1,1,1,0,35.0,53.1,1
4,1,2,472,-1,5,0,3,0,0,35.0,8.05,0


## Performance

In [12]:
df1.pivot_table('Fare',
                index=['Survived'],
                columns=['Predictions'],
                aggfunc='count',
                margins=True)

Predictions,0,1,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,468,81,549
1,109,233,342
All,577,314,891


In [13]:
success_rate = (468+233)/891
print(f'This SVM got {success_rate*100:.4}% correct.')

This SVM got 78.68% correct.


## Save the model

In [14]:
import pickle

In [15]:
s = pickle.dumps(svm_clf)

In [18]:
with open('svm_clf_one', 'wb') as f:
    f.write(s)