In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.svm import LinearSVC

from matplotlib import pyplot as plt
%matplotlib inline

# First SVM attempt

The Kaggle competiton is here: [Titanic](https://www.kaggle.com/c/titanic)

## Read in the data

In [2]:
df = pd.read_csv('data/train.csv')

df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


## Get new DataFrame with just numerical cols

In [3]:
from transformers import Transformer

In [4]:
transformer = Transformer()

In [5]:
X, y = transformer.transform(df)

In [6]:
for i in (X, y):
    print(i[:10])

[[ 0.73769513  0.58796609  0.82737724 -0.50244517]
 [-1.35557354 -1.91264387 -1.56610693  0.78684529]
 [-1.35557354  0.58796609  0.82737724 -0.48885426]
 [-1.35557354  0.58796609 -1.56610693  0.42073024]
 [ 0.73769513  0.58796609  0.82737724 -0.48633742]
 [ 0.73769513 -0.66233889  0.82737724 -0.47811643]
 [ 0.73769513  0.58796609 -1.56610693  0.39581356]
 [ 0.73769513  0.58796609  0.82737724 -0.22408312]
 [-1.35557354  0.58796609  0.82737724 -0.42425614]
 [-1.35557354 -1.91264387 -0.36936484 -0.0429555 ]]
[0 1 1 1 0 0 0 0 1 1]


## The `sklearn` SVM

In [7]:
# X = df1[cols].values
# y = df1['Survived'].values

svm_clf = Pipeline([
    ('linear_svc', LinearSVC(C=1, loss='hinge'))
])

svm_clf.fit(X,y)

Pipeline(memory=None,
     steps=[('linear_svc', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])

## Combine results

In [8]:
def predictions(x):
    """Use axis=1"""
    p = svm_clf.predict([x])
    return p[0]

In [9]:
cols = ['Sex', 'Embarked', 'Pclass', 'Fare']

In [10]:
X[:5]

array([[ 0.73769513,  0.58796609,  0.82737724, -0.50244517],
       [-1.35557354, -1.91264387, -1.56610693,  0.78684529],
       [-1.35557354,  0.58796609,  0.82737724, -0.48885426],
       [-1.35557354,  0.58796609, -1.56610693,  0.42073024],
       [ 0.73769513,  0.58796609,  0.82737724, -0.48633742]])

In [11]:
dfp = pd.DataFrame(X, columns=cols)
dfp.head(1)

Unnamed: 0,Sex,Embarked,Pclass,Fare
0,0.737695,0.587966,0.827377,-0.502445


In [12]:
dfp['Predictions'] = dfp.apply(predictions, axis=1)

In [13]:
dfp.head(10)

Unnamed: 0,Sex,Embarked,Pclass,Fare,Predictions
0,0.737695,0.587966,0.827377,-0.502445,0
1,-1.355574,-1.912644,-1.566107,0.786845,1
2,-1.355574,0.587966,0.827377,-0.488854,1
3,-1.355574,0.587966,-1.566107,0.42073,1
4,0.737695,0.587966,0.827377,-0.486337,0
5,0.737695,-0.662339,0.827377,-0.478116,0
6,0.737695,0.587966,-1.566107,0.395814,0
7,0.737695,0.587966,0.827377,-0.224083,0
8,-1.355574,0.587966,0.827377,-0.424256,1
9,-1.355574,-1.912644,-0.369365,-0.042956,1


## Performance

In [14]:
df['Predictions'] = dfp['Predictions'].copy()

In [15]:
df.pivot_table('Fare',
                index=['Survived'],
                columns=['Predictions'],
                aggfunc='count',
                margins=True)

Predictions,0,1,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,468,81,549
1,109,233,342
All,577,314,891


In [16]:
deaths = df[(df['Survived'] == 0) & (df['Predictions'] == 0)]
lives = df[(df['Survived'] == 1) & (df['Predictions'] == 1)]

lived = len(deaths.index)
died = len(lives.index)
total = len(df.index)

success_rate = (lived + died)/total
print(f'This SVM got {success_rate*100:.4}% correct on the training set.')

This SVM got 78.68% correct on the training set.


## Save the model

In [17]:
import pickle

In [18]:
s = pickle.dumps(svm_clf)

In [19]:
with open('svm_clf_one', 'wb') as f:
    f.write(s)