In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [2]:
# Load the raw training CSV; the path is relative to the notebook's working directory.
train_raw = pd.read_csv('data_sets/train.csv')

In [3]:
def preprocess(data):
    """Return a numerically encoded copy of a passenger DataFrame.

    'Sex' is mapped to 1 (male) / 0 (female) and 'Embarked' to 0 (C), 1 (Q),
    2 (S). Any value missing from those mappings becomes NaN, and every NaN in
    the frame (in any column) is then replaced by -1.

    The input frame is left untouched, so the calling cell can be re-run
    safely: encoding an already-encoded frame would turn every 'Sex' and
    'Embarked' value into -1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the 'Sex' and 'Embarked' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns and no missing values.
    """
    gender_map = {'male': 1, 'female': 0}
    embarked_map = {'C': 0, 'Q': 1, 'S': 2}
    # Work on a copy so the caller's raw frame keeps its original values.
    encoded = data.copy()
    encoded['Sex'] = encoded['Sex'].map(gender_map)
    encoded['Embarked'] = encoded['Embarked'].map(embarked_map)
    # -1 is the sentinel for "missing" across all columns.
    return encoded.fillna(-1)

In [4]:
# Free-text columns that are not used as model features.
ignore = ['Name', 'Ticket', 'Cabin']

# Encode the raw frame, then separate the label column from the features.
train = preprocess(train_raw)
train_y = train['Survived']
train_x = train.drop(['Survived'] + ignore, axis=1)

# Sanity-check the dimensions of the feature matrix and the label vector.
print(train_x.shape)
print(train_y.shape)

(891, 8)
(891,)


In [5]:
# Hold out 100 rows for validation (an integer test_size is a row count, not a
# fraction); random_state=0 makes the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(train_x, train_y, test_size=100, random_state=0)

In [6]:
# Baseline RBF-kernel SVM; every other hyperparameter is left at the library default.
svm = SVC(kernel='rbf')

In [7]:
# Fit the baseline on the training split.
# NOTE(review): the feature columns are passed to the RBF kernel unscaled and
# still include PassengerId (only Survived/Name/Ticket/Cabin were dropped) --
# confirm that is intended.
svm.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
# Baseline predictions for the held-out validation rows.
predictions = svm.predict(X_valid)

In [10]:
def calc_error(predictions, actuals):
    """Count the positions at which predictions and actuals disagree.

    The first parameter used to be misspelled, so the body silently read the
    notebook-level ``predictions`` variable instead of the argument it was
    given. The comparison now uses the argument.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    actuals : array-like
        True labels, in the same order and of the same length.

    Returns
    -------
    float
        Number of mismatching positions (a float so that dividing it by a
        length yields a fraction under Python 2 as well).
    """
    # Convert to arrays so plain lists are compared element-wise, not as
    # whole objects.
    mismatches = np.asarray(predictions) != np.asarray(actuals)
    return np.sum(mismatches, dtype=float)
def error_rate(predictions, actuals):
    """Return the mismatch count from calc_error divided by len(predictions).

    Parameters
    ----------
    predictions : sequence
        Predicted labels; its length is the denominator.
    actuals : sequence
        True labels, forwarded to calc_error together with predictions.
    """
    mismatches = calc_error(predictions, actuals)
    return mismatches / len(predictions)

In [11]:
# Validation error rate of the baseline model; .values hands error_rate a plain
# array of labels rather than an indexed pandas Series.
print error_rate(predictions,Y_valid.values)

0.43


In [13]:
def cv_mean_errors(features, labels, penalties, n_folds):
    """Cross-validate an RBF SVC for each penalty and report the mean fold error.

    For every value in ``penalties`` an ``SVC(kernel='rbf', C=1./penalty)`` is
    fitted on each training fold and scored on the matching validation fold.
    One line ``<penalty> : <mean misclassified per fold>`` is printed per
    penalty.

    Keeping the loop inside a function means the per-fold models and slices
    stay local: the notebook-level ``svm`` is no longer overwritten and no
    ``x_train``/``y_train`` temporaries are left behind next to
    ``X_train``/``Y_train``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows to split into folds.
    labels : pandas.Series
        Labels aligned positionally with ``features``.
    penalties : iterable of float
        Candidate penalties; each is converted to ``C = 1 / penalty``.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    list of float
        Mean number of misclassified validation samples per fold, in the same
        order as ``penalties``.
    """
    # The splitter is seeded, so one instance yields the same folds for every
    # penalty; there is no need to rebuild it inside the loop.
    folds = KFold(len(features), n_folds=n_folds, shuffle=True, random_state=0)
    mean_errors = []
    for penalty in penalties:
        misclassified = 0
        for train_index, val_index in folds:
            model = SVC(kernel='rbf', C=1./penalty)
            model.fit(features.iloc[train_index], labels.iloc[train_index])
            predicted = model.predict(features.iloc[val_index])
            # dtype=float keeps the running total a float, so the division
            # below is true division under Python 2.
            misclassified += np.sum(predicted != labels.iloc[val_index].values,
                                    dtype=float)
        mean_error = misclassified / n_folds
        print('%s : %s' % (penalty, mean_error))
        mean_errors.append(mean_error)
    return mean_errors


k = 10
# 21 log-spaced candidates from 1e-10 to 1e10.
penalty_range = np.logspace(-10, 10, 21)
errors = cv_mean_errors(X_train, Y_train, penalty_range, k)
# argmin returns the first minimum, so ties resolve to the smallest penalty.
best_penalty = penalty_range[np.argmin(errors)]
print('best penalty: %s' % best_penalty)

1e-10 : 31.4
1e-09 : 31.4
1e-08 : 31.4
1e-07 : 31.4
1e-06 : 31.4
1e-05 : 31.4
0.0001 : 31.4
0.001 : 31.4
0.01 : 31.4
0.1 : 31.4
1.0 : 29.8
10.0 : 29.9
100.0 : 29.9
1000.0 : 29.9
10000.0 : 29.9
100000.0 : 29.9
1000000.0 : 29.9
10000000.0 : 29.9
100000000.0 : 29.9
1000000000.0 : 29.9
10000000000.0 : 29.9
best penalty: 1.0


In [14]:
# The search treated the penalty as the inverse of SVC's C, so invert it again here.
test_svm = SVC(kernel='rbf',C=1./best_penalty)

In [15]:
# Fit the tuned model on the training split (the validation rows stay held out).
test_svm.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
# Validation predictions from the tuned model; this rebinds `predictions`,
# replacing the baseline model's predictions.
predictions = test_svm.predict(X_valid)

In [19]:
# Validation error rate of the tuned model. The recorded search picked a
# penalty of 1.0, i.e. C=1.0, which is the same C the baseline repr shows.
print error_rate(predictions,Y_valid.values)

0.43


In [24]:
# NOTE(review): this repeats the earlier test_svm.fit(X_train, Y_train) call
# with the same data. Presumably left over from re-running cells -- confirm
# whether a refit on all of train_x/train_y was intended before predicting.
test_svm.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [25]:
# Load the held-out test CSV; the path is relative to the notebook's working directory.
test_data = pd.read_csv('data_sets/test.csv')
# Hand preprocess() a copy so test_data keeps the raw values whether or not
# preprocess() modifies its argument. The copy used to be made and then
# thrown away, with test_data itself passed in instead.
test = preprocess(test_data.copy())
# Drop the same unused text columns as for training; there is no 'Survived'
# column to remove from the test file.
test_x = test.drop(ignore, axis=1)
test_predictions = test_svm.predict(test_x)

In [26]:
# Eyeball the predicted labels before writing the submission file.
# NOTE(review): the recorded output is 0 for every row, i.e. no passenger is
# predicted to survive -- worth investigating before submitting.
print test_predictions

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [22]:
# Two-column table: passenger id next to the predicted label.
submission = np.c_[test_x['PassengerId'], test_predictions]

# fmt='%d' writes both columns as integers; comments='' stops numpy from
# prefixing the header line with its default '# '.
np.savetxt('titanic_rbf_svm.csv', submission,
           fmt='%d', delimiter=',',
           header='PassengerId,Survived', comments='')