In [3]:
import numpy as np
import pandas as pd
from datetime import datetime
from preprocess import *
from useful_tools import *
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [4]:
train_raw = pd.read_csv('data_sets/train.csv')

In [5]:
train = preprocess(train_raw)

In [6]:
train.to_csv('data_sets/preprocessed_v1.csv')

In [7]:
train_x = train.drop(['Survived'],axis=1)
train_y = train['Survived'].values
train_x.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender,Same_Last,ticket_suffix,Embarked_S,Embarked_C,...,ticket_prefix_A/S,ticket_prefix_SC/AH Basle,ticket_prefix_A/4,ticket_prefix_WE/P,ticket_prefix_S.W./PP,ticket_prefix_S.O./P.P.,ticket_prefix_F.C.,ticket_prefix_SOTON/O2,ticket_prefix_S.C./PARIS,ticket_prefix_C.A./SOTON
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.361582,0.523008,0.381594,32.204208,0.647587,0.882155,297745.874299,0.722783,0.188552,...,0.001122,0.001122,0.003367,0.002245,0.001122,0.003367,0.001122,0.002245,0.002245,0.001122
std,0.836071,13.019697,1.102743,0.806057,49.693429,0.47799,1.488014,656159.899304,0.447876,0.391372,...,0.033501,0.033501,0.057961,0.047351,0.033501,0.057961,0.033501,0.047351,0.047351,0.033501
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,541.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,14882.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,112379.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,35.0,1.0,0.0,31.0,1.0,1.0,347082.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,80.0,8.0,6.0,512.3292,1.0,8.0,3101317.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
X_train, X_valid, Y_train, Y_valid = train_test_split(train_x, train_y, test_size=0.15, random_state=0)

In [9]:
def cross_validation(train_x,train_y,p_range,k=50,parameter='C',silence=True,**kwargs):
    errors = []
    for p in p_range:
        kfold = KFold(len(train_x), n_folds=k,shuffle=True)
        error = 0
        params = dict({parameter:p},**kwargs)
        for train_index, val_index in kfold:
            x_train = train_x.iloc[train_index]
            y_train = train_y[train_index]
            x_val = train_x.iloc[val_index]
            y_val = train_y[val_index]
            clf = linear_model.LogisticRegression(**params)
            clf.fit(x_train,y_train)
            predictions = clf.predict(x_val)
            err = np.sum((predictions != y_val),dtype=float)/len(y_val)
            error += err/k
        if silence != True:
            print p,':',error
        errors.append(error)
    best = p_range[np.argmin(errors)]
    return best,errors

In [12]:
c_range = np.logspace(-10,10,210)
best_c,c_errors = cross_validation(X_train,Y_train,c_range,parameter='C',silence=False)

1e-10 : 0.378666666667
1.24650429684e-10 : 0.379083333333
1.55377296204e-10 : 0.379
1.9367846735e-10 : 0.378666666667
2.41421041757e-10 : 0.379333333333
3.00932365898e-10 : 0.3785
3.7511348715e-10 : 0.378833333333
4.67580573536e-10 : 0.378583333333
5.82841194032e-10 : 0.379666666667
7.26514052736e-10 : 0.379666666667
9.0560288845e-10 : 0.379083333333
1.12883789168e-09 : 0.378666666667
1.40710128242e-09 : 0.379666666667
1.75395779463e-09 : 0.379416666667
2.18631592748e-09 : 0.379416666667
2.72525219786e-09 : 0.379
3.3970385746e-09 : 0.378583333333
4.23442317977e-09 : 0.3795
5.27822668823e-09 : 0.3795
6.57933224658e-09 : 0.379083333333
8.2011659157e-09 : 0.37925
1.0222788553e-08 : 0.379
1.2742749857e-08 : 0.377833333333
1.58838924504e-08 : 0.376833333333
1.97993401899e-08 : 0.374833333333
2.46799626213e-08 : 0.385583333333
3.07636794534e-08 : 0.386666666667
3.83470586252e-08 : 0.384166666667
4.77997733476e-08 : 0.38225
5.95826228657e-08 : 0.374166666667
7.42699954192e-08 : 0.374666666667

In [13]:
best_c

1.4704969052079676e-05

In [14]:
lrc = linear_model.LogisticRegression(C=best_c)
lrc.fit(X_train,Y_train)

LogisticRegression(C=1.4704969052079676e-05, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [15]:
predictions = lrc.predict(X_valid)
print error_rate(predictions,Y_valid)

0.283582089552


In [17]:
np.savetxt('Predictions/train_lrc_predictions.csv',
           np.c_[predictions],
           delimiter=',',
           fmt='%d',
           comments='')

In [18]:
test_data = pd.read_csv('data_sets/test.csv')
test = preprocess_test(test_data,train_x)
test.to_csv('data_sets/preprocessed__test_v1.csv')

In [19]:
test_predictions = lrc.predict(test)
np.savetxt('Predictions/test_lrc_predictions.csv',
           np.c_[test_predictions],
           delimiter=',',
           fmt='%d',
           comments='')