In [227]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [228]:
train = pd.read_csv('../input/train.csv', index_col='PassengerId')
test = pd.read_csv('../input/test.csv', index_col='PassengerId')

In [229]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [230]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [231]:
dataset = train.append(test)

In [232]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [233]:
sex_map = {'female': 1, 'male': 0}
embark_map = {None: 0, 'S': 1, 'C': 2, 'Q': 3}
title_map = {
    'Col': 0,
    'Jonkheer': 0,
    'Capt': 0,
    'Don': 0,
    'Lady': 0,
    'Countess': 0,
    'Mme': 0,
    'Mr': 1,
    'Sir': 1,
    'Miss': 2,
    'Ms': 2,
    'Mrs': 3,
    'Master': 4,
}

for t in [train, test]:
    t['Age'] = t['Age'].fillna(dataset['Age'].median())
    t['Sex'] = t['Sex'].map(sex_map)
    t['Embarked'] = t['Embarked'].map(embark_map)
    t['title'] = t['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    t['title'] = t['title'].map(title_map).fillna(0)
    t['Fare'] = t['Fare'].fillna(dataset['Fare'].median())

In [234]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,1.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2,3.0
3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,2.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,3.0
5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,1.0


In [235]:
s = StandardScaler()
for c in ('Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'title', 'Embarked'):
    train[c] = s.fit_transform(train[c].values.reshape(-1, 1))
    test[c] = s.fit_transform(test[c].values.reshape(-1, 1))



In [236]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0.827377,"Braund, Mr. Owen Harris",0,-0.565736,0.432793,-0.473674,A/5 21171,-0.502445,,-0.562619,-0.643709
2,1,-1.566107,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.663861,0.432793,-0.473674,PC 17599,0.786845,C85,1.003923,1.528809
3,1,0.827377,"Heikkinen, Miss. Laina",1,-0.258337,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,,-0.562619,0.44255
4,1,-1.566107,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.433312,0.432793,-0.473674,113803,0.42073,C123,-0.562619,1.528809
5,0,0.827377,"Allen, Mr. William Henry",0,0.433312,-0.474545,-0.473674,373450,-0.486337,,-0.562619,-0.643709


In [237]:
print(train.isnull().sum())
print(test.isnull().sum())

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
title         0
dtype: int64
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       327
Embarked      0
title         0
dtype: int64


In [239]:
X_train = train.drop(['Name', 'Survived', 'Ticket', 'Cabin'], axis=1)
y_train = train['Survived']

X_test = test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [240]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.827377,0,-0.565736,0.432793,-0.473674,-0.502445,-0.562619,-0.643709
2,-1.566107,1,0.663861,0.432793,-0.473674,0.786845,1.003923,1.528809
3,0.827377,1,-0.258337,-0.474545,-0.473674,-0.488854,-0.562619,0.44255
4,-1.566107,1,0.433312,0.432793,-0.473674,0.42073,-0.562619,1.528809
5,0.827377,0,0.433312,-0.474545,-0.473674,-0.486337,-0.562619,-0.643709


In [241]:
X_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0.873482,0,0.371062,-0.49947,-0.400248,-0.497413,2.243165,-0.707748
893,0.873482,1,1.358985,0.616992,-0.400248,-0.512278,-0.677841,1.405385
894,-0.315819,0,2.544493,-0.49947,-0.400248,-0.4641,2.243165,-0.707748
895,0.873482,0,-0.221692,-0.49947,-0.400248,-0.482475,-0.677841,-0.707748
896,0.873482,1,-0.616861,0.616992,0.619896,-0.417492,-0.677841,1.405385


In [253]:
from sklearn.linear_model import LogisticRegressionCV
m = LogisticRegressionCV()
m.fit(X_train, y_train)
print([s.mean() for s in m.scores_[1]])
y_test = m.predict(X_test)

submission = X_test.copy()
submission['Survived'] = y_test
submission.head()

submission.to_csv('submission.csv', columns=['Survived'])

[0.76228956228956224, 0.76868686868686864, 0.78552188552188551]


In [254]:
from sklearn.svm import SVC
m = SVC()
m.fit(X_train, y_train)
print(m.score(X_train, y_train))
y_test = m.predict(X_test)

submission = X_test.copy()
submission['Survived'] = y_test
submission.head()

submission.to_csv('submission.csv', columns=['Survived'])

0.843995510662
