In [1]:
import pandas as pd
import numpy as np

# Load the Kaggle Titanic train/test CSVs and tag every row with its origin
# (is_test) so the combined frame can be split back apart later.
raw_train = pd.read_csv("~/.kaggle/competitions/titanic/train.csv").assign(is_test=False)
raw_test = pd.read_csv("~/.kaggle/competitions/titanic/test.csv").assign(is_test=True)

# Stack both splits so that preprocessing is applied to them identically.
all_data = pd.concat([raw_train, raw_test])

In [2]:
# Preview the first five rows of the combined train + test frame.
all_data.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,is_test
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,False
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,False
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,False
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,False
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,False


In [3]:
# Refer to: [Titanic with Keras | Kaggle](https://www.kaggle.com/cstahl12/titanic-with-keras)

def get_title_last_name(name):
    """Extract the honorific title from a Series of passenger names.

    Each name is split on ", " and the second piece is then split on ".";
    the text before the first "." is returned
    (e.g. "Braund, Mr. Owen Harris" -> "Mr").

    Parameters
    ----------
    name : pandas.Series of str
        Names formatted like "Last, Title. First ...".

    Returns
    -------
    pandas.Series
        One title per input row, on the same index as ``name``.
    """
    # Piece 0 would be the last name; it appears to be unused, so only the
    # piece after the comma is kept.
    after_comma = name.str.split(', ', expand=True)[1]
    title_pieces = after_comma.str.split('.', expand=True)
    return title_pieces[0]

def get_titles_from_names(df):
    """Return a new frame with a ``Title`` column in place of ``Name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Name`` column in the format expected by
        ``get_title_last_name``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Title`` appended and ``Name`` removed. The input
        frame is left untouched.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'Title' column as a hidden side effect.
    with_title = df.assign(Title=get_title_last_name(df['Name']))
    return with_title.drop(['Name'], axis=1)

def get_cabin_letter(df):
    """Fill missing cabins with "Z" and add the cabin's first character.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string ``Cabin`` column (missing values allowed).

    Returns
    -------
    pandas.DataFrame
        A new frame where ``Cabin`` has its missing values replaced by "Z"
        and ``Cabin_letter`` holds the first character of ``Cabin``. The
        input frame is not modified.
    """
    # Rows with a blank Cabin are filled with "Z". The column is reassigned
    # instead of using the chained `df['Cabin'].fillna(..., inplace=True)`,
    # which mutates the caller's frame and is a silent no-op under pandas
    # Copy-on-Write.
    cabin = df['Cabin'].fillna('Z')
    # Open question from the original author: is it valid to bucket e.g. "C91"
    # under "C"? Presumably the leading letter identifies the deck -- TODO confirm.
    return df.assign(Cabin=cabin, Cabin_letter=cabin.str[0])

def get_dummy_cats(df):
    """One-hot encode the categorical feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Title``, ``Pclass``, ``Sex``, ``Embarked``, ``Cabin``
        and ``Cabin_letter``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with each listed column replaced by ``<column>_<value>``
        indicator columns of dtype ``uint8``.
    """
    categorical_columns = ['Title', 'Pclass', 'Sex', 'Embarked',
                           'Cabin', 'Cabin_letter']
    # Pin the indicator dtype: newer pandas defaults to bool, and a frame that
    # mixes bool with float columns yields an object array from `.values`,
    # which cannot be fed to the network as numeric input.
    return pd.get_dummies(df, columns=categorical_columns, dtype=np.uint8)

def process_data(df):
    """Run the full feature-engineering pipeline on the raw passenger frame.

    Steps, in order: replace ``Name`` with ``Title``, fill missing
    ``Embarked`` with "S", fill ``Cabin`` and derive ``Cabin_letter``, drop
    ``Ticket`` and ``Fare``, then one-hot encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Titanic rows containing at least ``Name``, ``Embarked``,
        ``Cabin``, ``Ticket``, ``Fare``, ``Pclass`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        The processed feature frame.
    """
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)
    # A close look at the data shows some rows with a blank Embarked; fill
    # them with 'S'. The column is reassigned rather than using the chained
    # `df['Embarked'].fillna(..., inplace=True)`, which is a silent no-op
    # under pandas Copy-on-Write.
    df = df.assign(Embarked=df['Embarked'].fillna('S'))
    df = get_cabin_letter(df)

    # drop remaining features
    # NOTE(review): the original author asked why Ticket and Fare are dropped;
    # the reason is not documented here.
    df = df.drop(['Ticket', 'Fare'], axis=1)

    # create dummies for categorical features
    df = get_dummy_cats(df)

    return df

# Apply the preprocessing pipeline to the combined train + test rows.
proc_data = process_data(df=all_data)

Build a network to predict missing ages

In [4]:
# Training set for the age model: remove the label/bookkeeping columns first,
# then keep only the rows that have no missing values left.
age_candidates = proc_data.drop(['Survived', 'is_test'], axis=1)
for_age_train = age_candidates.dropna(axis=0)

# Age is the regression target; every other remaining column is a feature.
y_train_age = for_age_train['Age']
X_train_age = for_age_train.drop(columns=['Age'])

Create a model to predict missing ages

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Age regressor: a 128-unit ReLU input layer, eight 64-unit ReLU blocks with
# 25% dropout, and a single linear output unit.
age_layers = [
    Dense(input_dim=X_train_age.shape[1], units=128,
          kernel_initializer='normal', bias_initializer='zeros'),
    Activation('relu'),
]
for i in range(8):
    age_layers.extend([
        Dense(units=64, kernel_initializer='normal',
              bias_initializer='zeros'),
        Activation('relu'),
        Dropout(.25),
    ])
age_layers.extend([Dense(units=1), Activation('linear')])

# Layers are added in the order they were created.
tmodel = Sequential()
for layer in age_layers:
    tmodel.add(layer)

# Mean squared error, since the target (Age) is continuous.
tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

Using TensorFlow backend.


Fit the model

In [6]:
# Train the age regressor for 100 epochs, logging one line per epoch.
tmodel.fit(
    x=X_train_age.values,
    y=y_train_age.values,
    epochs=100,
    verbose=2,
)

Epoch 1/100
 - 1s - loss: 589.4844
Epoch 2/100
 - 0s - loss: 512.4903
Epoch 3/100
 - 0s - loss: 494.7624
Epoch 4/100
 - 0s - loss: 461.7037
Epoch 5/100
 - 0s - loss: 446.7656
Epoch 6/100
 - 0s - loss: 398.8651
Epoch 7/100
 - 0s - loss: 379.2041
Epoch 8/100
 - 0s - loss: 329.6203
Epoch 9/100
 - 0s - loss: 298.8570
Epoch 10/100
 - 0s - loss: 246.3977
Epoch 11/100
 - 0s - loss: 255.2395
Epoch 12/100
 - 0s - loss: 229.6321
Epoch 13/100
 - 0s - loss: 241.7561
Epoch 14/100
 - 0s - loss: 231.2556
Epoch 15/100
 - 0s - loss: 254.5543
Epoch 16/100
 - 0s - loss: 232.6130
Epoch 17/100
 - 0s - loss: 233.5638
Epoch 18/100
 - 0s - loss: 228.8490
Epoch 19/100
 - 0s - loss: 226.2028
Epoch 20/100
 - 0s - loss: 226.6595
Epoch 21/100
 - 0s - loss: 221.1596
Epoch 22/100
 - 0s - loss: 233.2341
Epoch 23/100
 - 0s - loss: 237.2354
Epoch 24/100
 - 0s - loss: 214.3832
Epoch 25/100
 - 0s - loss: 214.4217
Epoch 26/100
 - 0s - loss: 229.7090
Epoch 27/100
 - 0s - loss: 211.9812
Epoch 28/100
 - 0s - loss: 220.5673
E

<keras.callbacks.History at 0x10aa052b0>

In [48]:
# Split the processed frame back into its original train and test parts using
# the boolean is_test flag that was attached when the CSVs were loaded.
is_test_row = proc_data["is_test"]
proc_train = proc_data.loc[~is_test_row].copy()
proc_test = proc_data.loc[is_test_row].copy()

In [49]:
# Impute missing ages in the training rows with the age model's predictions.
train_age_missing = proc_train['Age'].isnull()

# Feature columns must match the ones the age model was trained on.
to_pred = proc_train.loc[train_age_missing].drop(
    ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)

proc_train.loc[train_age_missing, "Age"] = p

In [50]:
# Impute missing ages in the test rows with the age model's predictions.
test_age_missing = proc_test['Age'].isnull()

# Feature columns must match the ones the age model was trained on.
to_pred = proc_test.loc[test_age_missing].drop(
    ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)

proc_test.loc[test_age_missing, "Age"] = p

Build a network to predict "Survived"

In [51]:
# Features: every processed column except the label and the split flag.
non_feature_columns = ["Survived", "is_test"]
X = proc_train.drop(non_feature_columns, axis=1)

# The original author asked why the label is one-hot encoded: the survival
# network ends in a 2-unit softmax trained with categorical_crossentropy,
# which takes one target column per class.
survived_labels = proc_train["Survived"]
y = pd.get_dummies(survived_labels)

In [56]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Survival classifier: a 128-unit ReLU input layer, fifteen 128-unit ReLU
# blocks with 40% dropout, and a 2-unit softmax output (one unit per class).
survival_layers = [
    Dense(input_dim=X.shape[1], units=128,
          kernel_initializer='normal', bias_initializer='zeros'),
    Activation('relu'),
]
for i in range(15):
    survival_layers.extend([
        Dense(units=128, kernel_initializer='normal',
              bias_initializer='zeros'),
        Activation('relu'),
        Dropout(.40),
    ])
survival_layers.extend([Dense(units=2), Activation('softmax')])

# Layers are added in the order they were created.
model = Sequential()
for layer in survival_layers:
    model.add(layer)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the survival classifier, holding out 10% of the rows for validation.
model.fit(
    x=X.values,
    y=y.values,
    batch_size=128,
    epochs=200,
    validation_split=0.1,
)

Train on 801 samples, validate on 90 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Ep

<keras.callbacks.History at 0x11e190320>

In [61]:
# Test features: the same columns the survival model was trained on.
X_test = proc_test.drop(["Survived", "is_test"], axis=1)

# Sequential.predict_classes() no longer exists in newer Keras releases.
# Taking the argmax over the two softmax outputs yields the same class index
# (0 or 1) and works on both old and new versions.
class_probabilities = model.predict(X_test.values, batch_size=128)
p_survived = np.argmax(class_probabilities, axis=-1)

p_survived

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [71]:
# Assemble the Kaggle submission: one predicted label per test PassengerId.
# The frame takes its index from the PassengerId Series; the prediction array
# is matched to it by position.
submission = pd.DataFrame({
    "PassengerId": X_test["PassengerId"],
    "Survived": p_survived,
})

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [73]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="./submission.csv", index=False)