In [1]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Load data

base_dir = 'titanic/data'

# file path
train_csv = os.path.join(base_dir, 'train.csv')
test_csv = os.path.join(base_dir, 'test.csv')
gender_csv = os.path.join(base_dir, 'gender_submission.csv')

# start DataFrame
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df_label = pd.read_csv(gender_csv)

print('Num of trains', len(df_train))
print('Num of tests', len(df_test))
print('Num of gender', len(df_label))


Num of trains 891
Num of tests 418
Num of gender 418


In [4]:
df_train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [5]:
"""
PassengerId (int) : ID
Survived (bool) : 生存したかどうか
Pclass (int) : チケットクラス (1=上層クラス, 2=中級クラス, 3=仮装クラス)
Name (str) : 名前
Sex (str) : 性別 (male=男性, female=女性)
Age (flaot) : 年齢
SibSp (int) : 兄弟、配偶者の数
Parch (int) : 親、子供の数
Ticket (str) : チケット番号
Fare (float) : 料金
cabin (str) : 客室番号 
Embarked (str) : 出港地
"""

'\nPassengerId (int) : ID\nSurvived (bool) : 生存したかどうか\nPclass (int) : チケットクラス (1=上層クラス, 2=中級クラス, 3=仮装クラス)\nName (str) : 名前\nSex (str) : 性別 (male=男性, female=女性)\nAge (flaot) : 年齢\nSibSp (int) : 兄弟、配偶者の数\nParch (int) : 親、子供の数\nTicket (str) : チケット番号\nFare (float) : 料金\ncabin (str) : 客室番号 \nEmbarked (str) : 出港地\n'

In [6]:
df_train = df_train[['Survived', 'Age', 'Fare', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']]
df_test = df_test[['Age', 'Fare', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']]

In [7]:
# one_hotにできるか確認

print(df_train.Sex.value_counts())
print(df_train.Pclass.value_counts())
print(df_train.Survived.value_counts())
print(df_train.SibSp.value_counts())
print(df_train.Parch.value_counts())
print(df_train.Embarked.value_counts())


male      577
female    314
Name: Sex, dtype: int64
3    491
1    216
2    184
Name: Pclass, dtype: int64
0    549
1    342
Name: Survived, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [8]:
# one-hot
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# dropna
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [9]:
# numpy array
X = df_train.loc[:, 'Age':].values
y = df_train['Survived'].values
X_test = df_test.values
y_test = df_label['Survived'].values

In [10]:
print('Train size : {}'.format(len(X)))
print('Test size : {}'.format(len(X_test)))


Train size : 891
Test size : 418


In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)


In [12]:
# scaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)



In [13]:
X_train_scaled.shape[1]

10

In [15]:
# Neural Network

features_num = X_train_scaled.shape[1]
model = Sequential()

model.add(layers.Dense(64, activation='relu', input_shape=(features_num,), 
                       kernel_regularizer=regularizers.l2(0.7)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
early_stopping = EarlyStopping(monitor='val_loss', patience=0, mode='auto')
history = model.fit(
    X_train_scaled, y_train,
    epochs=20,
    batch_size=32,
    validation_data=[X_val, y_val],
    callbacks=[early_stopping],
    verbose=False
)

In [364]:
model.save('titanic074.h5')

In [365]:
nn_predict = model.predict(X_test_scaled)

In [353]:
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.plot(range(15), loss, 'bo', label='Training loss')
plt.plot(range(15), val_loss, 'b', label='Validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()


In [251]:
# Logistic Regression
lr = LogisticRegression(C=100)
lr.fit(X_train_scaled, y_train)
print('score: {:.2f}'.format(lr.score(X_val_scaled, y_val)))


score: 0.81


In [261]:
# svm
svm = SVC(kernel='rbf', C=10, gamma=0.003)
svm.fit(X_train_scaled, y_train)
svm.score(X_val_scaled, y_val)

0.7802690582959642

In [270]:
# 勾配ブースティング決定木
boost = GradientBoostingClassifier(random_state=0)
boost.fit(X_train, y_train)
boost.score(X_val, y_val)

0.8340807174887892

In [271]:
boost_pred = boost.predict(X_test_scaled)

In [300]:
passengerId = np.array(test_df['PassengerId']).astype(int)


In [302]:
df_result = pd.DataFrame(boost_pred, passengerId, columns=['Survived'])

In [306]:
df_result.to_csv('titanic_ans.csv', index_label=['PassengerId'])