In [None]:
import numpy as np
import pandas as pd

In [None]:
"""
Formatting data

"""
# Load the raw train and test tables from the input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

def fill_na(df):
    """Fill missing values in the Age, Fare, Sex and Embarked columns.

    Age and Fare gaps are replaced by the median of their own column,
    Sex gaps by an empty string and Embarked gaps by "S".

    The frame is modified in place and also returned.
    """
    replacements = (
        ("Age", df["Age"].median()),
        ("Fare", df["Fare"].median()),
        ("Sex", ""),
        ("Embarked", "S"),
    )
    for name, filler in replacements:
        df[name] = df[name].fillna(filler)
    return df

def numerical_form(df):
    """Replace the text labels of Sex and Embarked with integer codes.

    Sex: "male" -> 0, "female" -> 1.
    Embarked: "S" -> 1, "C" -> 2, "Q" -> 3.

    Values that match none of the labels are left untouched. The frame is
    modified in place and also returned.
    """
    encodings = (
        ("Sex", (("male", 0), ("female", 1))),
        ("Embarked", (("S", 1), ("C", 2), ("Q", 3))),
    )
    for name, pairs in encodings:
        for label, code in pairs:
            # Row-wise assignment through .loc, one label at a time.
            df.loc[df[name] == label, name] = code
    return df

def onehot(df, columns):
    """Replace each listed column with its one-hot indicator columns.

    For every name in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` (prefixed with the column name) are appended on the
    right and the source column is removed. A new frame is returned; the
    frame passed in is not modified.
    """
    encoded = df
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        remainder = encoded.drop(name, axis=1)
        encoded = pd.concat([remainder, indicators], axis=1)
    return encoded

def classify_honorifics(df):
    """Add an integer "Honorific" column derived from the title in "Name".

    The title is the first word in the name that is immediately followed by
    a literal period (e.g. "Mr" in "Braund, Mr. Owen Harris"). Titles are
    grouped into codes:

        1: Mr            2: Mrs, Mme       3: Miss, Ms, Mlle
        4: Master        5: Don, Dr, Major, Capt, Col, Countess
        0: any other title, or no title found

    The frame is modified in place and also returned.
    """
    # The period must be escaped: an unescaped "." matches any character, so
    # the pattern used to capture the first word after a space (e.g. "Impe"
    # from "Van Impe, Miss. ..." or "the" from "..., the Countess. ...")
    # instead of the actual title.
    df["Honorific"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    mask = {
        "Mr": 1,
        "Mrs": 2, "Mme": 2,
        "Miss": 3, "Ms": 3, "Mlle": 3,
        "Master": 4,
        "Don": 5, "Dr": 5, "Major": 5, "Capt": 5, "Col": 5,
        # Previously reached through the mis-captured word "the".
        "Countess": 5,
    }
    # Unmapped or missing titles become NaN after map(); code them as 0.
    df["Honorific"] = df["Honorific"].map(mask).fillna(0).astype(int)
    return df

def normalize(df, columns):
    """Min-max scale the listed columns to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. A column whose values are all equal
    (zero range) becomes all zeros instead of dividing by zero.

    The frame is modified in place and also returned.
    """
    for column in columns:
        low = df[column].min()
        span = df[column].max() - low
        # Subtract the minimum first; dividing by the range alone leaves
        # the result offset by min / range rather than starting at 0.
        shifted = df[column] - low
        if span != 0:
            df[column] = shifted / span
        else:
            # Constant column: every shifted value is already 0.
            df[column] = shifted.astype(float)
    return df

def formatting(df):
    """Run the full preprocessing pipeline on a raw passenger frame.

    Steps, in order: fill missing values, encode Sex/Embarked as integers,
    derive the Honorific code from Name, one-hot encode Pclass, Embarked and
    Honorific, scale Age, SibSp and Parch, and finally drop the Name,
    Ticket, Fare and Cabin columns.

    Returns the processed frame. The first three steps write into the frame
    that was passed in.
    """
    categorical = ["Pclass", "Embarked", "Honorific"]
    scaled = ["Age", "SibSp", "Parch"]
    unused = ["Name", "Ticket", "Fare", "Cabin"]

    prepared = classify_honorifics(numerical_form(fill_na(df)))
    prepared = normalize(onehot(prepared, categorical), scaled)
    return prepared.drop(unused, axis=1)

# Preprocess both tables with the same pipeline (train first, then test).
df_train, df_test = formatting(df_train), formatting(df_test)



In [None]:
"""
16 → 8 → 4 → 2
"""

import sys

from chainer import Chain, optimizers, Variable
import chainer.functions as F
import chainer.links as L
from chainer import training

class TitanicChain(Chain):
    """Three-layer fully connected network: 16 -> 8 -> 4 -> 2 units."""

    def __init__(self):
        # Register the three linear layers as child links l1, l2 and l3.
        super(TitanicChain, self).__init__(
            l1=L.Linear(16, 8),
            l2=L.Linear(8, 4),
            l3=L.Linear(4, 2),
        )

    def forward(self, x):
        """Return the output of the last linear layer for input ``x``.

        ReLU is applied after the first two layers; no activation is
        applied to the final layer's output.
        """
        hidden = F.relu(self.l1(x))
        hidden = F.relu(self.l2(hidden))
        return self.l3(hidden)

# Build the network and attach an Adam optimizer to its parameters.
model = TitanicChain()
optimizer = optimizers.Adam()
optimizer.setup(model)

# Feature matrix: every preprocessed column except the id and the label.
x_train = np.array(df_train.drop(["PassengerId", "Survived"], axis=1), dtype=np.float32)
# Class labels (the Survived column) as int32, the dtype used for the loss targets.
t_train = np.array(df_train["Survived"], dtype=np.int32)

# Disabled hold-out experiment. Everything below is wrapped in a string
# literal, so none of it is executed. As written it would split off rows
# 800-889 as a validation set, train on mini-batches drawn from the first
# 750 rows, and print the validation accuracy.
# NOTE(review): it reads n_epoch and batch_size, which are only assigned
# further down in this file, so it cannot be re-enabled in place as is.
"""
x_test = x_train[800:890]
t_test = t_train[800:890]
x_train = x_train[:800]
t_train = t_train[:800]

for epoch in range(n_epoch):
    sum_loss = 0
    perm = np.random.permutation(750)
    for i in range(0, 750, batch_size):
        x = Variable(x_train[perm[i:i+batch_size]])
        t = Variable(t_train[perm[i:i+batch_size]])
        y = model.forward(x)
        model.cleargrads()
        loss = F.softmax_cross_entropy(y, t)
        loss.backward()
        optimizer.update()
        sum_loss += loss.data*batch_size

    sys.stdout.write("\repoch: {}, mean loss: {}".format(epoch, sum_loss/750))
    sys.stdout.flush()

# テスト実行
cnt = 0
for i in range(90):
    x = Variable(np.array([x_test[i]], dtype=np.float32))
    t = t_test[i]
    y = model.forward(x)
    y = np.argmax(y.data[0])
    if t == y:
        cnt += 1

# 正解数と正解率を表示
print(cnt)
print("accuracy: {}".format(cnt/(90)))

"""

In [None]:
"""
本番
"""

# Final run: train a fresh network on the whole training set, then write
# predictions for the test set to a submission CSV.
model = TitanicChain()
optimizer = optimizers.Adam()
optimizer.setup(model)

n_epoch = 2000
batch_size = 50

x_train = np.array(df_train.drop(["PassengerId", "Survived"], axis=1), dtype=np.float32)
t_train = np.array(df_train["Survived"], dtype=np.int32)
# Use the real row count; a hard-coded size silently leaves rows out of
# every epoch.
n_train = len(x_train)

for epoch in range(n_epoch):
    sum_loss = 0.0
    # Visit every training row once per epoch, in a fresh random order.
    perm = np.random.permutation(n_train)
    for i in range(0, n_train, batch_size):
        batch = perm[i:i + batch_size]
        x = Variable(x_train[batch])
        t = Variable(t_train[batch])
        y = model.forward(x)
        model.cleargrads()
        loss = F.softmax_cross_entropy(y, t)
        loss.backward()
        optimizer.update()
        # Weight by the actual batch length: the last batch may be shorter.
        sum_loss += float(loss.data) * len(batch)

    # "\r" rewrites the same console line on every epoch.
    sys.stdout.write("\repoch: {}, mean loss: {}".format(epoch, sum_loss / n_train))
    sys.stdout.flush()


x_test = np.array(df_test.drop(["PassengerId"], axis=1), dtype=np.float32)

# Score all test rows in one forward pass and take the higher-scoring class.
scores = model.forward(Variable(x_test))
survived = np.argmax(scores.data, axis=1)

PassengerId = np.array(df_test["PassengerId"]).astype(int)
my_solution = pd.DataFrame({"Survived": survived}, index=PassengerId)

my_solution.to_csv("dnn_with_chainer.csv", index_label = ["PassengerId"])