# Import

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import category_encoders as ce

# Load

In [2]:
# Read the Kaggle train/test CSVs and stack them into a single frame so the
# imputation and encoding steps below are applied to both splits at once.
# Rows coming from the test file have no Survived label (NaN in that column).
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
df = pd.concat([df_train, df_test], ignore_index=True)

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# EDA

In [3]:
# Per-column overview (cardinality, missing-value count, dtype) shown side by
# side with the numeric summary statistics produced by describe().
column_overview = pd.concat(
    {
        "unique": df.nunique(),
        "missing": df.isnull().sum(),
        "dtype": df.dtypes,
    },
    axis=1,
)
numeric_stats = df.describe().transpose()

pd.concat([column_overview, numeric_stats], axis=1)

Unnamed: 0,unique,missing,dtype,count,mean,std,min,25%,50%,75%,max
PassengerId,1309,0,int64,1309.0,655.0,378.020061,1.0,328.0,655.0,982.0,1309.0
Survived,2,418,float64,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,3,0,int64,1309.0,2.294882,0.837836,1.0,2.0,3.0,3.0,3.0
Name,1307,0,object,,,,,,,,
Sex,2,0,object,,,,,,,,
Age,98,263,float64,1046.0,29.881138,14.413493,0.17,21.0,28.0,39.0,80.0
SibSp,7,0,int64,1309.0,0.498854,1.041658,0.0,0.0,0.0,1.0,8.0
Parch,8,0,int64,1309.0,0.385027,0.86556,0.0,0.0,0.0,0.0,9.0
Ticket,929,0,object,,,,,,,,
Fare,281,1,float64,1308.0,33.295479,51.758668,0.0,7.8958,14.4542,31.275,512.3292


## Fare

Fare が欠損している乗客は Pclass=3 かつ Embarked=S なので、  
同じ条件 (Pclass=3 & Embarked=S) の乗客における Fare の中央値 (median) で欠損値を補完する。

In [4]:
# Show the rows whose Fare is missing.
df.loc[df["Fare"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [5]:
# Impute the missing Fare with the median Fare of the passengers that share
# the same Pclass (3) and Embarked ("S") as the passenger whose Fare is missing.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Fare']: that chained in-place call operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is active.
fare_median = df.query('Pclass==3 & Embarked=="S"')['Fare'].median()
df['Fare'] = df['Fare'].fillna(fare_median)

## Age

簡単のため、Age の欠損値は全体の平均値 (mean) で補完する。

In [6]:
# Impute missing Age values with the mean of the observed ages.
# Assigned back to the column rather than using fillna(inplace=True) on the
# selected Series, which is a chained in-place update that pandas 2.x warns
# about and that no longer modifies df under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].mean())

# Preprocess

## Ordinal Encoding

In [7]:
# Encode Sex as an integer code: male -> 0, female -> 1.
sex_codes = {"male": 0, "female": 1}
df["Sex"] = df["Sex"].map(sex_codes)

In [8]:
# Preview the first five rows to check the Sex encoding.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


## One-Hot Encoding

In [9]:
# Categorical columns to expand into one-hot indicator columns.
ohe_columns = ["Pclass", "Embarked"]

In [10]:
# One-hot encoder restricted to the columns listed in ohe_columns.
# 'impute' is not among the handle_unknown options documented for
# category_encoders 2.x ('error', 'return_nan', 'value', 'indicator'), so the
# documented default 'value' is passed explicitly instead.
# NOTE(review): the displayed output has no extra "<col>_-1" column, which
# suggests a 2.x install — confirm the installed version.
ohe = ce.OneHotEncoder(cols=ohe_columns, handle_unknown='value')

In [11]:
# Fit the encoder on the combined frame and replace the configured columns
# with their indicator columns (Pclass_*, Embarked_*).
df = ohe.fit_transform(df)

df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass_1,Pclass_2,Pclass_3,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0.0,1,0,0,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,0,0,0
1,2,1.0,0,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0,1,0,0
2,3,1.0,1,0,0,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0
3,4,1.0,0,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,0,0,0
4,5,0.0,1,0,0,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,0,0,0


## Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Continuous columns to standardise.
sc_columns = ["Age", "Fare"]

In [14]:
# Standardise the columns listed in sc_columns.
# NOTE(review): df still holds the train and test rows together at this point,
# so the scaler statistics are computed over both splits.
sc = StandardScaler()
scaled_values = sc.fit_transform(df[sc_columns])

df[sc_columns] = scaled_values
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass_1,Pclass_2,Pclass_3,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0.0,1,0,0,"Braund, Mr. Owen Harris",0,-0.611972,1,0,A/5 21171,-0.503176,,1,0,0,0
1,2,1.0,0,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.630431,1,0,PC 17599,0.734809,C85,0,1,0,0
2,3,1.0,1,0,0,"Heikkinen, Miss. Laina",1,-0.301371,0,0,STON/O2. 3101282,-0.490126,,1,0,0,0
3,4,1.0,0,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.397481,1,0,113803,0.383263,C123,1,0,0,0
4,5,0.0,1,0,0,"Allen, Mr. William Henry",0,0.397481,0,0,373450,-0.487709,,1,0,0,0


## Drop

In [15]:
# Remove the text columns that are not used as model features.
unused_columns = ["Name", "Ticket", "Cabin"]
df.drop(columns=unused_columns, inplace=True)

df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass_1,Pclass_2,Pclass_3,Sex,Age,SibSp,Parch,Fare,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0.0,1,0,0,0,-0.611972,1,0,-0.503176,1,0,0,0
1,2,1.0,0,1,0,1,0.630431,1,0,0.734809,0,1,0,0
2,3,1.0,1,0,0,1,-0.301371,0,0,-0.490126,1,0,0,0
3,4,1.0,0,1,0,1,0.397481,1,0,0.383263,1,0,0,0
4,5,0.0,1,0,0,0,0.397481,0,0,-0.487709,1,0,0,0


In [16]:
# Undo the earlier concatenation: the first len(df_train) rows are the
# training rows, everything after them belongs to the test set.
n_train = len(df_train)
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]

# Features exclude the passenger identifier and the target column;
# the target is kept separately for the training rows.
non_feature_columns = ["PassengerId", "Survived"]
X_train = df_train.drop(columns=non_feature_columns)
y_train = df_train["Survived"]
X_test = df_test.drop(columns=non_feature_columns)

In [17]:
# Display the training feature matrix as a sanity check before training.
X_train

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex,Age,SibSp,Parch,Fare,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0,0,0,-0.611972,1,0,-0.503176,1,0,0,0
1,0,1,0,1,0.630431,1,0,0.734809,0,1,0,0
2,1,0,0,1,-0.301371,0,0,-0.490126,1,0,0,0
3,0,1,0,1,0.397481,1,0,0.383263,1,0,0,0
4,1,0,0,0,0.397481,0,0,-0.487709,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,1,0,-0.223721,0,0,-0.392009,1,0,0,0
887,0,1,0,1,-0.844922,0,0,-0.063340,1,0,0,0
888,1,0,0,1,0.000000,1,2,-0.189974,1,0,0,0
889,0,1,0,0,-0.301371,0,0,-0.063340,0,1,0,0


# Training

In [19]:
from model_nn import ModelNN
from keras.callbacks import EarlyStopping
from runner import Runner

In [20]:
# Build the project-local Runner around the ModelNN class.
# NOTE(review): "hoge" looks like a placeholder run name and cv=False
# presumably disables cross-validation — confirm both against runner.Runner.
runner = Runner("hoge", ModelNN, cv=False)

In [21]:
# Model hyperparameters passed to runner.train.
# NOTE(review): presumably consumed by ModelNN — confirm the accepted keys there.
params = dict(layers=4, dropout=0.2, units=8)

# Fit-time settings passed to runner.train. The epoch count acts as an upper
# bound: the EarlyStopping callback monitors val_loss with a patience of 30.
early_stopping = EarlyStopping(monitor="val_loss", min_delta=0, patience=30, verbose=1)
train_params = dict(
    epochs=5000,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

In [22]:
# Train on the prepared features/labels with the model and fit settings
# defined in the previous cell.
runner.train(X_train, y_train, params, train_params)

Metal device set to: Apple M1 Max
Epoch 1/5000


2022-01-01 10:42:11.926186: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-01 10:42:11.926499: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-01-01 10:42:12.025820: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-01-01 10:42:12.341380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5000
 5/23 [=====>........................] - ETA: 0s - loss: 0.8103 - acc: 0.5750

2022-01-01 10:42:13.040326: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
Epoch 73/5000
Epoch 74/5000

In [23]:
# Persist the trained model through the runner.
# NOTE(review): the storage location is decided inside Runner — not visible here.
runner.save_model()

2022-01-01 10:43:16.374203: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ram://a7b19af5-dffe-41fe-ba26-aa14d7df196a/assets


In [24]:
# Load the model back through the runner.
# NOTE(review): presumably reloads what save_model() just wrote — confirm in Runner.
runner.load_model()

In [25]:
# Display the score reported by the runner.
# NOTE(review): the metric is defined inside Runner and cannot be determined
# from this notebook — confirm whether it is a loss or an accuracy.
runner.get_score()

0.37050724029541016

# Inference

In [26]:
# Get the model output for the test features and binarise it:
# values above 0.5 become 1, everything else becomes 0.
raw_scores = runner.predict(X_test)

pred = np.where(raw_scores > 0.5, 1, 0)
pred

2022-01-01 10:43:20.395188: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [27]:
# Assemble the submission table: one row per test passenger with the
# predicted Survived label next to the PassengerId.
submission = pd.DataFrame(
    dict(PassengerId=df_test["PassengerId"], Survived=pred)
)
submission

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,0
893,894,0
894,895,0
895,896,0
...,...,...
1304,1305,0
1305,1306,1
1306,1307,0
1307,1308,0


In [28]:
# Write the submission CSV without the index column.
# The target directory is created first so that a fresh checkout, where
# ../submission does not exist yet, does not fail on the write. The needless
# f-string prefix (the path has no placeholders) is dropped as well.
submission_path = Path("../submission/submission_tensorflow.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)