# Import

In [1]:
from utils import init_logger, timer, fix_seed, df_info, reduce_memory_usage
import random
import pandas as pd
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)
import numpy as np
import category_encoders as ce
import matplotlib.pyplot as plt
import datetime as dt
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas(desc="Processing:")

# Seed the random number generators through the project helper (called with its
# defaults) — presumably for reproducibility; which libraries it seeds is not
# visible from this notebook.
fix_seed()
# Shared logger: handed to `timer(...)` and used via `logger.info` in later cells.
logger = init_logger()

In [2]:
# Run identifier: passed to Runner as its first argument and embedded in the
# submission CSV file name.
PROJECT_NAME = "v1"

# Load

In [3]:
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

def load_data(train_path="../input/train.csv", test_path="../input/test.csv"):
    """Read the train/test CSVs and return both preprocessed the same way.

    The two files are concatenated so that imputation, encoding and scaling
    are fitted once over every row, then split back at the original train
    length.

    Steps, in order:
      1. Missing ``Fare`` -> median ``Fare`` of rows with ``Pclass == 3`` and
         ``Embarked == "S"``; missing ``Age`` -> mean ``Age``.
      2. ``Sex`` mapped to 0 (``"male"``) / 1 (``"female"``).
      3. ``Pclass`` and ``Embarked`` one-hot encoded.
      4. ``Age`` and ``Fare`` standardised with ``StandardScaler``.
      5. ``Name``, ``Ticket`` and ``Cabin`` dropped.

    NOTE(review): the fill values and the scaler statistics are computed on
    train and test rows together, so test data influences the training
    features. Confirm this is intended.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        tuple: ``(df_train, df_test)`` as independent copies. Row labels of
        the concatenated frame are kept, so the test frame's index starts at
        ``len(train)``.
    """
    train_raw = pd.read_csv(train_path)
    test_raw = pd.read_csv(test_path)
    n_train = len(train_raw)
    df = pd.concat([train_raw, test_raw]).reset_index(drop=True)

    # missing value
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: the in-place call on a selected
    # column is chained assignment, which warns on pandas 2.x and leaves `df`
    # unchanged once Copy-on-Write is enabled.
    fare_fill = df.query('Pclass==3 & Embarked=="S"')["Fare"].median()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # ordinal encoding
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

    # one hot encoding
    # NOTE(review): newer category_encoders releases appear to document only
    # 'error', 'return_nan', 'value' and 'indicator' for handle_unknown —
    # confirm 'impute' is accepted by the installed version.
    ohe_columns = ["Pclass", "Embarked"]
    ohe = ce.OneHotEncoder(cols=ohe_columns, handle_unknown='impute')
    df = ohe.fit_transform(df)

    # scaling
    sc_columns = ["Age", "Fare"]
    df[sc_columns] = StandardScaler().fit_transform(df[sc_columns])

    # extract: drop the free-text columns that are not used as features
    df = df.drop(columns=["Name", "Ticket", "Cabin"])

    # Return copies, not slices of one shared frame, so callers can modify
    # either split without SettingWithCopy warnings or cross-talk.
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()

In [4]:
# Load and preprocess both splits; `timer` logs the start and the elapsed time
# under the "read csv" label.
with timer("read csv", logger):
    df_train, df_test = load_data()

# Pass each split through the project's memory-reduction helper, train first.
df_train, df_test = reduce_memory_usage(df_train), reduce_memory_usage(df_test)

2022/01/02 15:37:33 45 [INFO] [read csv] start.
2022/01/02 15:37:33 47 [INFO] [read csv] done in 0.035 seconds.


In [5]:
# Model inputs are every column except the row id and the target.
X_train = df_train.drop(columns=["PassengerId", "Survived"])
# Target labels for the training rows.
y_train = df_train["Survived"]
# The test frame carries the same two columns after the concat in load_data,
# so they are removed here as well.
X_test = df_test.drop(columns=["PassengerId", "Survived"])

In [6]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex,Age,SibSp,Parch,Fare,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0,0,0,-0.611816,1,0,-0.503418,1,0,0,0
1,0,1,0,1,0.630371,1,0,0.734863,0,1,0,0
2,1,0,0,1,-0.30127,0,0,-0.490234,1,0,0,0
3,0,1,0,1,0.397461,1,0,0.383301,1,0,0,0
4,1,0,0,0,0.397461,0,0,-0.487793,1,0,0,0


# Training

In [7]:
from model_nn import ModelNN
from runner import Runner
from keras.callbacks import EarlyStopping
from hyperopt import hp, fmin, tpe, space_eval

In [9]:
# Runner instance that the hyperparameter-search `objective` trains and scores.
# NOTE(review): cv=False presumably turns off cross-validation — confirm in runner.py.
runner = Runner(PROJECT_NAME, ModelNN, cv=False)

In [11]:
# Fixed model hyperparameters for the single training run in the Inference
# section (passed to runner.train as its third argument).
# NOTE(review): "units_list" presumably gives the hidden-layer widths and
# "num_classes" the output size — confirm against ModelNN.
params = {
    "units_list": [12, 8, 4],
    "dropout": 0.2,
    "num_classes": 2
}

# Hyperopt search space. It was commented out, but the search cell passes
# `space` to both `fmin` and `space_eval`, so a fresh Restart & Run All raised
# NameError there; the definition is restored unchanged.
# NOTE(review): these keys ("layers", "units") differ from the keys in `params`
# ("units_list", "num_classes") — confirm ModelNN still accepts this shape, or
# update the space to match before running the search.
space = {
    "layers": hp.uniformint("layer", 3, 7),
    "dropout": hp.uniform("dropout", 0.1, 0.4),
    "units": hp.choice("units", [4, 8, 12, 16])
}

# Fit settings passed to runner.train as its fourth argument. `epochs` is only
# an upper bound: EarlyStopping ends training once val_loss has gone 30 epochs
# without improving.
train_params = {
    "epochs": 5000,
    "batch_size": 32,
    "verbose": 1,
    "callbacks": [EarlyStopping(monitor="val_loss", min_delta=0, patience=30, verbose=1)]
}

In [16]:
def objective(args):
    """Hyperopt objective: train with one sampled configuration and score it.

    Args:
        args: Hyperparameter sample supplied by the optimiser. It is logged
            and then forwarded to ``runner.train`` as the model parameters.

    Returns:
        The value reported by ``runner.get_score()`` after training; this is
        the quantity ``fmin`` minimises.
    """
    # Record the sampled configuration so each trial can be traced in the log.
    logger.info(args)
    runner.train(X_train, y_train, args, train_params)
    score = runner.get_score()
    return score

In [17]:
# Run a TPE search that calls `objective` twice (max_evals=2), then translate the
# raw result from fmin back into concrete values with space_eval and log them.
# `space` must already be defined by an earlier cell when this runs.
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=2)
best_params = space_eval(space, best)
logger.info("best params: {}".format(best_params))

  0%|                                     | 0/2 [00:00<?, ?trial/s, best loss=?]

2022/01/01 11:37:07 2 [INFO] {'dropout': 0.35533811619227473, 'layers': 6, 'units': 12}


Metal device set to: Apple M1 Max


2022-01-01 11:37:07.285715: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-01 11:37:07.285887: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-01-01 11:37:07.406809: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-01-01 11:37:07.787040: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:37:09.189156: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00221: early stopping                                                     
 50%|█████     | 1/2 [01:28<01:28, 88.49s/trial, best loss: 0.37501025199890137]

2022/01/01 11:38:35 2 [INFO] {'dropout': 0.14697523300939222, 'layers': 5, 'units': 4}
2022-01-01 11:38:36.206935: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:38:37.315807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00220: early stopping                                                     
100%|██████████| 2/2 [02:43<00:00, 81.89s/trial, best loss: 0.37501025199890137]

2022/01/01 11:39:51 3 [INFO] best params: {'dropout': 0.35533811619227473, 'layers': 6, 'units': 12}





# Inference

In [12]:
# Rebind `runner` to a new Runner instance (replacing the one created in the
# Training section) and fit it with the fixed `params`; `timer` logs the
# duration under the "train" label.
runner = Runner(PROJECT_NAME, ModelNN, cv=False)
with timer("train", logger):
    runner.train(X_train, y_train, params, train_params)

2022/01/02 15:38:53 45 [INFO] [train] start.


Epoch 1/5000


2022-01-02 15:38:53.896372: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5000
 5/23 [=====>........................] - ETA: 0s - loss: 0.7460 - accuracy: 0.5312

2022-01-02 15:38:54.582021: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
Epoch 73/5000
Epoch 74/5000

2022/01/02 15:40:39 47 [INFO] [train] done in 106.124 seconds.


Epoch 00259: early stopping


In [13]:
# Score of the model just trained.
# NOTE(review): the metric is not visible here; the search cell minimises this
# same value, so it is presumably a loss — confirm in runner.py.
runner.get_score()

0.3846169710159302

In [14]:
# Predict on the test features, timed under the "prediction" label. The result
# is displayed in full below; each row holds two values.
with timer("prediction", logger):
    prob = runner.predict(X_test)
prob

2022/01/02 15:40:53 45 [INFO] [prediction] start.
2022-01-02 15:40:53.120827: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022/01/02 15:40:53 47 [INFO] [prediction] done in 0.199 seconds.


array([[0.90039206, 0.09960796],
       [0.7186646 , 0.2813354 ],
       [0.86595374, 0.13404633],
       [0.8780423 , 0.12195769],
       [0.61403733, 0.38596264],
       [0.84271026, 0.15728982],
       [0.26888797, 0.731112  ],
       [0.8151269 , 0.18487306],
       [0.39134094, 0.6086591 ],
       [0.87308484, 0.1269152 ],
       [0.8822608 , 0.11773922],
       [0.784472  , 0.21552801],
       [0.04182854, 0.9581714 ],
       [0.8900869 , 0.10991312],
       [0.07577251, 0.9242274 ],
       [0.04961208, 0.95038784],
       [0.79316103, 0.20683901],
       [0.7740442 , 0.22595578],
       [0.617314  , 0.38268602],
       [0.5332779 , 0.4667221 ],
       [0.63727164, 0.3627284 ],
       [0.7957919 , 0.20420808],
       [0.04263877, 0.9573612 ],
       [0.34846494, 0.6515351 ],
       [0.13683441, 0.86316556],
       [0.8855458 , 0.11445421],
       [0.03464452, 0.9653555 ],
       [0.7838079 , 0.21619214],
       [0.7696658 , 0.23033425],
       [0.80208296, 0.197917  ],
       [0.

In [16]:
# Per row, take the index of the larger of the two values in `prob`; that index
# becomes the `Survived` label in the submission.
pred = prob.argmax(axis=1)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [17]:
# Build the submission table: one row per test passenger with its predicted label.
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": pred
})
# Write without the DataFrame index. The file name carries PROJECT_NAME, and the
# `kaggle competitions submit` cell interpolates the same path.
# (A stray bare `submission` expression that sat before this line was removed:
# only a cell's last expression is displayed, so it had no effect.)
submission.to_csv(f"../submission/submission_{PROJECT_NAME}.csv", index=False)
submission

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,0
893,894,0
894,895,0
895,896,0
...,...,...
1304,1305,0
1305,1306,1
1306,1307,0
1307,1308,0


In [18]:
# Upload the CSV written above to the "titanic" competition with the kaggle CLI.
# NOTE(review): the val_accuracy figure in the message is typed by hand — no
# cell in view computes it; confirm it matches this run before submitting.
!kaggle competitions submit titanic -f ../submission/submission_{PROJECT_NAME}.csv -m "val_accuracy: 0.8380"

100%|██████████████████████████████████████| 2.77k/2.77k [00:02<00:00, 1.03kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster