# Kaggle

In [1]:
import os
import json

# Load the Kaggle API credentials from kaggle.json and export them as the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables used by the `kaggle`
# CLI calls in the following cells.
# The directory defaults to the original workspace location but can be
# overridden with KAGGLE_CONFIG_DIR, so the notebook is not tied to one
# machine's absolute path.
kaggle_config_dir = os.environ.get("KAGGLE_CONFIG_DIR") or "/Users/g-ogaki/workspace"
with open(os.path.join(kaggle_config_dir, "kaggle.json")) as f:
    json_data = json.load(f)

# The file handle is no longer needed once the JSON has been parsed.
os.environ["KAGGLE_USERNAME"] = json_data["username"]
os.environ["KAGGLE_KEY"] = json_data["key"]

In [2]:
# Download the Titanic competition archive (titanic.zip) with the Kaggle CLI.
!kaggle competitions download -c titanic

Downloading titanic.zip to /Users/g-ogaki/workspace/kaggle-titanic/code
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 814kB/s]


In [3]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Import

In [3]:
from utils import init_logger, timer, fix_seed
import random
import pandas as pd
pd.set_option('display.max_columns', 150)
import numpy as np
import category_encoders as ce
import matplotlib.pyplot as plt
import datetime as dt
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Register tqdm's progress-bar pandas methods (e.g. progress_apply), labelled "Processing:".
tqdm_notebook.pandas(desc="Processing:")

# fix_seed and init_logger are helpers from the project's utils module.
# NOTE(review): presumably fix_seed seeds the RNGs for reproducibility — confirm in utils.
fix_seed()
logger = init_logger()

In [4]:
# Run identifier: passed to Runner and embedded in the submission CSV file name.
PROJECT_NAME = "v1"

# Load

In [5]:
from etl import load_data

In [6]:
# Load the train and test frames; `timer` logs the start and the elapsed
# time of the "read csv" step through `logger`.
with timer("read csv", logger):
    df_train, df_test = load_data()

2022/01/01 11:10:08 45 [INFO] [read csv] start.
2022/01/01 11:10:08 47 [INFO] [read csv] done in 0.032 seconds.


In [7]:
# Separate model inputs from the target: "Survived" is the label and
# "PassengerId" is only needed later to build the submission file.
# NOTE(review): dropping "Survived" from df_test raises KeyError unless
# load_data returns a test frame that carries that column — confirm in etl.load_data.
X_train = df_train.drop(columns=["PassengerId", "Survived"])
y_train = df_train["Survived"]
X_test = df_test.drop(columns=["PassengerId", "Survived"])

# Training

In [8]:
from model_nn import ModelNN
from runner import Runner
from keras.callbacks import EarlyStopping
from hyperopt import hp, fmin, tpe, space_eval

In [10]:
# Runner instance that the hyperopt objective trains and scores.
# NOTE(review): cv=False presumably disables cross-validation so each trial
# fits a single model — confirm in runner.Runner.
runner = Runner(PROJECT_NAME, ModelNN, cv=False)

In [14]:
# Model hyperparameters passed to runner.train for the final model.
# NOTE(review): presumably hidden-layer count, dropout rate and units per
# layer — confirm against model_nn.ModelNN.
params = {
    "layers": 4,
    "dropout": 0.2,
    "units": 8
}

# Fit arguments passed to runner.train alongside the model hyperparameters.
# "epochs" is only an upper bound: EarlyStopping ends a fit once val_loss
# has not improved for 30 consecutive epochs.
train_params = {
    "epochs": 5000,
    "batch_size": 32,
    "verbose": 0,
    "callbacks": [EarlyStopping(monitor="val_loss", min_delta=0, patience=30, verbose=1)]
}

In [10]:
def objective(args):
    """Hyperopt objective: train once with the sampled hyperparameters.

    Args:
        args: Hyperparameter set sampled by hyperopt from the search space;
            logged and forwarded to ``runner.train`` as the model parameters.

    Returns:
        The value of ``runner.get_score()`` after training, which ``fmin``
        minimizes.
    """
    logger.info(args)
    # Train on the frames built from df_train. The previous version read the
    # globals `X` and `y`, which no cell defines, so every trial raised
    # NameError on a fresh kernel.
    runner.train(X_train, y_train, args, train_params)
    return runner.get_score()

In [11]:
# Run 20 TPE trials over the search space, then map the raw result returned
# by fmin back to concrete parameter values with space_eval.
# NOTE(review): `space` is not defined in any cell visible here — it must be
# defined before this cell runs, otherwise the cell raises NameError.
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20)
best_params = space_eval(space, best)
logger.info("best params: {}".format(best_params))

  0%|                                    | 0/20 [00:00<?, ?trial/s, best loss=?]

2021/12/31 14:33:48 2 [INFO] {'bagging_fraction': 0.8041882760340158, 'feature_fraction': 0.758404881916913, 'lambda_l1': 0.8732061538314787, 'lambda_l2': 0.02853921475938039, 'learning_rate': 0.01, 'max_leaves': 33, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 41, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.45077	training's multi_error: 0.0803313	valid_1's multi_logloss: 0.45129	valid_1's multi_error: 0.0805225
[100]	training's multi_logloss: 0.291398	training's multi_error: 0.0571466	valid_1's multi_logloss: 0.292148	valid_1's multi_error: 0.0572413
[150]	training's multi_logloss: 0.209781	training's multi_error: 0.0523722	valid_1's multi_logloss: 0.210703	valid_1's multi_error: 0.052405
[200]	training's multi_logloss: 0.166474	training's multi_error: 0.0488456	valid_1's multi_logloss: 0.167547	valid_1's multi_error: 0.0490913
[250]	training's multi_logloss: 0.141748	training's multi_error: 0.0466009	valid_1's multi_logloss: 0.142972	valid_1's multi_error: 0.0470013
[300]	training's multi_logloss: 0.12632	training's multi_error: 0.04482	valid_1's multi_logloss: 0.127683	valid_1's multi_error: 0.0453975
[350]	training's multi_logloss: 0.115918	training's multi_error: 0.0433262

2021/12/31 14:46:45 2 [INFO] {'bagging_fraction': 0.8309292802869729, 'feature_fraction': 0.8493136822063353, 'lambda_l1': 0.012037178705749304, 'lambda_l2': 0.018650660926638262, 'learning_rate': 0.01, 'max_leaves': 46, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 19, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.419123	training's multi_error: 0.0672559	valid_1's multi_logloss: 0.420059	valid_1's multi_error: 0.0676675
[100]	training's multi_logloss: 0.25902	training's multi_error: 0.0535016	valid_1's multi_logloss: 0.260289	valid_1's multi_error: 0.0536575
[150]	training's multi_logloss: 0.184407	training's multi_error: 0.0485353	valid_1's multi_logloss: 0.185946	valid_1's multi_error: 0.0489625
[200]	training's multi_logloss: 0.146783	training's multi_error: 0.0458656	valid_1's multi_logloss: 0.148511	valid_1's multi_error: 0.0464287
[250]	training's multi_logloss: 0.124943	training's multi_error: 0.0439166	valid_1's multi_logloss: 0.12689	valid_1's multi_error: 0.0445912
[300]	training's multi_logloss: 0.112358	training's multi_error: 0.0423316	valid_1's multi_logloss: 0.11449	valid_1's multi_error: 0.0431487
[350]	training's multi_logloss: 0.104379	training's multi_error: 0.0411

2021/12/31 14:56:20 2 [INFO] {'bagging_fraction': 0.8299037171131212, 'feature_fraction': 0.803399637322971, 'lambda_l1': 0.0626719383891622, 'lambda_l2': 0.014493318498254155, 'learning_rate': 0.01, 'max_leaves': 53, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 26, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.424079	training's multi_error: 0.0668431	valid_1's multi_logloss: 0.425126	valid_1's multi_error: 0.0673437
[100]	training's multi_logloss: 0.264431	training's multi_error: 0.0523875	valid_1's multi_logloss: 0.265829	valid_1's multi_error: 0.0526713
[150]	training's multi_logloss: 0.187529	training's multi_error: 0.0476209	valid_1's multi_logloss: 0.189216	valid_1's multi_error: 0.04817
[200]	training's multi_logloss: 0.148103	training's multi_error: 0.0450788	valid_1's multi_logloss: 0.14999	valid_1's multi_error: 0.0457687
[250]	training's multi_logloss: 0.125608	training's multi_error: 0.0431475	valid_1's multi_logloss: 0.127715	valid_1's multi_error: 0.0439288
[300]	training's multi_logloss: 0.11206	training's multi_error: 0.0417019	valid_1's multi_logloss: 0.114342	valid_1's multi_error: 0.0425775
[350]	training's multi_logloss: 0.103565	training's multi_error: 0.04059

2021/12/31 15:07:17 2 [INFO] {'bagging_fraction': 0.8989200326060357, 'feature_fraction': 0.86436856562306, 'lambda_l1': 0.4897384544924264, 'lambda_l2': 0.011468634268358415, 'learning_rate': 0.01, 'max_leaves': 34, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 40, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.425145	training's multi_error: 0.0708938	valid_1's multi_logloss: 0.425904	valid_1's multi_error: 0.0712462
[100]	training's multi_logloss: 0.266259	training's multi_error: 0.0565116	valid_1's multi_logloss: 0.267291	valid_1's multi_error: 0.0566038
[150]	training's multi_logloss: 0.191915	training's multi_error: 0.0511988	valid_1's multi_logloss: 0.193168	valid_1's multi_error: 0.0513612
[200]	training's multi_logloss: 0.154016	training's multi_error: 0.0480156	valid_1's multi_logloss: 0.155449	valid_1's multi_error: 0.048395
[250]	training's multi_logloss: 0.132006	training's multi_error: 0.0458037	valid_1's multi_logloss: 0.133624	valid_1's multi_error: 0.0464338
[300]	training's multi_logloss: 0.118979	training's multi_error: 0.0440994	valid_1's multi_logloss: 0.120767	valid_1's multi_error: 0.044715
[350]	training's multi_logloss: 0.110536	training's multi_error: 0.042

2021/12/31 15:17:16 2 [INFO] {'bagging_fraction': 0.7068361964966817, 'feature_fraction': 0.7116335119363578, 'lambda_l1': 0.09513094301468064, 'lambda_l2': 0.524104595381781, 'learning_rate': 0.01, 'max_leaves': 52, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 41, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.454084	training's multi_error: 0.0788222	valid_1's multi_logloss: 0.454689	valid_1's multi_error: 0.0792237
[100]	training's multi_logloss: 0.294272	training's multi_error: 0.0541347	valid_1's multi_logloss: 0.295186	valid_1's multi_error: 0.0543525
[150]	training's multi_logloss: 0.208092	training's multi_error: 0.048985	valid_1's multi_logloss: 0.209248	valid_1's multi_error: 0.0494125
[200]	training's multi_logloss: 0.162704	training's multi_error: 0.0458238	valid_1's multi_logloss: 0.164076	valid_1's multi_error: 0.0464613
[250]	training's multi_logloss: 0.137146	training's multi_error: 0.0438575	valid_1's multi_logloss: 0.138687	valid_1's multi_error: 0.0445888
[300]	training's multi_logloss: 0.120831	training's multi_error: 0.0423044	valid_1's multi_logloss: 0.122563	valid_1's multi_error: 0.0431038
[350]	training's multi_logloss: 0.110143	training's multi_error: 0.04

2021/12/31 15:30:12 2 [INFO] {'bagging_fraction': 0.8197814508193194, 'feature_fraction': 0.7634908800588147, 'lambda_l1': 0.13532178103709736, 'lambda_l2': 0.21595416946835802, 'learning_rate': 0.01, 'max_leaves': 69, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 31, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.438482	training's multi_error: 0.0708359	valid_1's multi_logloss: 0.43934	valid_1's multi_error: 0.0713088
[100]	training's multi_logloss: 0.276366	training's multi_error: 0.0508994	valid_1's multi_logloss: 0.277597	valid_1's multi_error: 0.0512475
[150]	training's multi_logloss: 0.193837	training's multi_error: 0.046145	valid_1's multi_logloss: 0.195396	valid_1's multi_error: 0.0468638
[200]	training's multi_logloss: 0.150607	training's multi_error: 0.0436288	valid_1's multi_logloss: 0.152423	valid_1's multi_error: 0.0443363
[250]	training's multi_logloss: 0.126558	training's multi_error: 0.0418891	valid_1's multi_logloss: 0.128602	valid_1's multi_error: 0.0427825
[300]	training's multi_logloss: 0.111869	training's multi_error: 0.0407081	valid_1's multi_logloss: 0.114136	valid_1's multi_error: 0.0416238
[350]	training's multi_logloss: 0.102321	training's multi_error: 0.039

2021/12/31 15:42:37 2 [INFO] {'bagging_fraction': 0.8752568813438453, 'feature_fraction': 0.7759201601499417, 'lambda_l1': 0.06838213507022683, 'lambda_l2': 0.19405602740685965, 'learning_rate': 0.01, 'max_leaves': 59, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 47, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.433631	training's multi_error: 0.0714975	valid_1's multi_logloss: 0.434376	valid_1's multi_error: 0.07185
[100]	training's multi_logloss: 0.272359	training's multi_error: 0.0519913	valid_1's multi_logloss: 0.273444	valid_1's multi_error: 0.0523363
[150]	training's multi_logloss: 0.192102	training's multi_error: 0.0471216	valid_1's multi_logloss: 0.193483	valid_1's multi_error: 0.0477038
[200]	training's multi_logloss: 0.15066	training's multi_error: 0.0444141	valid_1's multi_logloss: 0.152279	valid_1's multi_error: 0.0451587
[250]	training's multi_logloss: 0.127133	training's multi_error: 0.0425897	valid_1's multi_logloss: 0.128971	valid_1's multi_error: 0.0434037
[300]	training's multi_logloss: 0.112867	training's multi_error: 0.0413022	valid_1's multi_logloss: 0.114904	valid_1's multi_error: 0.0422125
[350]	training's multi_logloss: 0.103675	training's multi_error: 0.0402

KeyboardInterrupt: 

# Inference

In [15]:
# Rebuild the runner with cv=True and train it with the fixed `params`
# and `train_params`; `timer` logs the elapsed time of the "train" step.
# NOTE(review): cv=True presumably fits one model per cross-validation fold
# (the captured log shows five early-stopping messages) — confirm in runner.Runner.
runner = Runner(PROJECT_NAME, ModelNN, cv=True)
with timer("train", logger):
    runner.train(X_train, y_train, params, train_params)

2022/01/01 11:11:58 45 [INFO] [train] start.
2022-01-01 11:11:58.698966: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:11:59.386391: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00154: early stopping


2022-01-01 11:12:42.842436: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:12:43.822981: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00190: early stopping


2022-01-01 11:13:38.414961: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:13:39.355758: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00182: early stopping


2022-01-01 11:14:31.224461: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:14:32.222778: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00082: early stopping


2022-01-01 11:14:55.757951: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:14:56.795413: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022/01/01 11:15:18 47 [INFO] [train] done in 200.435 seconds.


Epoch 00077: early stopping


In [18]:
# Display the score of the trained runner (quoted as the loss in the submission message).
runner.get_score()

0.4293127179145813

In [16]:
# Predict on the test features; `timer` logs the elapsed time of the "prediction" step.
with timer("prediction", logger):
    prob = runner.predict(X_test)
# Threshold the predictions into class labels: 1 where prob exceeds 0.5, else 0.
# NOTE(review): presumably prob holds predicted survival probabilities — confirm in Runner.predict.
pred = np.where(prob > 0.5, 1, 0)
pred

2022/01/01 11:15:21 45 [INFO] [prediction] start.
2022-01-01 11:15:21.248984: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:15:21.462221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:15:21.666765: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:15:21.935051: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-01 11:15:22.202376: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022/01/01 11:15:22 47 [INFO] [prediction] done in 1.260 seconds.


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [17]:
# Build the submission frame: one row per test passenger with its predicted label.
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": pred
})
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs("../submission", exist_ok=True)
submission.to_csv(f"../submission/submission_{PROJECT_NAME}.csv", index=False)
# Last expression of the cell: render the frame that was just written.
submission

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,0
893,894,0
894,895,0
895,896,0
...,...,...
1304,1305,0
1305,1306,1
1306,1307,0
1307,1308,0


In [20]:
!kaggle competitions submit titanic -f ../submission/submission_{PROJECT_NAME}.csv -m "loss: 0.4293127179145813"

100%|████████████████████████████████████████| 2.77k/2.77k [00:03<00:00, 842B/s]
Successfully submitted to Titanic - Machine Learning from Disaster