In [1]:
# Enable the autoreload extension in mode 1: only modules registered with
# %aimport (see the import cell) are reloaded before each cell execution.
%load_ext autoreload
%autoreload 1

In [72]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import os
from os import path
import random
# import zipfile
import tqdm
%aimport common
%aimport common_scoring
%aimport common_prediction
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
from datetime import datetime, timedelta
import itertools
import json
import functools
from IPython.display import HTML
import time
import logging
import optuna
import webbrowser

# Path (without extension) identifying this notebook; handed to
# common.submit_prediction as notebook_fn.
FN = os.path.join(os.getcwd(), "20210822-optuna")

# Sentinel file: while it exists, the Optuna objective asks the study to stop.
STOPFILE_FN = ".optuna.stopfile"

In [3]:
# Obtain X, y and X_test from the project helper common.setup(), then carve a
# validation subset out of (X, y). random_state=0 keeps the split reproducible.
X, y, X_test = common.setup()
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=0,
)

In [66]:
# Scoring callables keyed by name, provided by the project's common_scoring module.
SCORINGS = common_scoring.get_scorings()

# Date stamp used in the prediction file names. Pinned to a fixed day;
# use datetime.now() instead to stamp files with the current date.
SUBMISSION_DATE = datetime(year=2021, month=8, day=23)

In [75]:
%%time
# Hyper-parameter search for the XGBoost regressor with Optuna.
# The best parameters are written to a JSON file at the end of the cell so a
# multi-hour search is not lost together with the kernel state.

_N_TRIALS = 16
_scoring = SCORINGS["cross_val"]


def objective(trial):
    """Score one sampled XGBoost configuration; the study minimizes the result.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    The value returned by ``SCORINGS["cross_val"]`` for the sampled model.
    """
    # Cooperative stop switch: creating STOPFILE_FN asks the study to stop.
    # study.stop() only prevents further trials; the current one still runs.
    if path.isfile(STOPFILE_FN):
        trial.study.stop()

    sampled_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        # upper bound was previously 1e-1
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
        # range was previously 1000..8000
        n_estimators=trial.suggest_int("n_estimators", 500, 10000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    )
    xgb = xgboost.XGBRegressor(**sampled_params)
    return _scoring(xgb, X, y, X_train, y_train, X_valid, y_valid)


study = optuna.create_study(direction="minimize")
study.optimize(
    objective,
    n_trials=_N_TRIALS,
    show_progress_bar=True,
)
xgb_params = study.best_params
best_value = study.best_value

# Persist the outcome using the same {"best_params": ...} layout that the
# prediction section reads back, so the search does not have to be repeated.
os.makedirs(".tmp", exist_ok=True)
_best_params_fn = path.join(
    ".tmp", f"optuna_best_params_{datetime.now().isoformat()}.json"
)
with open(_best_params_fn, "w") as f:
    json.dump({"best_params": xgb_params, "best_value": best_value}, f, indent=2)
print(f"best params saved to {_best_params_fn}")

[32m[I 2021-08-24 00:18:53,831][0m A new study created in memory with name: no-name-b0db16f9-66a4-4192-97ff-a6fc513d27ce[0m
  self._init_valid()


  0%|          | 0/16 [00:00<?, ?it/s]

[32m[I 2021-08-24 01:33:21,421][0m Trial 0 finished with value: 0.7240900809634516 and parameters: {'max_depth': 5, 'learning_rate': 0.042606592517095705, 'n_estimators': 8544, 'min_child_weight': 5, 'colsample_bytree': 0.3888750746930081, 'subsample': 0.6175868235438191, 'reg_alpha': 14.117286252220444, 'reg_lambda': 0.002518788650164608}. Best is trial 0 with value: 0.7240900809634516.[0m
[32m[I 2021-08-24 01:52:32,342][0m Trial 1 finished with value: 5.050510532868337 and parameters: {'max_depth': 8, 'learning_rate': 0.0002007706505253688, 'n_estimators': 2182, 'min_child_weight': 6, 'colsample_bytree': 0.20896020342148686, 'subsample': 0.43687550220355204, 'reg_alpha': 0.012969305595402467, 'reg_lambda': 0.006779718795709784}. Best is trial 0 with value: 0.7240900809634516.[0m
[32m[I 2021-08-24 03:19:51,695][0m Trial 2 finished with value: 4.3196793279973535 and parameters: {'max_depth': 10, 'learning_rate': 0.000138621961590795, 'n_estimators': 4317, 'min_child_weight': 10

In [76]:
# Signal that the Optuna search finished, via the project's common.call_me
# helper (presumably a notification hook — confirm in common.py).
common.call_me("optuna done")

## Prediction and submission

In [80]:
# Restore the tuned hyper-parameters from a previously saved Optuna result file.
with open(".tmp/optuna_best_params_2021-08-24T03:50:07.419376.json") as params_file:
    saved_result = json.load(params_file)
xgb_params = saved_result["best_params"]

In [81]:
%%time
# Run every prediction method from common_prediction with the tuned XGBoost
# model, write each result to a CSV under .tmp/ and pass it to
# common.submit_prediction.
# Here we use the fact that xgboost is deterministic, cf.
# https://www.kaggle.com/devinanzelmo/done-done-2-more-efficient
model = xgboost.XGBRegressor(**xgb_params)
_PREDICTION_MODES = common_prediction.get_prediction_methods()

for tag, pm in tqdm.notebook.tqdm(list(_PREDICTION_MODES.items())):
    pred_df = pm(model, X, y, X_train, y_train, X_valid, y_valid, X_test)
    pred_fn = f".tmp/predictions_{tag}_{SUBMISSION_DATE.strftime('%Y%m%d')}_optuna.csv"
    # index=False is the documented way to leave the index column out.
    pred_df.to_csv(pred_fn, index=False)
    common.submit_prediction(
        pred_fn,
        competition_name="30-days-of-ml",
        notebook_fn=FN,
        # NOTE: best_value is assigned by the Optuna search cell, so that cell
        # must have been executed in the current kernel.
        mse=best_value,
        suff=tag,
    )

_URL = "https://www.kaggle.com/c/30-days-of-ml/submissions"
# Prefer Firefox, but fall back to the default browser instead of failing the
# cell after all submissions have already been made.
try:
    _browser = webbrowser.get("firefox")
except webbrowser.Error:
    _browser = webbrowser
_browser.open(_URL)

  0%|          | 0/2 [00:00<?, ?it/s]

auto-commit


[master 3bc58e67] auto-commit
 3 files changed, 57 insertions(+), 83 deletions(-)


remote: 
remote: GitHub found 7 vulnerabilities on nailbiter/for's default branch (2 high, 5 moderate). To find out more, visit:        
remote:      https://github.com/nailbiter/for/security/dependabot        
remote: 
To https://github.com/nailbiter/for.git
   39449eb8..3bc58e67  master -> master
100%|██████████| 3.01M/3.01M [00:03<00:00, 855kB/s] 


Successfully submitted to 30 Days of ML

auto-commit


[master e8448fb0] auto-commit
 1 file changed, 37 insertions(+), 1 deletion(-)


remote: 
remote: GitHub found 7 vulnerabilities on nailbiter/for's default branch (2 high, 5 moderate). To find out more, visit:        
remote:      https://github.com/nailbiter/for/security/dependabot        
remote: 
To https://github.com/nailbiter/for.git
   3bc58e67..e8448fb0  master -> master
100%|██████████| 3.01M/3.01M [00:05<00:00, 631kB/s] 


Successfully submitted to 30 Days of MLCPU times: user 1h 12min 25s, sys: 1min 13s, total: 1h 13min 39s
Wall time: 10min 42s


True

In [74]:
# Signal that prediction and submission finished, via the project's
# common.call_me helper (presumably a notification hook — confirm in common.py).
common.call_me("submission done")