In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import preprocessing

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import optuna

In [2]:
# Read the three competition CSVs: the training rows, the test rows, and the
# sample submission that is reused later as the template for the output file.
train = pd.read_csv("../input/30-days-of-ml/train.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [4]:
# Preview the first rows of the sample submission (columns: id, target).
sample.head()

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [5]:
# Number of cross-validation folds used by the rest of the notebook.
n_splits = 5
# Placeholder fold id; each row receives its real fold in the next cell.
train["kfold"] = -1

In [6]:
# Shuffle with a fixed seed and tag every row with the fold in which it is
# held out for validation.
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=0)
for fold, (_fit_rows, holdout_rows) in enumerate(kf.split(train)):
    train.loc[holdout_rows, "kfold"] = fold

In [7]:
# Feature columns are everything except the row id, the label and the fold marker.
useful_features = [c for c in train.columns if c not in ("id", "target", "kfold")]
# Select raw categoricals by their "cat" prefix, the same test the later
# recomputation of this list uses. A substring test would also match derived
# names such as "tar_enc_cat0" if this cell is re-run after they are added.
object_cols = [col for col in useful_features if col.startswith("cat")]
# Explicit copy: the target-encoding cell assigns new columns onto `test`.
test = test[useful_features].copy()

In [8]:
# Out-of-fold target encoding of every categorical column.
#
# For each fold, the per-category mean of `target` is computed on the *other*
# folds only and mapped onto the held-out rows, so a row never sees its own
# label. The test encoding is the average of the per-fold mappings.
# Categories absent from a fold's training rows are left as NaN by `map`.
for col in object_cols:
    encoded_name = f"tar_enc_{col}"
    temp_train = []
    temp_test_feat = None
    for fold in range(n_splits):
        xtrain = train[train.kfold != fold].reset_index(drop=True)
        xvalid = train[train.kfold == fold].reset_index(drop=True)
        # category -> mean target, learned without the held-out fold
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid[encoded_name] = xvalid[col].map(feat)
        temp_train.append(xvalid)
        fold_test_feat = test[col].map(feat)
        if temp_test_feat is None:
            temp_test_feat = fold_test_feat
        else:
            temp_test_feat = temp_test_feat + fold_test_feat

    test[encoded_name] = temp_test_feat / n_splits
    # Each fold frame was re-indexed from 0, so a plain concat would leave
    # `train` with repeated index labels; rebuild a clean 0..n-1 index so later
    # label-based `.loc` access stays unambiguous.
    train = pd.concat(temp_train, ignore_index=True)

In [9]:
# Rebuild the feature lists now that the "tar_enc_*" columns exist, and put
# the test columns in the same order as the training features.
useful_features = [
    name for name in train.columns if name not in {"id", "target", "kfold"}
]
object_cols = [name for name in useful_features if name.startswith("cat")]
test = test[useful_features]

In [10]:
# Show the final feature list, then the categorical subset, one per line.
print(useful_features, object_cols, sep="\n")

['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'tar_enc_cat0', 'tar_enc_cat1', 'tar_enc_cat2', 'tar_enc_cat3', 'tar_enc_cat4', 'tar_enc_cat5', 'tar_enc_cat6', 'tar_enc_cat7', 'tar_enc_cat8', 'tar_enc_cat9']
['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


In [11]:
def run(trial):
    """Optuna objective: fit XGBoost with sampled hyperparameters and score fold 0.

    Parameters
    ----------
    trial : optuna trial
        Used to sample ``learning_rate``, ``reg_lambda``, ``reg_alpha``,
        ``subsample``, ``colsample_bytree`` and ``max_depth``.

    Returns
    -------
    float
        RMSE of the fitted model on the rows of ``train`` whose ``kfold`` is 0.

    Notes
    -----
    Reads the notebook-level ``train``, ``useful_features`` and ``object_cols``.
    Only one fold is evaluated per trial; the remaining folds form the training set.
    """
    fold = 0

    # Log-scaled ranges use suggest_float(..., log=True) throughout; this is the
    # non-deprecated form of suggest_loguniform and matches learning_rate.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies so the in-place ordinal encoding below only ever touches
    # these per-trial frames.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training folds only, then reuse it for validation.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        # Upper bound only; early stopping on the validation fold decides the
        # number of rounds actually used.
        n_estimators=10000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    # Root of the plain MSE: same value as `squared=False`, without relying on
    # that keyword, which newer scikit-learn releases deprecate and remove.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))

    return rmse

In [12]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All; an unseeded study proposes different trials each run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(run, n_trials=10)

[32m[I 2021-08-30 02:06:44,685][0m A new study created in memory with name: no-name-9230728a-72ee-40c5-9aca-5670852a0e77[0m


[0]	validation_0-rmse:7.60435
[1000]	validation_0-rmse:0.72256
[2000]	validation_0-rmse:0.72159
[2115]	validation_0-rmse:0.72157


[32m[I 2021-08-30 02:07:00,010][0m Trial 0 finished with value: 0.7215323844820103 and parameters: {'learning_rate': 0.02214783482175915, 'reg_lambda': 0.07076779154981984, 'reg_alpha': 2.923602499561769e-06, 'subsample': 0.4648763713002555, 'colsample_bytree': 0.2193714469525782, 'max_depth': 6}. Best is trial 0 with value: 0.7215323844820103.[0m


[0]	validation_0-rmse:5.85234
[1000]	validation_0-rmse:0.72100
[2000]	validation_0-rmse:0.72083
[2154]	validation_0-rmse:0.72098


[32m[I 2021-08-30 02:07:07,526][0m Trial 1 finished with value: 0.7207342344965707 and parameters: {'learning_rate': 0.24996830282032062, 'reg_lambda': 74.95430763488979, 'reg_alpha': 5.092178744070549, 'subsample': 0.6600886156203973, 'colsample_bytree': 0.3171222931639457, 'max_depth': 2}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:6.37868
[385]	validation_0-rmse:0.73327


[32m[I 2021-08-30 02:07:12,233][0m Trial 2 finished with value: 0.7270465105118721 and parameters: {'learning_rate': 0.18132711555352352, 'reg_lambda': 0.0005275532590882823, 'reg_alpha': 4.289594377912753e-06, 'subsample': 0.5487823634521312, 'colsample_bytree': 0.6560684706232305, 'max_depth': 6}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:7.53614
[1000]	validation_0-rmse:0.72235
[2000]	validation_0-rmse:0.72110
[2587]	validation_0-rmse:0.72108


[32m[I 2021-08-30 02:07:26,277][0m Trial 3 finished with value: 0.7210280124015485 and parameters: {'learning_rate': 0.031005563306090153, 'reg_lambda': 1.0102031071008295, 'reg_alpha': 0.0001331709494191399, 'subsample': 0.8272247317015341, 'colsample_bytree': 0.3133527001544443, 'max_depth': 5}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:6.90245
[353]	validation_0-rmse:0.75472


[32m[I 2021-08-30 02:07:31,993][0m Trial 4 finished with value: 0.7346053899302317 and parameters: {'learning_rate': 0.11322695663349736, 'reg_lambda': 0.0010302494894871084, 'reg_alpha': 0.0003502568826912699, 'subsample': 0.13412087985192545, 'colsample_bytree': 0.9492767542699304, 'max_depth': 7}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:6.52449
[350]	validation_0-rmse:0.76513


[32m[I 2021-08-30 02:07:37,815][0m Trial 5 finished with value: 0.7338437963956541 and parameters: {'learning_rate': 0.16238107307325356, 'reg_lambda': 3.5229818866645652e-06, 'reg_alpha': 1.0787822162912993e-05, 'subsample': 0.16820005364052576, 'colsample_bytree': 0.30964513973902597, 'max_depth': 7}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:6.94381
[535]	validation_0-rmse:0.72698


[32m[I 2021-08-30 02:07:43,445][0m Trial 6 finished with value: 0.7246564399013294 and parameters: {'learning_rate': 0.10789353198743094, 'reg_lambda': 0.0010889936982949636, 'reg_alpha': 0.08950237750649363, 'subsample': 0.59329994878405, 'colsample_bytree': 0.6339266184249422, 'max_depth': 6}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:7.06324
[1000]	validation_0-rmse:0.73027
[2000]	validation_0-rmse:0.72763
[3000]	validation_0-rmse:0.72614
[4000]	validation_0-rmse:0.72534
[5000]	validation_0-rmse:0.72460
[6000]	validation_0-rmse:0.72406
[7000]	validation_0-rmse:0.72361
[8000]	validation_0-rmse:0.72328
[9000]	validation_0-rmse:0.72298
[9999]	validation_0-rmse:0.72274


[32m[I 2021-08-30 02:08:03,745][0m Trial 7 finished with value: 0.7227084240755903 and parameters: {'learning_rate': 0.09239775161052549, 'reg_lambda': 4.84511859775794e-06, 'reg_alpha': 0.04986449596508688, 'subsample': 0.3332190254592759, 'colsample_bytree': 0.4626857724714698, 'max_depth': 1}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:7.10870
[1000]	validation_0-rmse:0.72196
[1378]	validation_0-rmse:0.72204


[32m[I 2021-08-30 02:08:10,804][0m Trial 8 finished with value: 0.7219019089077013 and parameters: {'learning_rate': 0.08651250034118399, 'reg_lambda': 16.255848978933955, 'reg_alpha': 19.685406239013638, 'subsample': 0.3437549929160484, 'colsample_bytree': 0.43091190346841435, 'max_depth': 4}. Best is trial 1 with value: 0.7207342344965707.[0m


[0]	validation_0-rmse:7.50637
[1000]	validation_0-rmse:0.72885
[2000]	validation_0-rmse:0.72484
[3000]	validation_0-rmse:0.72301
[4000]	validation_0-rmse:0.72199
[5000]	validation_0-rmse:0.72141
[6000]	validation_0-rmse:0.72107
[7000]	validation_0-rmse:0.72092
[8000]	validation_0-rmse:0.72079
[9000]	validation_0-rmse:0.72075
[9158]	validation_0-rmse:0.72075


[32m[I 2021-08-30 02:08:34,460][0m Trial 9 finished with value: 0.7207290397605832 and parameters: {'learning_rate': 0.034863297894071936, 'reg_lambda': 0.02571068180150075, 'reg_alpha': 1.4058982970223255e-08, 'subsample': 0.6513216548171736, 'colsample_bytree': 0.9187565602194514, 'max_depth': 2}. Best is trial 9 with value: 0.7207290397605832.[0m


In [13]:
# Hyperparameters of the trial with the lowest validation RMSE.
study.best_params

{'learning_rate': 0.034863297894071936,
 'reg_lambda': 0.02571068180150075,
 'reg_alpha': 1.4058982970223255e-08,
 'subsample': 0.6513216548171736,
 'colsample_bytree': 0.9187565602194514,
 'max_depth': 2}

In [14]:
# Keep an independent dict of the best trial's hyperparameters.
model_hparams = dict(study.best_params)

In [15]:
# Final models: train one XGBoost regressor per fold, score it on its held-out
# fold and predict the test set; the per-fold test predictions are averaged in
# the next cell.

# NOTE(review): this hard-coded set replaces the `model_hparams` taken from the
# study above, so the tuning result is not what the final models use.
# Presumably these values come from an earlier, longer search -- confirm before
# relying on the study output.
model_hparams = {'learning_rate': 0.07853392035787837, 'reg_lambda': 1.7549293092194938e-05, 'reg_alpha': 14.68267919457715, 'subsample': 0.8031450486786944, 'colsample_bytree': 0.170759104940733, 'max_depth': 3}

final_predictions = []
scores = []
# Iterate over the same number of folds that was used to build `kfold`.
for fold in range(n_splits):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies so the in-place encoding below only touches per-fold frames.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Encoder is fitted on this fold's training rows and reused for the
    # validation and test frames.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # GPU options (tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    # are left disabled for this run.
    model = XGBRegressor(
        random_state=0,
        n_estimators=10000,
        **model_hparams
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)

    final_predictions.append(test_preds)
    # Root of the plain MSE: same value as `squared=False`, without relying on
    # that keyword, which newer scikit-learn releases deprecate and remove.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    scores.append(rmse)

    print(fold, rmse)

# Mean and spread of the per-fold validation RMSE.
print(np.mean(scores), np.std(scores))

[0]	validation_0-rmse:7.17003
[1000]	validation_0-rmse:0.72052
[2000]	validation_0-rmse:0.71876
[2980]	validation_0-rmse:0.71862
0 0.7185690282322732
[0]	validation_0-rmse:7.17294
[1000]	validation_0-rmse:0.71768
[2000]	validation_0-rmse:0.71595
[3000]	validation_0-rmse:0.71585
[3097]	validation_0-rmse:0.71588
1 0.7158032403042446
[0]	validation_0-rmse:7.17481
[1000]	validation_0-rmse:0.72258
[2000]	validation_0-rmse:0.72064
[3000]	validation_0-rmse:0.72039
[3577]	validation_0-rmse:0.72046
2 0.7203691676704727
[0]	validation_0-rmse:7.17146
[1000]	validation_0-rmse:0.71631
[2000]	validation_0-rmse:0.71436
[2805]	validation_0-rmse:0.71431
3 0.7142325177485888
[0]	validation_0-rmse:7.17549
[1000]	validation_0-rmse:0.71692
[2000]	validation_0-rmse:0.71492
[3000]	validation_0-rmse:0.71465
[3107]	validation_0-rmse:0.71466
4 0.7146383432467058
0.7167224594404569 0.0023705109565708444


In [16]:
# One column per fold model; average across the columns for each test row.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)

In [17]:
# Reuse the sample submission as the template: same rows and columns, with
# `target` replaced by the averaged predictions.
output = sample.assign(target=preds)
output.to_csv('submission_tuning_optuna.csv', index=False)