In [13]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import optuna

import warnings
warnings.filterwarnings("ignore")

In [3]:
#import data:
data = pd.read_csv("../../kaggle-30days/data/train_folds/train_folds.csv")
#shuffle the data:
data_full= data.sample(frac=1)

In [4]:
#training data: 
#Note: since test data is not available:
df = data_full.iloc[:240000,:]
df_test = data_full.iloc[240000:,:]

In [5]:
useful_features = [c for c in df.columns if c not in ["id","target","kfold"]]
object_cols = [c for c in useful_features if c.startswith("cat")]
df_test = df_test[useful_features]

In [6]:
#target encoding:
for col in object_cols:
    temp_df = []
    temp_test_feat =None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        print(feat)
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        print(xvalid.head())
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat) 
    
    #calculate the value for each fold and then divide by 5:
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)
    print("df",df)
    
        
        

{'A': 8.237357916702521, 'B': 8.249206864369928}
       id cat0 cat1 cat2  ...    cont13    target kfold tar_enc_cat0
0   97127    A    A    A  ...  0.189804  8.884669     0     8.237358
1   28518    A    A    A  ...  0.857538  9.041808     0     8.237358
2  289922    B    A    B  ...  0.764259  7.428308     0     8.249207
3  265997    B    A    A  ...  0.425692  8.327012     0     8.249207
4  103360    B    A    A  ...  0.708118  8.630546     0     8.249207

[5 rows x 28 columns]
{'A': 8.237168140863156, 'B': 8.249228725855923}
       id cat0 cat1 cat2  ...    cont13    target kfold tar_enc_cat0
0  488880    B    B    A  ...  0.818652  9.339204     1     8.249229
1  357806    A    B    A  ...  0.843537  8.117776     1     8.237168
2   41431    B    B    A  ...  0.587890  8.200927     1     8.249229
3  188016    A    A    A  ...  0.287837  8.823878     1     8.237168
4  327993    A    A    A  ...  0.202111  9.081631     1     8.237168

[5 rows x 28 columns]
{'A': 8.23817247947091, 'B':

In [8]:
df_test = df_test[useful_features]

In [14]:
#For this GPU is required
#Note: GPU hist doesnt give much great results, so tune the parameter in gpu and take those best params and use it to predict in CPU
def run(trial):
    fold=0
    #some examples of parameters for xgboost:
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    
    
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    ordinalencoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinalencoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinalencoder.transform(xvalid[object_cols])
    
    model = XGBRegressor(random_state=42,
                        #tree_method = "gpu_hist",
                        #gpu_id = "1",
                        #predictor = "gpu_predictor",
                        n_estimators = 200,
                        learning_rate =learning_rate,
                        reg_lambda = reg_lambda,
                        reg_alpha = reg_alpha,
                        subsample = subsample, 
                        colsample_bytree = colsample_bytree,
                        max_depth = max_depth,)
    model.fit(xtrain, ytrain, early_stopping_rounds=100, eval_set =[(xvalid, yvalid)], verbose=1000)
    pred_valid = model.predict(xvalid)
    rmse = mean_squared_error(y_true=yvalid, y_pred=pred_valid, squared=False)
    return rmse
    

In [16]:
#here we are specifying minimize since we wanted to optimize the rmse score:
study = optuna.create_study(direction = "minimize")
study.optimize(run, n_trials=5)

#Print the best parameters:
study.best_params

[32m[I 2023-05-16 21:56:33,939][0m A new study created in memory with name: no-name-b1d5bbbf-132a-48c5-9096-d0a09016c5e9[0m


[0]	validation_0-rmse:7.71773
[199]	validation_0-rmse:1.83691


[32m[I 2023-05-16 21:56:41,024][0m Trial 0 finished with value: 1.8369129136751796 and parameters: {'learning_rate': 0.00761406470599245, 'reg_lambda': 0.011805311808314534, 'reg_alpha': 88.26953706323562, 'subsample': 0.3823448694972297, 'colsample_bytree': 0.4795130865215185, 'max_depth': 2}. Best is trial 0 with value: 1.8369129136751796.[0m


[0]	validation_0-rmse:7.77574
[199]	validation_0-rmse:7.64842


[32m[I 2023-05-16 21:56:48,723][0m Trial 1 finished with value: 7.6484207924682615 and parameters: {'learning_rate': 8.374908887543265e-05, 'reg_lambda': 17.733011282161932, 'reg_alpha': 0.0005951469797068024, 'subsample': 0.374830851517874, 'colsample_bytree': 0.7887392712816126, 'max_depth': 6}. Best is trial 0 with value: 1.8369129136751796.[0m


[0]	validation_0-rmse:7.77628
[199]	validation_0-rmse:7.75564


[32m[I 2023-05-16 21:56:59,786][0m Trial 2 finished with value: 7.755639456905614 and parameters: {'learning_rate': 1.3476639756894046e-05, 'reg_lambda': 0.021783328860354112, 'reg_alpha': 2.9809748731059723e-07, 'subsample': 0.6759219277211678, 'colsample_bytree': 0.17035763068373644, 'max_depth': 3}. Best is trial 0 with value: 1.8369129136751796.[0m


[0]	validation_0-rmse:7.77561
[199]	validation_0-rmse:7.62336


[32m[I 2023-05-16 21:57:04,264][0m Trial 3 finished with value: 7.623361659888947 and parameters: {'learning_rate': 0.00010028943444929665, 'reg_lambda': 4.1854526520695916e-08, 'reg_alpha': 1.5086348710080711e-05, 'subsample': 0.14817767805347645, 'colsample_bytree': 0.10109231193521442, 'max_depth': 1}. Best is trial 0 with value: 1.8369129136751796.[0m


[0]	validation_0-rmse:7.75453
[199]	validation_0-rmse:4.44907


[32m[I 2023-05-16 21:57:12,271][0m Trial 4 finished with value: 4.449070852338091 and parameters: {'learning_rate': 0.0028359892174821193, 'reg_lambda': 20.083851098961016, 'reg_alpha': 3.3856494175545493e-06, 'subsample': 0.8395102804934238, 'colsample_bytree': 0.7038628425590557, 'max_depth': 6}. Best is trial 0 with value: 1.8369129136751796.[0m


{'learning_rate': 0.00761406470599245,
 'reg_lambda': 0.011805311808314534,
 'reg_alpha': 88.26953706323562,
 'subsample': 0.3823448694972297,
 'colsample_bytree': 0.4795130865215185,
 'max_depth': 2}