In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import os
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [2]:
def getFiles():
    """ Dictonary to get the right Files"""
    dict={}
    for dirname,_, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            dict[filename[0:filename.find('.')]] = pd.read_csv(os.path.join(dirname, filename))
    print('Path dictonary created!')
    print('')
    return dict

In [3]:
def preprocess(dataset, mode):
    """
    @param dataset: training or test
    @param mode: define y and features of training/test dataset
    @return Return either training data or the test data
    """
    #Define the target and the features
    if mode is "train":
        y = dataset.target
        features = dataset.columns[1:-1]
    else:
        features = dataset.columns[1:]
    X = dataset[features]

    #Check and handle categorical data with One-Hot-Encoder
    s = (X.dtypes == 'object')
    object_cols = list(s[s].index)
    OH_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))
    OH_cols_train.index = X.index
    num_X = X.drop(object_cols, axis=1)
    label_X = pd.concat([num_X, OH_cols_train], axis=1)
    
    #Return the correct X and y data to train or test the model
    if mode is "train":
        train_X, val_X, train_y, val_y = train_test_split(label_X, y, random_state = 0, test_size = 0.2, shuffle = True)
        return label_X, y, train_X, train_y, val_X, val_y
    else:
        return label_X, dataset

In [4]:
def submit(dataset, submit_prediction):
    """Submit the final prediction of the test data."""
    output = pd.DataFrame({'id': dataset.id,
                       'target': submit_prediction})
    output.to_csv('submission.csv', index=False)

In [5]:
dict = getFiles()
X, y, train_X, train_y, val_X, val_y = preprocess(dataset = dict['train'], mode = "train")
test_X, testset = preprocess(dataset = dict['test'], mode = "test")

Path dictonary created!



In [6]:
model = XGBRegressor(random_state=42, tree_method='gpu_hist', gpu_id = 0)
params = {'learning_rate': [0.1],
         'n_estimators': [10000],
         'max_depth': [1,2,3,4,5],
         'subsample':[0.4, 0.6, 0.8, 1],
         'min_child_weight': [3, 4, 5],
         'reg_alpha':[0, 0.2, 0.4,  1],
         'reg_lambda':[0,0.4, 0,0.6, 0.8,1],
         'gamma': [0.5, 1, 1.2, 1.5, 1.8, 2],
         'colsample_bytree': [0.6, 0.8, 1.0],
          "eta": [0.01, 0.04, 0.07, 0.1, 0.15, 0.2, 0.3]
         }

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=100, cv=3, n_jobs= -1, verbose=1, scoring='neg_root_mean_squared_error')
random_search.fit(train_X, train_y, early_stopping_rounds=10, eval_set=[(val_X, val_y)],verbose=100)

print("Train accuracy", random_search.score(train_X, train_y)*-1)
print("Test accuracy", random_search.score(val_X, val_y)*-1)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 16.5min finished


[0]	validation_0-rmse:7.00472
[100]	validation_0-rmse:0.73225
[200]	validation_0-rmse:0.72810
[300]	validation_0-rmse:0.72598
[400]	validation_0-rmse:0.72445
[500]	validation_0-rmse:0.72348
[600]	validation_0-rmse:0.72281
[700]	validation_0-rmse:0.72224
[800]	validation_0-rmse:0.72198
[803]	validation_0-rmse:0.72199
Train accuracy 0.710954369594449
Test accuracy 0.72197617341784


In [7]:
print(random_search.best_params_)

{'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 10000, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1.5, 'eta': 0.15, 'colsample_bytree': 0.8}


In [8]:
model2 = XGBRegressor(tree_method='gpu_hist', gpu_id = 0, random_state = 42, 
learning_rate=random_search.best_params_["learning_rate"], 
n_estimators=random_search.best_params_["n_estimators"],
max_depth=random_search.best_params_["max_depth"],
subsample=random_search.best_params_["subsample"],
min_child_weight=random_search.best_params_["min_child_weight"],
reg_alpha=random_search.best_params_["reg_alpha"],
reg_lambda=random_search.best_params_["reg_lambda"],
gamma=random_search.best_params_["gamma"],
colsample_bytree=random_search.best_params_["colsample_bytree"],
eta=random_search.best_params_["eta"])

model2.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.15, gamma=1.5,
             gpu_id=0, importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=10000, n_jobs=2, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=0, scale_pos_weight=1, subsample=0.8,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [9]:
submit_prediction = model2.predict(test_X)

In [10]:
submit(testset, submit_prediction)