In [1]:
import os
import pickle
import random as r
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV

In [2]:
# Base seed: used for the train/validation split, the baseline model and
# for seeding the `random` module before candidate seeds are drawn.
SEED = 0
# Directory the CSV files are read from (passed to get_data).
PATH = "../input/petfinder-pawpularity-score"

# Module-level scalers. split_and_transform() fits them on the training
# portion; sc_y is reused later to map predictions back to the original
# target scale.
sc_X = StandardScaler()
sc_y = StandardScaler()

In [3]:
def breaker(num=50, char="*") -> None:
    """Print a visual separator.

    The separator is `char` repeated `num` times, with a newline placed
    before and after it (in addition to the newline `print` itself adds).
    """
    rule = num * char
    print("\n" + rule + "\n")


def head(x, no_of_ele=5) -> None:
    """Print the leading `no_of_ele` items of a sliceable object `x`."""
    leading = x[:no_of_ele]
    print(leading)
    

def get_data(path: str, name: str) -> tuple:
    """Read a CSV file and split it into feature and target arrays.

    The first column is always skipped (presumably a row identifier --
    verify against the data files).

    Args:
        path: Directory containing the file.
        name: File name. Only the exact name "train.csv" is treated as
            labelled data.

    Returns:
        A `(features, targets)` pair of NumPy arrays. For "train.csv" the
        last column becomes `targets` and the columns between the first
        and the last become `features`. For any other file every column
        after the first is a feature and `targets` is None.
    """
    frame = pd.read_csv(os.path.join(path, name), engine="python")

    # Unlabelled file: nothing to split off as a target.
    if name != "train.csv":
        return frame.iloc[:, 1:].copy().values, None

    inputs = frame.iloc[:, 1:-1].copy().values
    labels = frame.iloc[:, -1].copy().values
    return inputs, labels


def split_and_transform(features: np.ndarray, targets: np.ndarray, seed: int) -> tuple:
    """Hold out one fold for validation and standardise both portions.

    Only the first of five shuffled KFold splits is used. The module-level
    scalers `sc_X` and `sc_y` are fitted on the training portion and then
    applied to the validation portion, so calling this function replaces
    whatever those scalers had previously learned.

    Args:
        features: Feature array indexed by row.
        targets: Target array aligned with `features`.
        seed: Random state for the shuffled KFold split.

    Returns:
        `(tr_features, va_features, tr_targets, va_targets)`, all scaled;
        the target arrays have a single column.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=seed)
    tr_idx, va_idx = next(iter(folds.split(features)))

    # Targets are reshaped into a single column before being scaled.
    tr_targets = targets[tr_idx].reshape(-1, 1)
    va_targets = targets[va_idx].reshape(-1, 1)

    # Fit on the training portion only, then reuse the fitted statistics
    # for the validation portion.
    tr_features = sc_X.fit_transform(features[tr_idx])
    tr_targets = sc_y.fit_transform(tr_targets)
    va_features = sc_X.transform(features[va_idx])
    va_targets = sc_y.transform(va_targets)

    return tr_features, va_features, tr_targets, va_targets


def save_model(model, filename: str) -> None:
    """Serialise `model` to `filename` with pickle.

    Args:
        model: Any picklable object.
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If `model` cannot be pickled.
    """
    # The context manager closes (and flushes) the handle even when
    # pickling fails, instead of leaving that to garbage collection.
    with open(filename, "wb") as fh:
        pickle.dump(model, fh)

In [4]:
# Load the labelled training table as (features, targets) arrays.
features, targets = get_data(PATH, "train.csv")
# Hold out one fold for validation; all four returned arrays are
# standardised, and the module-level scalers sc_X / sc_y are fitted here.
tr_features, va_features, tr_targets, va_targets = split_and_transform(features, targets, SEED)

In [5]:
# Baseline: XGBoost with default hyperparameters apart from the seed and
# the tree method.
# NOTE(review): tree_method="gpu_hist" presumably needs a CUDA-enabled
# XGBoost build, and newer XGBoost releases deprecate it in favour of
# device="cuda" with tree_method="hist" -- confirm against the installed
# version.
xgbr = XGBRegressor(tree_method="gpu_hist", random_state=SEED)
xgbr.fit(tr_features, tr_targets)
y_pred = xgbr.predict(va_features)

# va_targets was scaled by sc_y, so this RMSE is in standardised target
# units, not in the units of the raw target column.
breaker()
print("RMSE : {:.5f}".format(np.sqrt(mean_squared_error(y_true=va_targets, y_pred=y_pred))))
breaker()


**************************************************

RMSE : 1.02718

**************************************************



In [6]:
# Draw five reproducible candidate seeds in [0, 99].
r.seed(SEED)
seeds = [r.randint(0, 99) for _ in range(5)]

# 5 seeds x 10 tree counts x 6 learning rates = 300 candidates, each fitted
# on 5 folds (plus one final refit on the whole training portion).
# NOTE(review): random_state is searched like a hyperparameter; with no
# row/column subsampling configured on xgbr it presumably has little or no
# effect. It is kept in the grid because the final-fit cell reads
# best_params_["random_state"].
parameters = {
    "random_state": seeds,
    "n_estimators": list(range(100, 1001, 100)),
    "learning_rate": [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
}

# Select by RMSE, the metric this notebook reports. Without `scoring`,
# GridSearchCV falls back to the estimator's own score (R^2 for
# regressors), which can rank candidates differently once averaged over
# folds.
model = GridSearchCV(xgbr, parameters, cv=5, scoring="neg_root_mean_squared_error")
model.fit(tr_features, tr_targets)
print(model.best_params_)

{'learning_rate': 0.01, 'n_estimators': 300, 'random_state': 49}


In [7]:
# Refit a fresh regressor with the hyperparameters chosen by the grid
# search (random_state, learning_rate, n_estimators).
# NOTE(review): the grid search ran on an estimator created with
# tree_method="gpu_hist", while this refit uses XGBoost's default tree
# method -- confirm that the mismatch is intended.
xgbr = XGBRegressor(**model.best_params_)
xgbr.fit(tr_features, tr_targets)

y_pred = xgbr.predict(va_features)

# Both arrays are in standardised target units, so the RMSE is too.
breaker()
print("RMSE : {:.5f}".format(np.sqrt(mean_squared_error(va_targets, y_pred))))
breaker()

save_model(xgbr, "./xgb_model.pkl")
# predict() returns a 1-D array, but sc_y was fitted on a single-column 2-D
# array and current scikit-learn rejects 1-D input to inverse_transform.
# Reshape to a column for the scaler, then flatten so the saved file keeps
# one prediction per entry in a 1-D array.
np.save("./xgb_predictions.npy", sc_y.inverse_transform(y_pred.reshape(-1, 1)).ravel())


**************************************************

RMSE : 1.01290

**************************************************

