In [1]:
import catboost as cbt
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_log_error
from gensim.models import word2vec
import jpholiday
import swifter
import datetime
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
%matplotlib inline
from utils import StratifiedGroupKFold
# Let pandas print up to 1000 rows before truncating displayed frames.
pd.set_option('display.max_rows', 1000)

In [2]:
# The features artifact is unpacked as a 3-tuple: (train, test, groups).
features_path = '../output/features.joblib'
train, test, groups = joblib.load(features_path)

In [3]:
# The sales columns are removed from the feature matrices of both splits;
# Global_Sales is the one used as the training target below.
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

X_train = train.drop(columns=sales_cols)
X_test = test.drop(columns=sales_cols).copy()

# The model is fit on log1p(Global_Sales); predictions are mapped back with
# expm1 before scoring.
y_train = np.log1p(train['Global_Sales'])

In [5]:
# Ordinal-encode the categorical columns on train and test stacked together,
# so both splits share a single category -> code mapping.
cat_cols = [
    'Platform', 'Genre', 'Rating', 'Genre_Platform', 'Genre_Rating',
    'Genre_Bin_Year', 'Platform_Rating', 'Platform_Bin_Year',
    'Rating_Bin_Year',
]

# Capture the split boundary before X_train is rebound below, so the test
# slice does not depend on the order of the two final assignments.
n_train = len(X_train)

oe = OrdinalEncoder()
whole = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# Missing values are replaced by the literal 'nan_value' so they are encoded
# as a category of their own.
whole[cat_cols] = oe.fit_transform(whole[cat_cols].fillna('nan_value'))

# Split the stacked frame back into its train / test parts.
X_train = whole.iloc[:n_train, :].reset_index(drop=True)
X_test = whole.iloc[n_train:, :].reset_index(drop=True)

In [6]:
# Bin the continuous (log-scale) target into 10 equal-width intervals and keep
# only the integer bin codes; these serve as the stratification labels for CV.
stratified_y = pd.cut(y_train, bins=10, labels=False)

In [7]:
# Cross-validated XGBoost regression on the log1p-transformed target.
# Folds are produced by StratifiedGroupKFold, stratified on the binned target
# (`stratified_y`) with `groups` passed as the grouping key.
N_SPLITS = 5  # single source of truth: used for the splitter AND the test average

kf = StratifiedGroupKFold(n_splits=N_SPLITS, random_state=0)
oof = np.zeros(X_train.shape[0])    # out-of-fold predictions, log1p scale
y_pred = np.zeros(X_test.shape[0])  # test predictions averaged over folds, log1p scale
models = []                         # one fitted model per fold

# Hyper-parameters are the same for every fold, so build the dict once.
params = {
    'objective': 'reg:squarederror',
    # Deliberately huge: training is ended by early stopping, not by this cap.
    'n_estimators': 10000000,
    'max_depth': 4,
    'learning_rate': 0.01,
    'verbosity': 1,
    'colsample_bytree': 0.4,
    'eval_metric': 'rmse',
    'random_state': 0,
    'tree_method': 'gpu_hist',
    'gpu_id': 0
}

for i, (train_index, valid_index) in enumerate(kf.split(X_train, y=stratified_y, groups=groups)):
    print(f'Start {i+1} fold')
    print('-' * 20)
    X_tr, y_tr = X_train.iloc[train_index, :], y_train.iloc[train_index]
    X_val, y_val = X_train.iloc[valid_index, :], y_train.iloc[valid_index]

    model = xgb.XGBRegressor(**params)
    # Stop once validation RMSE has not improved for 1000 rounds; log every 1000.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=1000, verbose=1000)
    models.append(model)

    oof[valid_index] = model.predict(X_val)
    # The test frame is only read here, so no per-fold copy is needed.
    y_pred += model.predict(X_test) / N_SPLITS

Start 1 fold
--------------------
[0]	validation_0-rmse:2.70856
Will train until validation_0-rmse hasn't improved in 1000 rounds.
[1000]	validation_0-rmse:0.90935
Stopping. Best iteration:
[483]	validation_0-rmse:0.86688

Start 2 fold
--------------------
[0]	validation_0-rmse:2.71948
Will train until validation_0-rmse hasn't improved in 1000 rounds.
[1000]	validation_0-rmse:0.85269
[2000]	validation_0-rmse:0.84542
[3000]	validation_0-rmse:0.84013
[4000]	validation_0-rmse:0.83552
[5000]	validation_0-rmse:0.83341
[6000]	validation_0-rmse:0.83241
[7000]	validation_0-rmse:0.83115
[8000]	validation_0-rmse:0.83091
[9000]	validation_0-rmse:0.83027
[10000]	validation_0-rmse:0.83057
Stopping. Best iteration:
[9110]	validation_0-rmse:0.83001

Start 3 fold
--------------------
[0]	validation_0-rmse:2.71147
Will train until validation_0-rmse hasn't improved in 1000 rounds.
[1000]	validation_0-rmse:0.84780
[2000]	validation_0-rmse:0.83118
[3000]	validation_0-rmse:0.82441
[4000]	validation_0-rmse:

In [8]:
# NOTE: this cell rebinds y_train, oof and y_pred. Re-running it without
# re-running the cells above applies expm1 a second time.

# Floor negative log-scale predictions at zero, so the back-transformed
# values are never negative.
oof, y_pred = (np.where(arr < 0, 0, arr) for arr in (oof, y_pred))

# Undo the log1p transform on the target and on both prediction arrays.
y_train, oof, y_pred = (np.expm1(arr) for arr in (y_train, oof, y_pred))

# RMSLE of the out-of-fold predictions on the original scale.
rmsle = mean_squared_log_error(y_train, oof) ** .5
print(rmsle)

0.827261109202819


In [9]:
# Persist the out-of-fold and test predictions together as one
# (oof, y_pred) tuple.
xgb_outputs = (oof, y_pred)
joblib.dump(xgb_outputs, '../output/XGB_oof_preds.joblib')

['../output/XGB_oof_preds.joblib']