In [1]:
import catboost as cbt
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_log_error
from gensim.models import word2vec
import jpholiday
import swifter
import datetime
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
%matplotlib inline
from utils import StratifiedGroupKFold
pd.options.display.max_rows = 1000

In [2]:
# Load the precomputed feature bundle: the train frame, the test frame and the
# `groups` object that is later passed as `groups=` to the CV splitter.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# artifacts that were produced by this project / a trusted source.
train, test, groups = joblib.load('../output/features.joblib')

In [3]:
# All sales columns are removed from the feature matrices; only Global_Sales
# is kept (log1p-transformed) as the regression target.
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

X_train = train.drop(columns=sales_cols)
y_train = np.log1p(train['Global_Sales'])
X_test = test.drop(columns=sales_cols).copy()

# Re-type pandas `category` columns as plain `object`; the column set is
# decided from the training frame and applied identically to the test frame.
category_cols = X_train.columns[X_train.dtypes == 'category']
as_object = {name: object for name in category_cols}
X_train = X_train.astype(as_object)
X_test = X_test.astype(as_object)

In [4]:
# Bin the continuous log-target into 10 equal-width intervals (integer codes)
# so it can be used as the stratification label when splitting folds.
stratified_y = pd.cut(y_train, bins=10, labels=False)

In [5]:
# Every object-dtype column is treated as categorical and cast to str in both
# frames (note that astype(str) also turns missing values into strings).
category_cols = X_train.columns[X_train.dtypes == 'object']
as_text = {name: str for name in category_cols}
X_train = X_train.astype(as_text)
X_test = X_test.astype(as_text)

In [6]:
# Show which columns are currently treated as categorical (object dtype, cast to str).
category_cols

Index(['Platform', 'Genre', 'Rating', 'Genre_Platform', 'Genre_Rating',
       'Genre_Bin_Year', 'Platform_Rating', 'Platform_Bin_Year',
       'Rating_Bin_Year'],
      dtype='object')

In [7]:
# Cross-validated CatBoost training.
#
# Produces:
#   oof    -- out-of-fold predictions for every training row (log1p scale)
#   y_pred -- test predictions averaged over all folds (log1p scale)
#   models -- the fitted model of each fold
#
# Folds are stratified on the binned target (`stratified_y`) and split by
# `groups`, so all rows of one group land in the same fold.
N_SPLITS = 5  # single source of truth: used by the splitter AND the fold average

kf = StratifiedGroupKFold(n_splits=N_SPLITS, random_state=0)
oof = np.zeros(len(X_train))
y_pred = np.zeros(len(X_test))
models = []

# Hyper-parameters are identical for every fold, so build them once.
params_cbt = {
    # Effectively unbounded: training is ended by early stopping in fit(),
    # which is why the "remaining" estimate in the log is meaningless.
    'iterations': 100000000,
    'depth': 3,
    'learning_rate': 0.01,
    'random_seed': 0,
    # Derived from the data instead of a hand-copied list so the two cannot
    # drift apart when the feature set changes.
    'cat_features': list(category_cols),
    'task_type': 'GPU',
}

for i, (train_index, valid_index) in enumerate(kf.split(X_train, y=stratified_y, groups=groups)):
    print(f'Start {i+1} fold')
    print('-' * 20)
    X_tr, y_tr = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val, y_val = X_train.iloc[valid_index], y_train.iloc[valid_index]

    model = cbt.CatBoostRegressor(**params_cbt)
    # The validation fold doubles as the early-stopping set.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=1000, verbose=1000)
    models.append(model)

    oof[valid_index] = model.predict(X_val)
    # Each fold contributes an equal share to the averaged test prediction;
    # predict() only reads X_test, so no per-fold copy is needed.
    y_pred += model.predict(X_test) / N_SPLITS

Start 1 fold
--------------------
0:	learn: 1.3071777	test: 1.2932342	best: 1.2932342 (0)	total: 11.2ms	remaining: 12d 23h 24m 2s
1000:	learn: 0.8973836	test: 0.9197817	best: 0.9197817 (1000)	total: 9.26s	remaining: 10d 16h 52m 51s
2000:	learn: 0.8586886	test: 0.8925105	best: 0.8925105 (2000)	total: 18.5s	remaining: 10d 16h 23m 32s
3000:	learn: 0.8382523	test: 0.8845174	best: 0.8844817 (2958)	total: 27.9s	remaining: 10d 18h 8m 34s
4000:	learn: 0.8218504	test: 0.8833666	best: 0.8823749 (3702)	total: 37s	remaining: 10d 17h 9m 13s
5000:	learn: 0.8082347	test: 0.8802155	best: 0.8802124 (4998)	total: 46.5s	remaining: 10d 18h 17m 45s
6000:	learn: 0.7995495	test: 0.8778724	best: 0.8778272 (5980)	total: 55.9s	remaining: 10d 18h 55m 18s
7000:	learn: 0.7909851	test: 0.8757755	best: 0.8757569 (6992)	total: 1m 5s	remaining: 10d 19h 47m 22s
8000:	learn: 0.7842468	test: 0.8771749	best: 0.8756579 (7067)	total: 1m 14s	remaining: 10d 19h 51m 24s
bestTest = 0.8756579381
bestIteration = 7067
Shrink model

Start 5 fold
--------------------
0:	learn: 1.3084992	test: 1.2880484	best: 1.2880484 (0)	total: 10ms	remaining: 11d 15h 5m 12s
1000:	learn: 0.9103877	test: 0.9005221	best: 0.9005221 (1000)	total: 9.51s	remaining: 10d 23h 53m 12s
2000:	learn: 0.8699864	test: 0.8724200	best: 0.8724200 (2000)	total: 19.2s	remaining: 11d 1h 51m 39s
3000:	learn: 0.8453335	test: 0.8605766	best: 0.8605766 (3000)	total: 28.6s	remaining: 11d 46m 15s
4000:	learn: 0.8280861	test: 0.8523308	best: 0.8523283 (3999)	total: 38.1s	remaining: 11d 25m 52s
5000:	learn: 0.8148940	test: 0.8486090	best: 0.8486028 (4995)	total: 47.6s	remaining: 11d 6m 39s
6000:	learn: 0.8053713	test: 0.8463832	best: 0.8463821 (5999)	total: 57.1s	remaining: 11d 7m 14s
7000:	learn: 0.8002615	test: 0.8450050	best: 0.8449477 (6903)	total: 1m 6s	remaining: 10d 22h 40m 55s
8000:	learn: 0.7953530	test: 0.8436999	best: 0.8436621 (7990)	total: 1m 15s	remaining: 10d 22h 49m 9s
9000:	learn: 0.7899492	test: 0.8419693	best: 0.8419297 (8981)	total: 1m 25s

In [8]:
# Floor negative log-scale predictions at 0, then invert the log1p transform
# so predictions are back on the original sales scale.
oof = np.expm1(np.where(oof < 0, 0, oof))
y_pred = np.expm1(np.where(y_pred < 0, 0, y_pred))

# WARNING: y_train, oof and y_pred are rebound to their original-scale values
# here, so this cell must be run exactly once after training; running it a
# second time would apply expm1 twice.
y_train = np.expm1(y_train)

# Out-of-fold RMSLE (square root of the mean squared log error).
cv_rmsle = mean_squared_log_error(y_train, oof) ** .5
print(cv_rmsle)

0.865501207218099


In [9]:
# Persist the out-of-fold and test predictions together as one (oof, y_pred) tuple.
joblib.dump((oof, y_pred), '../output/CBT_oof_preds.joblib')

['../output/CBT_oof_preds.joblib']