# Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules
# (e.g. the project's `src` package) are re-imported before each cell runs,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import gc
import pandas as pd
import numpy as np

from src import CTX, SEED, FOLDERS
from src.data import io

# Load

In [10]:
def _load_processed(name, placeholder):
    """Load one artefact from the processed folder under its CTX-prefixed name.

    `placeholder` is forwarded untouched as the third argument of
    `io.load_data`.
    NOTE(review): presumably an empty object of the expected return type --
    confirm against `src.data.io`.
    """
    return io.load_data(FOLDERS.PROCESSED, CTX + name, placeholder)


# Same artefacts, same order as they were written by the preprocessing step.
X_train = _load_processed('X_train', np.empty(0))
X_test = _load_processed('X_test', np.empty(0))
y_train = _load_processed('y_train', np.empty(0))
dates_train = _load_processed('dates_train', pd.DataFrame())
predictions = _load_processed('predictions', pd.DataFrame())

Loading ../data/processed/CC-Labs-hv_X_train.h5
Loading ../data/processed/CC-Labs-hv_X_test.h5
Loading ../data/processed/CC-Labs-hv_y_train.h5
Loading ../data/processed/CC-Labs-hv_dates_train.h5
Loading ../data/processed/CC-Labs-hv_predictions.h5


In [17]:
# Sanity check: print dtype(s) followed by shape for every loaded object,
# in load order. The arrays expose a single `.dtype`, the frames expose
# per-column `.dtypes`.
for _array in (X_train, X_test, y_train):
    print(_array.dtype)
    print(_array.shape)

for _frame in (dates_train, predictions):
    print(_frame.dtypes)
    print(_frame.shape)

float32
(675137, 150)
float32
(35700, 150)
float32
(675137,)
date_block_num    int32
dtype: object
(675137, 1)
shop_id    int32
item_id    int32
dtype: object
(35700, 2)


In [19]:
# Largest `date_block_num` present in the training rows (shown as cell output).
max_train_date_block_num = dates_train['date_block_num'].max()
max_train_date_block_num

33

# Train

In [20]:
def gen_time_split(data, n_splits):
    """Yield expanding-window (train, validation) splits, newest block first.

    Split ``i`` (0-based) validates on the rows whose ``date_block_num`` equals
    the largest block number found in ``data`` minus ``i``, and trains on every
    row belonging to a strictly earlier block.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date_block_num`` column, one row per training sample,
        aligned row-for-row with the feature matrix passed to the estimator.
    n_splits : int
        Number of (train, validation) pairs to generate.

    Yields
    ------
    tuple of numpy.ndarray
        ``(train_positions, vali_positions)`` as 0-based *row positions*.
        scikit-learn indexes ``X``/``y`` positionally, so positions are
        returned rather than index labels; the two only coincide when
        ``data`` carries a default 0..n-1 index.
    """
    block_nums = data['date_block_num'].values
    # Derive the newest block from `data` itself instead of relying on a
    # notebook-level global, so the function works on any frame passed in.
    last_block_num = data['date_block_num'].max()

    for i in range(n_splits):
        vali_block_num = last_block_num - i
        train_positions = np.flatnonzero(block_nums < vali_block_num)
        vali_positions = np.flatnonzero(block_nums == vali_block_num)
        yield train_positions, vali_positions

In [24]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def grid_search(est, param_grid, filename, n_splits=3):
    """Grid-search `est` over `param_grid` using time-ordered validation folds.

    The folds come from `gen_time_split(dates_train, n_splits)` and the search
    is fitted on the notebook-level `X_train` / `y_train`. Candidates are
    ranked by negative mean squared error; with `refit=True` the best
    candidate is refitted by `GridSearchCV` before being saved and returned.

    Parameters
    ----------
    est : estimator
        Unfitted scikit-learn compatible estimator.
    param_grid : dict or list of dict
        Parameter grid handed to `GridSearchCV`.
    filename : str
        Name passed to `save_model` for persisting the best estimator.
    n_splits : int, default 3
        Number of time-based folds to evaluate each candidate on.

    Returns
    -------
    estimator
        `search_est.best_estimator_`.
    """
    search_est = GridSearchCV(
        est,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=gen_time_split(dates_train, n_splits),
        refit=True,
        return_train_score=True,
        n_jobs=-1,
        verbose=2)

    search_est.fit(X_train, y_train)

    print(search_est.cv_results_)
    print(search_est.best_score_)
    print(search_est.best_params_)

    # NOTE(review): `save_model` is not defined or imported in the visible
    # notebook -- confirm it is in scope, otherwise this raises NameError only
    # after the (expensive) search has finished.
    save_model(filename, search_est.best_estimator_)

    return search_est.best_estimator_

## XGBoost

In [25]:
import xgboost as xgb

# Base regressor whose hyper-parameters are tuned by the grid search.
# NOTE(review): both this estimator and the GridSearchCV inside `grid_search`
# request n_jobs=-1; that may oversubscribe the CPU -- confirm it is intended.
xgb_base_est = xgb.XGBRegressor(
    objective='reg:linear',
    n_jobs=-1,
    silent=0,
    random_state=SEED)

# 3 x 3 x 3 = 27 candidate configurations.
xgb_param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.03, 0.1],
    'max_depth': [5, 6, 7],
}

xgb_search_est = grid_search(xgb_base_est, xgb_param_grid, CTX + 'xgb_search_est')

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Predict on X_test with the best estimator returned by the grid search,
# then hand the predictions to `score` under the label 'XGBoost'.
# NOTE(review): `score` and `y_test` are not defined or loaded anywhere in the
# visible notebook (only X_test is loaded) -- confirm where they come from,
# otherwise this cell raises NameError on a fresh kernel.
y_pred = xgb_search_est.predict(X_test)
score('XGBoost', y_test, y_pred)

In [None]:
# Bar chart of the fitted model's feature importances, one bar per entry of
# `mapper.transformed_names_`, with x tick labels rotated 90 degrees.
# NOTE(review): neither `plt` (presumably matplotlib.pyplot) nor `mapper` is
# imported/defined in the visible notebook -- confirm they are in scope before
# relying on Restart & Run All.
plt.figure(102, figsize=(18,9))
plt.bar(mapper.transformed_names_, xgb_search_est.feature_importances_)
plt.xticks(rotation=90)
plt.show()