# Imports

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to project code are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import gc
import pandas as pd
import numpy as np

from src import CTX, SEED, FOLDERS
from src.data import io

# Load train data

In [None]:
def _load_processed(name, placeholder):
    """Load the artifact stored as ``CTX + name`` in the processed-data folder.

    ``placeholder`` is forwarded unchanged as the third argument of
    ``io.load_data`` (presumably a default / type template -- confirm against
    ``src.data.io``).
    """
    return io.load_data(FOLDERS.PROCESSED, CTX + name, placeholder)


X_train = _load_processed('X_train', np.empty(0))
y_train = _load_processed('y_train', np.empty(0))
dates_train = _load_processed('dates_train', pd.DataFrame())
feature_names = _load_processed('feature_names', pd.DataFrame())

In [None]:
# Sanity-check what was loaded: element type and shape of the two arrays,
# then per-column dtypes and shape of the two frames.
for loaded_array in (X_train, y_train):
    print(loaded_array.dtype)
    print(loaded_array.shape)

for loaded_frame in (dates_train, feature_names):
    print(loaded_frame.dtypes)
    print(loaded_frame.shape)

In [None]:
# Largest date_block_num value present in the training dates.
max_train_date_block_num = dates_train['date_block_num'].max()
max_train_date_block_num

# Train

In [None]:
def save_and_score_est(est, name):
    """Score a fitted estimator on the training data, then persist it.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    name : identifier passed to both ``score`` and ``save_model``.
    """
    # Project helpers are imported lazily, at call time.
    from src.models.io import save_model
    from src.models import score

    # Predictions are made on the training matrix itself, so the reported
    # score is a training-set score.
    train_predictions = est.predict(X_train)
    score(name, y_train, train_predictions)

    save_model(FOLDERS.MODELS, name, est)

In [None]:
def gen_time_split(data, n_splits):
    """Yield time-ordered (train, validation) index pairs for cross-validation.

    Fold ``i`` (``0 <= i < n_splits``) validates on the rows whose
    ``date_block_num`` equals ``newest - i`` and trains on every row with a
    strictly smaller ``date_block_num``, where ``newest`` is the largest
    ``date_block_num`` found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date_block_num`` column, row-aligned with the feature
        matrix that will be split.
    n_splits : int
        Number of folds to generate.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Positional row indices ``(train_indices, vali_indices)``.
    """
    blocks = np.asarray(data['date_block_num'])
    # Derive the newest block from the argument itself instead of relying on
    # a notebook-level global that may be stale or undefined.
    newest_block = blocks.max()

    for i in range(n_splits):
        vali_block = newest_block - i
        # scikit-learn expects *positional* indices; flatnonzero on the boolean
        # mask gives them regardless of what the frame's index labels are.
        train_indices = np.flatnonzero(blocks < vali_block)
        vali_indices = np.flatnonzero(blocks == vali_block)
        yield (train_indices, vali_indices)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sys import maxsize

def _fit_accepts_verbose(est):
    """Return True when ``est.fit`` can be called with a ``verbose`` keyword."""
    from inspect import Parameter, signature

    try:
        fit_parameters = signature(est.fit).parameters.values()
    except (TypeError, ValueError):
        # Signature not introspectable: keep the historical behaviour.
        return True
    return any(
        p.name == 'verbose' or p.kind is Parameter.VAR_KEYWORD
        for p in fit_parameters)


def grid_search(est, param_grid, short_circuit=False):
    """Fit ``est`` on the training data, optionally tuning it by grid search.

    Parameters
    ----------
    est : estimator
        Unfitted scikit-learn compatible estimator.
    param_grid : dict
        Grid handed to ``GridSearchCV``. Not used when ``short_circuit`` is
        true.
    short_circuit : bool, default False
        When true, skip the search and fit ``est`` directly on
        ``X_train`` / ``y_train`` with its current parameters.

    Returns
    -------
    estimator
        ``est`` itself (fitted) when short-circuiting; otherwise the
        ``best_estimator_`` refitted by ``GridSearchCV``.
    """
    if short_circuit:
        # Only request verbose output from estimators whose fit() accepts it;
        # passing it unconditionally raises TypeError for e.g. LinearRegression.
        if _fit_accepts_verbose(est):
            est.fit(X_train, y_train, verbose=True)
        else:
            est.fit(X_train, y_train)
        return est

    search_est = GridSearchCV(
        est,
        param_grid,
        scoring='neg_mean_squared_error',
        # Three time-ordered folds; materialised because a generator can only
        # be consumed once.
        cv=list(gen_time_split(dates_train, 3)),
        refit=True,
        return_train_score=True,
        n_jobs=1,
        verbose=maxsize)

    search_est.fit(X_train, y_train)

    print(search_est.cv_results_)
    print(search_est.best_score_)
    print(search_est.best_params_)

    return search_est.best_estimator_

## XGBoost

In [None]:
import xgboost as xgb

model_name = CTX + 'xgb'

# Candidate values for the search; not consulted while short_circuit is True.
xgb_param_grid = {
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.03, 0.1, 0.3],
    'max_depth': [8, 9, 10],
}

base_est = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=9,
    objective='reg:linear',
    n_jobs=1,
    silent=False,
    random_state=SEED,
    verbose=True,
    tree_method='gpu_hist',
)

# short_circuit=True: fit base_est as configured above, skipping the search.
est = grid_search(base_est, xgb_param_grid, short_circuit=True)

In [None]:
# Report the training-set score of the current `est` and persist it under `model_name`.
save_and_score_est(est, model_name)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

plt.figure(102, figsize=(18,9))
plt.bar(feature_names.feature_name, est.feature_importances_)
plt.xticks(rotation=90)
plt.show()

## Neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam

def create_keras_model(lr, dropout_rate):
    """Build and compile the dense regression network.

    Architecture: 300 -> 300 -> dropout -> 100 sigmoid units, followed by a
    single linear output unit. The input width is taken from ``X_train``.

    Parameters
    ----------
    lr : learning rate given to the Adam optimizer.
    dropout_rate : rate of the single Dropout layer (seeded with ``SEED``).

    Returns
    -------
    The compiled ``Sequential`` model, using mean squared error as loss.
    """
    n_features = X_train.shape[1]
    network = Sequential([
        Dense(300, activation='sigmoid', input_shape=(n_features, )),
        Dense(300, activation='sigmoid'),
        Dropout(dropout_rate, seed=SEED),
        Dense(100, activation='sigmoid'),
        Dense(1),
    ])
    network.compile(optimizer=Adam(lr=lr), loss='mean_squared_error')
    return network

In [None]:
from keras.wrappers.scikit_learn import KerasRegressor

model_name = CTX + 'keras'

# Candidate values for the search; not consulted while short_circuit is True.
keras_param_grid = {
    'lr': [0.00003, 0.0001, 0.0003],
    'batch_size': [10000, 30000, 100000],
    'dropout_rate': [0.6, 0.7, 0.8],
}

base_est = KerasRegressor(
    build_fn=create_keras_model,
    lr=0.0003,
    dropout_rate=0.7,
    batch_size=10000,
    epochs=5,
    verbose=True,
)

# short_circuit=True: fit base_est as configured above, skipping the search.
est = grid_search(base_est, keras_param_grid, short_circuit=True)

In [None]:
# Report the training-set score of the current `est` and persist it under `model_name`.
save_and_score_est(est, model_name)

## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

model_name = CTX + 'linear_regression'

# `short_circuit` is passed explicitly (the call previously omitted it).
# With an empty grid, GridSearchCV evaluates a single candidate -- the
# estimator's own parameters -- on the time splits and then refits it on the
# full training data.
# `normalize=False` was the default and the argument was removed in
# scikit-learn 1.2, so it is omitted here.
est = grid_search(
    LinearRegression(n_jobs=-1),
    {},
    short_circuit=False)