In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp
import pandas as pd
import numpy as np
import gc
import os

In [2]:
train = pd.read_pickle('./data/data.pkl')
train.reset_index(inplace = False)
X_test = train.loc[train.date_block_num == 34, ['date_block_num', 'item_cnt_month']].copy()
X_test.reset_index(inplace = False)
train = train.loc[train.date_block_num <= 33, ['date_block_num', 'item_cnt_month']].copy()
train.reset_index(inplace = False)

train_files = [x for x in os.listdir("./features_train") if "feature" in x]
for file in train_files:
    temp_csv = pd.read_csv("./features_train/" + file)
    train[file] = temp_csv["item_cnt_month"].copy()
    
test_files = [x for x in os.listdir("./features") if "feature" in x]
for file in test_files:
    temp_csv = pd.read_csv("./features/" + file)
    X_test[file] = temp_csv["item_cnt_month"].copy()
    
gc.collect()

0

In [3]:
split_train = train[train.date_block_num < 33]
split_valid = train[train.date_block_num == 33]

In [4]:
def val(model):
    mean_rmse = 0.0
    X = StandardScaler().fit_transform(split_train.drop(['date_block_num', 'item_cnt_month'], axis=1))
    Y = StandardScaler().fit_transform(split_train[['item_cnt_month']])
    model.fit(X, Y)
    X_new = StandardScaler().fit_transform(split_valid.drop(['date_block_num', 'item_cnt_month'], axis = 1))
    scaler = StandardScaler()
    Y_new = scaler.fit_transform(split_valid[['item_cnt_month']])
    res = mean_squared_error(
        Y_new,
        model.predict(X_new),
        squared = True
    ) * scaler.scale_
    return res

In [7]:
split_train.drop('date_block_num', axis = 1).corr()

Unnamed: 0,item_cnt_month,feature_catboost.csv,feature_catboost_na.csv,feature_catboost_na_new.csv,feature_catboost_new.csv,feature_catboost_only_cat.csv,feature_lgbm.csv,feature_lgbm_na.csv,feature_lgbm_na_tuned.csv,feature_lgbm_tuned.csv,feature_xgboost.csv,feature_xgboost_na.csv,feature_xgboost_na_tuned.csv,feature_xgboost_tuned.csv
item_cnt_month,1.0,0.006621,0.006645,0.006621,0.006874,0.008274,0.00724,0.007177,0.006801,0.006799,0.006992,0.006727,0.006565,0.006823
feature_catboost.csv,0.006621,1.0,0.993261,0.991632,0.992334,0.78134,0.982097,0.973459,0.974381,0.973853,0.980877,0.973363,0.974403,0.974738
feature_catboost_na.csv,0.006645,0.993261,1.0,0.991812,0.9914,0.777901,0.981479,0.972659,0.974222,0.973692,0.980223,0.972692,0.973996,0.97426
feature_catboost_na_new.csv,0.006621,0.991632,0.991812,1.0,0.987817,0.787397,0.983,0.974958,0.976474,0.976038,0.980376,0.97444,0.977811,0.977452
feature_catboost_new.csv,0.006874,0.992334,0.9914,0.987817,1.0,0.767936,0.978903,0.970041,0.969164,0.968507,0.978028,0.968723,0.968004,0.969455
feature_catboost_only_cat.csv,0.008274,0.78134,0.777901,0.787397,0.767936,1.0,0.790575,0.796846,0.806868,0.807045,0.79577,0.801617,0.793781,0.794807
feature_lgbm.csv,0.00724,0.982097,0.981479,0.983,0.978903,0.790575,1.0,0.992404,0.983278,0.982752,0.989379,0.983844,0.988223,0.989342
feature_lgbm_na.csv,0.007177,0.973459,0.972659,0.974958,0.970041,0.796846,0.992404,1.0,0.985017,0.984693,0.985871,0.985019,0.982782,0.984781
feature_lgbm_na_tuned.csv,0.006801,0.974381,0.974222,0.976474,0.969164,0.806868,0.983278,0.985017,1.0,0.999042,0.987844,0.990883,0.982687,0.982776
feature_lgbm_tuned.csv,0.006799,0.973853,0.973692,0.976038,0.968507,0.807045,0.982752,0.984693,0.999042,1.0,0.987302,0.990526,0.982251,0.982766


In [12]:
pca = PCA(whiten=True).fit(split_train.drop(['item_cnt_month', 'date_block_num'], axis = 1))

In [14]:
pca.transform(split_train)

ValueError: operands could not be broadcast together with shapes (2430227,15) (13,) 

In [9]:
#Search space
param_space = {'alpha': hp.uniform('alpha', np.log(1e-4), np.log(1)),
               'l1_ratio': hp.loguniform('l1_ratio', np.log(1e-4), np.log(1))}

#objective
def objective(params):   
    return val(ElasticNet(alpha=params['alpha'], l1_ratio=params['l1_ratio']), tol)[0]

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=50, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]

  positive)



  2%|▉                                                | 1/50 [00:31<25:34, 31.33s/trial, best loss: 1.2287627143813749]

  positive)



  4%|█▉                                               | 2/50 [01:04<25:26, 31.80s/trial, best loss: 1.2287627143813749]

  positive)



  6%|██▉                                              | 3/50 [02:09<33:41, 43.02s/trial, best loss: 1.2287627143813749]


KeyboardInterrupt: 

In [8]:
val(ElasticNet(alpha=0.2, l1_ratio=0.8))

0.7027797536503817

In [32]:
model = ElasticNet(alpha=0.2, l1_ratio=0.8)
model.fit(train.drop(['item_cnt_month'], axis=1), train['item_cnt_month'])
X_test["item_cnt_month"] = model.predict(X_test.drop(['item_cnt_month'], axis = 1))

In [33]:
sub = pd.read_csv("./data/sample_submission.csv")
sub.reset_index(inplace = False)
sub['item_cnt_month'] = X_test["item_cnt_month"]
sub.to_csv("./submission.csv", index=False)