In [1]:
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp
import pandas as pd
import numpy as np
import gc
import os

In [2]:
train = pd.read_pickle('./data/data.pkl')
train.reset_index(inplace = True)
X_test = train.loc[train.date_block_num == 34, ['date_block_num', 'item_cnt_month']].copy()
X_test.reset_index(inplace = True)
train = train.loc[train.date_block_num <= 33, ['date_block_num', 'item_cnt_month']].copy()
train.reset_index(inplace = True)

train_files = [x for x in os.listdir("./features_train") if "feature" in x]
for file in train_files:
    temp_csv = pd.read_csv("./features_train/" + file)
    train[file] = temp_csv["item_cnt_month"].copy()
    
test_files = [x for x in os.listdir("./features") if "feature" in x]
for file in test_files:
    temp_csv = pd.read_csv("./features/" + file)
    X_test[file] = temp_csv["item_cnt_month"].copy()
    
gc.collect()

0

In [3]:
split_train = train[train.date_block_num < 33]
split_valid = train[train.date_block_num == 33]

In [9]:
def val(model):
    mean_rmse = 0.0
    X = StandardScaler().fit_transform(split_train.drop(['date_block_num', 'item_cnt_month'], axis=1))
    Y = StandardScaler().fit_transform(split_train[['item_cnt_month']])
    model.fit(X, Y)
    X_new = StandardScaler().fit_transform(split_valid.drop(['date_block_num', 'item_cnt_month'], axis = 1))
    scaler = StandardScaler()
    Y_new = scaler.fit_transform(split_valid[['item_cnt_month']])
    res = mean_squared_error(
        Y_new,
        model.predict(X_new),
        squared = True
    ) * scaler.scale_
    return res

In [12]:
np.log(1e-4)

-9.210340371976182

In [13]:
#Search space
param_space = {'alpha': hp.uniform('alpha', -3, 0),
               'l1_ratio': hp.uniform('l1_ratio', -3, 0)}

#objective
def objective(params):   
    return val(ElasticNet(alpha=10**params['alpha'], l1_ratio=10**params['l1_ratio']))

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=50, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: cannot convert dictionary update sequence element #0 to a sequence



  0%|                                                                           | 0/50 [00:05<?, ?trial/s, best loss=?]


TypeError: cannot convert dictionary update sequence element #0 to a sequence

In [31]:
val(ElasticNet(alpha=0.2, l1_ratio=0.8))

array([0.70277981])

In [32]:
model = ElasticNet(alpha=0.2, l1_ratio=0.8)
model.fit(train.drop(['item_cnt_month'], axis=1), train['item_cnt_month'])
X_test["item_cnt_month"] = model.predict(X_test.drop(['item_cnt_month'], axis = 1))

In [33]:
sub = pd.read_csv("./data/sample_submission.csv")
sub.reset_index(inplace = False)
sub['item_cnt_month'] = X_test["item_cnt_month"]
sub.to_csv("./submission.csv", index=False)