## library import

In [None]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer ship the old names. Prefer the new name when this
# install provides it and fall back to the legacy one otherwise.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

## file_import

In [None]:
# Directory holding the competition CSV files; change this one constant to
# run the notebook against a different data location.
INPUT_DIR = '../input/m5-forecasting-accuracy'

# Calendar data.
df_cal = pd.read_csv(INPUT_DIR + '/calendar.csv')
# Historical daily unit sales per product and store [d_1 - d_1941]
# (labels used for the public leaderboard).
df_eval = pd.read_csv(INPUT_DIR + '/sales_train_evaluation.csv')
# sales_train_validation.csv (daily unit sales for [d_1 - d_1913]) is not loaded here.
# Prices of the products sold, per store and date.
df_price = pd.read_csv(INPUT_DIR + '/sell_prices.csv')
# Sample output.
df_sample_output = pd.read_csv(INPUT_DIR + '/sample_submission.csv')

## sample_output

In [None]:
df_sample_output.head()

In [None]:
df_sample_output.describe().T

## df_cal

In [None]:
df_cal.head()

In [None]:
# Holidays differ between religions, so this is unlikely to be a clean
# feature -> revisit / tune later?
holiday = [
    'NewYear',
    'OrthodoxChristmas',
    'MartinLutherKingDay',
    'SuperBowl',
    'PresidentsDay',
    'StPatricksDay',
    'Easter',
    'Cinco De Mayo',
    'IndependenceDay',
    'EidAlAdha',
    'Thanksgiving',
    'Christmas',
]
weekend = ['Saturday', 'Sunday']


def is_holiday(x):
    """Return 1 when ``x`` is one of the names listed in ``holiday``, else 0."""
    return int(x in holiday)


def is_weekend(x):
    """Return 1 when ``x`` is one of the names listed in ``weekend``, else 0."""
    return int(x in weekend)

In [None]:
# Flag each calendar row: a day counts as a holiday when either event slot
# names one of the listed holidays, and as a weekend by its weekday name.
holiday_1 = df_cal['event_name_1'].map(is_holiday)
holiday_2 = df_cal['event_name_2'].map(is_holiday)
df_cal = df_cal.assign(
    is_holiday_1=holiday_1,
    is_holiday_2=holiday_2,
    is_holiday=np.maximum(holiday_1, holiday_2),
    is_weekend=df_cal['weekday'].map(is_weekend),
)

In [None]:
df_cal.head()

In [None]:
# Remove the raw calendar columns; only the remaining columns are joined
# onto the sales frame later.
unused_cal_cols = [
    'weekday', 'wday', 'month', 'year',
    'event_name_1', 'event_type_1',
    'event_name_2', 'event_type_2',
]
df_cal = df_cal.drop(columns=unused_cal_cols)

## df_price

In [None]:
df_price.head()

In [None]:
df_price.describe()

## df_eval

In [None]:
df_eval.head()

In [None]:
# Names of the day columns to discard: d_1 through d_1851.
del_col = ['d_' + str(day) for day in range(1, 1852)]

In [None]:
# Discard the day columns listed in del_col; everything else is kept.
df_eval = df_eval.drop(columns=del_col)

## join eval and cal and price

In [None]:
# Reshape from one column per day to one row per (series, day): the day
# label goes into 'd' and the sold quantity into 'qty'.
id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
df_eval = pd.melt(df_eval, id_vars=id_cols, var_name='d', value_name='qty')
print(df_eval.shape)
df_eval.head()

In [None]:
# Attach the calendar columns to every sales row via the day label.
df_eval = df_eval.merge(df_cal, on='d', how='left')
df_eval.head()

In [None]:
# Attach the price rows matching each sales row's item, week key and store.
price_keys = ['item_id', 'wm_yr_wk', 'store_id']
df_eval = df_eval.merge(df_price, on=price_keys, how='left')
df_eval.head()

In [None]:
df_eval.shape

In [None]:
df_eval.tail()

In [None]:
df_eval.head()

In [None]:
df_eval.tail()

## テストデータの作成 (Creating the test data)

In [None]:
# Take the rows of a single day (d_1852) as the starting point for the test frame.
df_eval_test = df_eval[df_eval['d'] == 'd_1852']

In [None]:
df_eval_test.head()

In [None]:
# Keep only the id columns, the day label, the quantity and the price.
test_cols = ['id', 'store_id', 'item_id', 'dept_id', 'cat_id', 'state_id', 'd', 'qty', 'sell_price']
df_eval_test = df_eval_test.loc[:, test_cols]

In [None]:
df_eval_test.head()

In [None]:
df_eval_test.shape

In [None]:
# The test rows have no known quantity yet, so store a 0 placeholder.
# assign() returns a new frame, so nothing is written into a slice of
# df_eval (which is what triggers SettingWithCopyWarning), and the constant
# is set directly rather than derived from the 'd' strings.
df_eval_test = df_eval_test.assign(qty=0)

In [None]:
# Bind a second name to the single-day frame (plain assignment: same object,
# no copy) so it can be stacked repeatedly in the next cell.
tmp_df = df_eval_test

In [None]:
# Stack the single-day template into one long frame with a single concat.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the growing frame on every iteration.
# NOTE(review): the starting frame plus 28 copies gives 29 day-blocks, one
# more than the 28 forecast columns F1..F28 written later. A later cell
# drops one pivoted column by position; confirm how the two interact before
# changing this count.
df_eval_test = pd.concat([df_eval_test] + [tmp_df] * 28)

In [None]:
# The stacked copies repeat the template's index labels; replace them with a
# fresh 0..n-1 index (the old labels are discarded, not kept as a column).
df_eval_test = df_eval_test.reset_index(drop=True)

In [None]:
df_eval_test.head()

In [None]:
df_eval_test.tail()

In [None]:
# Original author's note: put the logic that fixes the dates here; for now
# they are all the same.
#
# Give every stacked block its own day label. The index was reset to
# 0..n-1, so (position // rows per day) is the block number, and the first
# block is labelled d_1942.
rows_per_day = df_eval_test['id'].nunique()  # one row per series id in each block
first_forecast_day = 1942
lst_d = [
    'd_' + str(pos // rows_per_day + first_forecast_day)
    for pos in df_eval_test.index
]

# Show only both ends instead of echoing the whole list into the output.
lst_d[:3], lst_d[-3:]

In [None]:
# Overwrite every row's day label with the per-block labels held in lst_d
# (one entry per row, in row order).
df_eval_test['d'] = lst_d

In [None]:
df_eval_test.head()

In [None]:
df_eval_test.tail()

In [None]:
df_eval_test.shape

In [None]:
# Attach the calendar columns to the test rows via their day label.
df_eval_test = df_eval_test.merge(df_cal, on='d', how='left')

In [None]:
# Attach the price matching each test row's item, week key and store. The
# frame already has a 'sell_price' column, so pandas suffixes the two price
# columns with _x (existing) and _y (from df_price).
df_eval_test = df_eval_test.merge(
    df_price,
    on=['item_id', 'wm_yr_wk', 'store_id'],
    how='left',
)

In [None]:
df_eval_test.head()

In [None]:
import gc
# The template alias is no longer needed: drop the name and run a garbage
# collection pass.
del tmp_df
gc.collect()

In [None]:
# One-hot encode the same categorical id columns in the training frame and
# in the test frame.
# dtype is pinned to uint8, the default before pandas 2.0. Since 2.0 the
# default is bool, and a frame mixing bool with numeric columns converts to
# an object array under np.array(...), which is how the feature matrix is
# built later.
onehot_cols = ['dept_id', 'cat_id', 'store_id', 'state_id']
df_eval = pd.get_dummies(data=df_eval, columns=onehot_cols, dtype='uint8')
df_eval_test = pd.get_dummies(data=df_eval_test, columns=onehot_cols, dtype='uint8')

In [None]:
df_eval.info()

In [None]:
df_eval_test.info()

In [None]:
df_eval_test.head(10).T

In [None]:
# Drop the snap_* columns from both frames. In the test frame also drop the
# sell_price_x column and rename sell_price_y back to 'sell_price'.
snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
df_eval_test = (
    df_eval_test
    .drop(columns=['sell_price_x'] + snap_cols)
    .rename(columns={'sell_price_y': 'sell_price'})
)
df_eval = df_eval.drop(columns=snap_cols)

In [None]:
df_eval.info()

In [None]:
df_eval_test.info()

In [None]:
from sklearn.model_selection import train_test_split

# Target variable.
target_col = 'qty'

# Columns excluded from the explanatory variables (identifiers and date keys).
exclude_cols = ['id', 'item_id', 'd', 'date', 'wm_yr_wk']

# Explanatory variables: every remaining column except the target itself.
# Leaving 'qty' in would let the model read the answer straight from its
# inputs during training, while the test frame only carries a placeholder
# qty of 0 at prediction time.
feature_cols = [
    col for col in df_eval.columns
    if col not in exclude_cols and col != target_col
]

# Convert to ndarrays.
y = np.array(df_eval[target_col])
X = np.array(df_eval[feature_cols])

# Split into training and hold-out data.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)



In [None]:
import lightgbm as lgb

# Wrap the arrays in LightGBM datasets. The hold-out set references the
# training set so it is constructed against the training set's feature bins.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.01,
    'bagging_fraction': 0.75,
    'bagging_freq': 10,
    'colsample_bytree': 0.75}

# lgb.train no longer accepts early_stopping_rounds / verbose_eval as of
# LightGBM 4.0; the equivalent behavior is requested through callbacks.
# log_evaluation only exists from 3.3 on, so fall back to its predecessor
# print_evaluation on older installs.
log_every = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2500,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        log_every(period=100),                   # print the metric every 100 rounds
    ],
)

In [None]:
# Predict for every test row, selecting the feature columns by name with the
# same feature_cols list that was used to build the training matrix.
pred = model.predict(df_eval_test[feature_cols])

In [None]:
pred

In [None]:
len(pred)

In [None]:
# Store the predictions next to the rows they were computed from.
df_eval_test['pred_qty'] = pred

In [None]:
df_eval_test

In [None]:
# Reshape the predictions to one row per id with one column per date.
predictions = (
    df_eval_test[['id', 'date', 'pred_qty']]
    .pivot(index='id', columns='date', values='pred_qty')
    .reset_index()
)
predictions

In [None]:
predictions.describe()

In [None]:
# Remove the column at position 1, i.e. the first column after 'id'.
# NOTE(review): this is positional. Presumably it removes the surplus 29th
# day-block created when the test frame was stacked; verify which label
# actually sits at position 1 before relying on it.
first_value_col = predictions.columns[1]
predictions = predictions.drop(columns=first_value_col)
predictions

In [None]:
# Rename the value columns to F1..F28.
forecast_cols = ['F' + str(step) for step in range(1, 29)]
predictions.columns = ['id'] + forecast_cols
predictions

In [None]:
# Rows for days d_1914 .. d_1941. Selecting by day label picks the same rows
# as slicing off the last 853,720 rows of the frame, without depending on
# the frame's exact length or row order.
val_days = ['d_' + str(day) for day in range(1914, 1942)]
df_val = df_eval[df_eval['d'].isin(val_days)]

In [None]:
# Reshape the quantities in df_val to one row per id with one column per date.
predictions_v = (
    df_val[['id', 'date', 'qty']]
    .pivot(index='id', columns='date', values='qty')
    .reset_index()
)
predictions_v

In [None]:
# Turn the '..._evaluation' ids into '..._validation' ids. The ids are taken
# from this frame itself, so the result does not depend on another frame
# having the same row order and index. regex=False makes it a plain
# substring replacement.
predictions_v['id'] = predictions_v['id'].str.replace('evaluation', 'validation', regex=False)
predictions_v.head()

In [None]:
# Rename the value columns to F1..F28.
validation_cols = ['F' + str(step) for step in range(1, 29)]
predictions_v.columns = ['id'] + validation_cols
predictions_v.head()

In [None]:
# Stack the two frames row-wise: the rows of `predictions` first, then the
# rows of `predictions_v`.
predictions_concat = pd.concat([predictions, predictions_v], axis=0)

In [None]:
predictions_concat

In [None]:
# Write the combined frame to submission.csv without the index column.
predictions_concat.to_csv('submission.csv', index=False)