## Init

In [0]:
# Colab
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import gc
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from functools import reduce

from sklearn.metrics import mean_squared_error

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred)).round(3)

from sklearn.ensemble import RandomForestRegressor


import time
from itertools import product

from sklearn.preprocessing import LabelEncoder

import lightgbm
from sklearn.neighbors import NearestNeighbors
%cd /content/drive/My\ Drive/git/Predict-Future-Sales-Kaggle


/content/drive/My Drive/git/Predict-Future-Sales-Kaggle


### Load

In [0]:
train = pd.read_csv('./data/sales_train.csv')
shops = pd.read_csv('./data/shops.csv')
cats = pd.read_csv('./data/item_categories.csv')
items = pd.read_csv('./data/items.csv')
test = pd.read_csv('./data/test.csv')
# sample_submission = pd.read_csv('./data/sample_submission.csv')

### Preprocessing

In [0]:
# Outliers
train['item_price'] = np.where(train['item_price']>60000, 60000, train['item_price'])
train['item_cnt_day'] = np.where(train['item_cnt_day']>1000, 1000, train['item_cnt_day'])

# Errors
mode = train[(train['shop_id']==32)&(train['item_id']==2973)]['item_price'].mode()
train.loc[train['item_price']<0, 'item_price'] = mode

# Duplicates
train.loc[train['shop_id']==0, 'shop_id'] = 57
train.loc[train['shop_id']==1, 'shop_id'] = 58
train.loc[train['shop_id']==10, 'shop_id'] = 11

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11

In [0]:
# Shops
ts = time.time()
shops.loc[shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].apply(lambda x: x.split(' ')[0])
shops.loc[shops['city']=='!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops_in_city_cnt = shops['city_code'].value_counts().reset_index(name='total_shops_in_city_cnt')
shops = pd.merge(shops, shops_in_city_cnt, left_on='city_code', right_on='index', how='left')
shops = shops[['shop_id', 'city_code', 'total_shops_in_city_cnt']]
time.time() - ts

0.03698253631591797

In [0]:
# Cats
ts = time.time()
cats['split'] =\
    cats['item_category_name'].apply(lambda x: [w for w in x.split('-')])
cats['type'] = cats['split'].apply(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
cats['subtype'] = cats['split']\
    .apply(lambda x: x[1].strip() if len(x)>1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id', 'type_code', 'subtype_code']]
time.time() - ts

0.008412837982177734

In [0]:
# Items
items.drop('item_name', axis=1, inplace=True)

In [0]:
# Structure for train df
ts = time.time()
df = []
keys = ['date_block_num', 'shop_id', 'item_id']

for i in range(34):
    sales = train[train.date_block_num==i]
    df.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
df = pd.DataFrame(np.vstack(df), columns=keys)
df['date_block_num'] = df['date_block_num'].astype(np.int8)
df['shop_id'] = df['shop_id'].astype(np.int8)
df['item_id'] = df['item_id'].astype(np.int16)
df.sort_values(keys, inplace=True)
time.time() - ts

13.292612314224243

In [0]:
# Target creating
ts = time.time()
gr = train.groupby(keys).agg({'item_cnt_day': ['sum']})
gr.columns = ['item_cnt_month']
gr.reset_index(inplace=True)

df = pd.merge(df, gr, on=keys, how='left')
df['item_cnt_month'] = df['item_cnt_month']\
    .fillna(0)\
    .clip(0, 20)\
    .astype(np.float16)
time.time() - ts

5.9491801261901855

6.631270170211792

In [0]:
# Train & Test concat
ts = time.time()
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
df = pd.concat([df, test], ignore_index=True, sort=False, keys=keys)
df.fillna(0, inplace=True)
time.time() - ts

0.17386746406555176

0.1729896068572998

In [0]:
# Add attributes
ts = time.time()
df = pd.merge(df, shops, on='shop_id', how='left')
df = pd.merge(df, items, on='item_id', how='left')
df = pd.merge(df, cats, on='item_category_id', how='left')
time.time() - ts

4.4410436153411865

4.838852882385254

In [0]:
# Info about item cnt
ts = time.time()
dim = ['date_block_num', 'shop_id', 'item_id']
gr = train.groupby(dim).agg({'item_cnt_day': ['min', 'max', 'std']}).round(2)
gr.columns = ['item_cnt_month_min', 'item_cnt_month_max', 'item_cnt_month_std']
gr.reset_index(inplace=True)
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

6.692381858825684

6.861055135726929

In [0]:
# In how many shops the item was selling
ts = time.time()
dim = ['date_block_num', 'item_id']
gr = df.groupby(dim + ['shop_id']).size().reset_index()\
    .groupby(dim).size().reset_index(name='shops_month_cnt')
gr['shops_month_cnt'] = gr['shops_month_cnt'].astype(np.int32)
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

9.131294250488281

9.816214084625244

In [0]:
# How many items were in the shop
ts = time.time()
dim = ['date_block_num', 'shop_id']
gr = df.groupby(dim + ['item_id']).size().reset_index()\
    .groupby(dim).size().reset_index(name='items_in_sh_month_cnt')
gr['items_in_sh_month_cnt'] = gr['items_in_sh_month_cnt'].astype(np.int32)
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

7.329606771469116

8.75346302986145

In [0]:
# How many items of this type were selling in this shop
ts = time.time()
dim = ['date_block_num', 'item_id', 'type_code']
gr = df.groupby(dim + ['shop_id']).size().reset_index()\
    .groupby(dim).size().reset_index(name='items_type_month_cnt')
gr['items_type_month_cnt'] = gr['items_type_month_cnt'].astype(np.int32)
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

10.6351957321167

10.855282306671143

In [0]:
# How many items of this type were selling in this city
ts = time.time()
dim = ['date_block_num', 'item_id', 'city_code']
gr = df.groupby(dim + ['shop_id']).size().reset_index()\
    .groupby(dim).size().reset_index(name='items_city_month_cnt')
gr['items_city_month_cnt'] = gr['items_city_month_cnt'].astype(np.int32)
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

17.012718439102173

16.887547492980957

In [0]:
# How many days product has been sold
ts = time.time()
dim = ['date_block_num', 'shop_id', 'item_id']
gr = train.groupby(dim).size().reset_index(name='days_with_sales')
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

4.98628830909729

5.68704080581665

In [0]:
# Count and variance of price
ts = time.time()

dim = ['date_block_num', 'item_id']

gr = train.groupby(['date_block_num', 'item_id', 'item_price'])\
                    .size().reset_index()\
                    .groupby(['date_block_num', 'item_id']).agg({'item_price': {'count', 'std', 'mean'}}).round(2)
gr.columns = ['price_cnt', 'price_std', 'price_mean']
df = pd.merge(df, gr, on=dim, how='left')
time.time() - ts

2.9128267765045166

3.033938407897949

### Main Checkpoint

In [0]:
# Write
ts = time.time()
df.fillna(0, inplace=True)
df.to_pickle('output/df_month_base.pkl')
time.time() - ts

9.422922849655151

9.916898488998413

In [0]:
# Read
ts = time.time()
df = pd.read_pickle('output/df_month_base.pkl')
time.time() - ts

7.862703323364258

## F1

In [0]:
def time_agg1(df, dm, feature, cols, pref):

  df_vars = cols +\
            [feature + pref + '_12m'] +\
            [feature + pref + '_9m'] +\
            [feature + pref + '_6m'] +\
            [feature + pref + '_3m'] +\
            [feature + pref + '_lag_1m'] +\
            [feature + pref + '_lag_2m'] +\
            [feature + pref + '_lag_3m'] +\
            [feature + pref + '_lag_6m'] +\
            [feature + pref + '_lag_9m'] +\
            [feature + pref + '_lag_12m']

  df_lags_1 = pd.DataFrame(columns=df_vars)

  df_tmp = df.copy()
  df_tmp['asofblock'] = dm
  df_tmp['delta'] = df_tmp['asofblock'] - df_tmp['date_block_num']
  
  # Month since last purchase
  # df_last = df_tmp[df_tmp['delta']>0].groupby(cols)['date_block_num'].min().reset_index()
  # df_last.columns = cols + ['last_purchase_m_1']
  
  # Lag
  df_lag_1 = df_tmp[df_tmp['delta']==1]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_lag_2 = df_tmp[df_tmp['delta']==2]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_lag_3 = df_tmp[df_tmp['delta']==3]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_lag_6 = df_tmp[df_tmp['delta']==6]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_lag_9 = df_tmp[df_tmp['delta']==9]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_lag_12 = df_tmp[df_tmp['delta']==12]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  
  # Mean
  df_avg_3 = df_tmp[df_tmp['delta'].isin([1, 2, 3])]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_avg_6 = df_tmp[df_tmp['delta'].isin([1, 2, 3, 4, 5, 6])]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_avg_9 = df_tmp[df_tmp['delta'].isin([1, 2, 3, 4, 5, 6, 7, 8, 9])]\
      .groupby(cols, as_index=False)[feature].mean().round(2)
  df_avg_12 = df_tmp[df_tmp['delta'].isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]\
      .groupby(cols, as_index=False)[feature].mean().round(2)


  df_lag_1[feature] = df_lag_1[feature].astype(np.float16)
  df_lag_2[feature] = df_lag_2[feature].astype(np.float16)
  df_lag_3[feature] = df_lag_3[feature].astype(np.float16)
  df_lag_6[feature] = df_lag_6[feature].astype(np.float16)
  df_lag_9[feature] = df_lag_9[feature].astype(np.float16)
  df_lag_12[feature] = df_lag_12[feature].astype(np.float16)

  df_avg_3[feature] = df_avg_3[feature].astype(np.float16)
  df_avg_6[feature] = df_avg_6[feature].astype(np.float16)
  df_avg_9[feature] = df_avg_9[feature].astype(np.float16)
  df_avg_12[feature] = df_avg_12[feature].astype(np.float16)


  df_lag_1.columns = cols + [feature + pref + '_lag_1m']
  df_lag_2.columns = cols + [feature + pref + '_lag_2m']
  df_lag_3.columns = cols + [feature + pref + '_lag_3m']
  df_lag_6.columns = cols + [feature + pref + '_lag_6m']
  df_lag_9.columns = cols + [feature + pref + '_lag_9m']
  df_lag_12.columns = cols + [feature + pref + '_lag_12m']

  df_avg_3.columns = cols + [feature + pref + '_3m']
  df_avg_6.columns = cols + [feature + pref + '_6m']
  df_avg_9.columns = cols + [feature + pref + '_9m']
  df_avg_12.columns = cols + [feature + pref + '_12m']

  dfs = [
    df_avg_12, 
    df_avg_9,
    df_avg_6,
    df_avg_3,
      
    df_lag_1,
    df_lag_2,
    df_lag_3,

    df_lag_6,
    df_lag_9,
    df_lag_12
    # ,df_last
  ]
  
  dff = reduce(lambda left, right: pd.merge(left, right, on=cols, how='left'), dfs)
  return dff

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def get_pc(df, n, feature, prefix):
  scl = StandardScaler()
  x = scl.fit_transform(df)

  pca = PCA(n_components = n)
  pcomponents = pca.fit_transform(x)

  print(sum(pca.explained_variance_ratio_))
  # return pcomponents
  return pd.DataFrame(data=pcomponents, columns=['pc_' + prefix + feature + str(x) for x in range(1, n+1)], index=None)

### D1

In [0]:
df = df[df['date_block_num']>=12]

dms = df['date_block_num'].unique()
dms.sort()

cols = ['asofblock', 'shop_id', 'item_id']
features = [
            'item_cnt_month', 
              'shops_month_cnt', 
              'items_in_sh_month_cnt', 
              'items_type_month_cnt', 
              'items_city_month_cnt',
              'days_with_sales',
              'price_cnt',
              'price_std',
              'price_mean'
              ]

In [0]:
for feature in features:
  print(feature, 'started')
  for dm in tqdm(dms): 
    df_tmp1 = time_agg1(df, dm, feature, cols=cols, pref='_1')

    if dm == 12:
      df_total = df_tmp1
    else:
      df_total = df_total.append(df_tmp1, sort=False, ignore_index=True)

  df_total.fillna(0, inplace=True)
  X_cols = [x for x in df_total.columns if x not in cols + ['item_cnt_month'] +  ['date_block_num']]
  pc = get_pc(df_total[X_cols], n=3, feature=feature, prefix='1_')
  df_out = pd.concat([df_total[cols].astype(np.int16), pc.astype(np.float16)], axis=1)
  df_out.to_pickle('output/pc_'+feature+'.pkl')


  0%|          | 0/23 [00:00<?, ?it/s][A

item_cnt_month started



  4%|▍         | 1/23 [00:00<00:20,  1.07it/s][A
  9%|▊         | 2/23 [00:03<00:30,  1.45s/it][A
 13%|█▎        | 3/23 [00:07<00:41,  2.06s/it][A
 17%|█▋        | 4/23 [00:11<00:51,  2.70s/it][A
 22%|██▏       | 5/23 [00:15<00:58,  3.24s/it][A
 26%|██▌       | 6/23 [00:20<01:02,  3.70s/it][A
 30%|███       | 7/23 [00:26<01:07,  4.24s/it][A
 35%|███▍      | 8/23 [00:31<01:09,  4.63s/it][A
 39%|███▉      | 9/23 [00:37<01:09,  4.99s/it][A
 43%|████▎     | 10/23 [00:43<01:10,  5.41s/it][A
 48%|████▊     | 11/23 [00:50<01:08,  5.73s/it][A
 52%|█████▏    | 12/23 [00:56<01:05,  5.99s/it][A
 57%|█████▋    | 13/23 [01:03<01:03,  6.32s/it][A
 61%|██████    | 14/23 [01:10<00:58,  6.50s/it][A
 65%|██████▌   | 15/23 [01:17<00:53,  6.65s/it][A
 70%|██████▉   | 16/23 [01:24<00:46,  6.71s/it][A
 74%|███████▍  | 17/23 [01:31<00:40,  6.83s/it][A
 78%|███████▊  | 18/23 [01:38<00:34,  6.81s/it][A
 83%|████████▎ | 19/23 [01:45<00:26,  6.72s/it][A
 87%|████████▋ | 20/23 [01:51<00:19,  6

0.801990022507162



  0%|          | 0/23 [00:00<?, ?it/s][A

shops_month_cnt started



  4%|▍         | 1/23 [00:00<00:19,  1.11it/s][A
  9%|▊         | 2/23 [00:03<00:29,  1.40s/it][A
 13%|█▎        | 3/23 [00:06<00:40,  2.01s/it][A
 17%|█▋        | 4/23 [00:11<00:50,  2.67s/it][A
 22%|██▏       | 5/23 [00:15<00:57,  3.21s/it][A
 26%|██▌       | 6/23 [00:20<01:02,  3.68s/it][A
 30%|███       | 7/23 [00:25<01:07,  4.20s/it][A
 35%|███▍      | 8/23 [00:31<01:09,  4.61s/it][A
 39%|███▉      | 9/23 [00:37<01:09,  4.94s/it][A
 43%|████▎     | 10/23 [00:43<01:09,  5.35s/it][A
 48%|████▊     | 11/23 [00:49<01:07,  5.62s/it][A
 52%|█████▏    | 12/23 [00:56<01:04,  5.90s/it][A
 57%|█████▋    | 13/23 [01:03<01:02,  6.25s/it][A
 61%|██████    | 14/23 [01:10<00:57,  6.42s/it][A
 65%|██████▌   | 15/23 [01:16<00:52,  6.53s/it][A
 70%|██████▉   | 16/23 [01:23<00:46,  6.64s/it][A
 74%|███████▍  | 17/23 [01:30<00:39,  6.64s/it][A
 78%|███████▊  | 18/23 [01:37<00:33,  6.66s/it][A
 83%|████████▎ | 19/23 [01:43<00:26,  6.68s/it][A
 87%|████████▋ | 20/23 [01:50<00:19,  6

0.6640553503308076



  0%|          | 0/23 [00:00<?, ?it/s][A

items_in_sh_month_cnt started



  4%|▍         | 1/23 [00:01<00:26,  1.20s/it][A
  9%|▊         | 2/23 [00:03<00:34,  1.63s/it][A
 13%|█▎        | 3/23 [00:07<00:43,  2.18s/it][A
 17%|█▋        | 4/23 [00:11<00:53,  2.80s/it][A
 22%|██▏       | 5/23 [00:16<00:59,  3.30s/it][A
 26%|██▌       | 6/23 [00:20<01:04,  3.78s/it][A
 30%|███       | 7/23 [00:26<01:08,  4.28s/it][A
 35%|███▍      | 8/23 [00:31<01:10,  4.67s/it][A
 39%|███▉      | 9/23 [00:37<01:10,  5.04s/it][A
 43%|████▎     | 10/23 [00:44<01:10,  5.40s/it][A
 48%|████▊     | 11/23 [00:50<01:08,  5.72s/it][A
 52%|█████▏    | 12/23 [00:57<01:06,  6.03s/it][A
 57%|█████▋    | 13/23 [01:04<01:03,  6.38s/it][A
 61%|██████    | 14/23 [01:11<00:59,  6.61s/it][A
 65%|██████▌   | 15/23 [01:18<00:54,  6.78s/it][A
 70%|██████▉   | 16/23 [01:25<00:48,  6.87s/it][A
 74%|███████▍  | 17/23 [01:32<00:41,  6.92s/it][A
 78%|███████▊  | 18/23 [01:39<00:34,  6.88s/it][A
 83%|████████▎ | 19/23 [01:46<00:27,  6.87s/it][A
 87%|████████▋ | 20/23 [01:53<00:20,  6

0.6686320079594956



  0%|          | 0/23 [00:00<?, ?it/s][A

items_type_month_cnt started



  4%|▍         | 1/23 [00:00<00:20,  1.08it/s][A
  9%|▊         | 2/23 [00:03<00:30,  1.44s/it][A
 13%|█▎        | 3/23 [00:07<00:40,  2.05s/it][A
 17%|█▋        | 4/23 [00:11<00:51,  2.70s/it][A
 22%|██▏       | 5/23 [00:15<00:58,  3.27s/it][A
 26%|██▌       | 6/23 [00:20<01:04,  3.80s/it][A
 30%|███       | 7/23 [00:26<01:09,  4.33s/it][A
 35%|███▍      | 8/23 [00:32<01:11,  4.78s/it][A
 39%|███▉      | 9/23 [00:38<01:11,  5.10s/it][A
 43%|████▎     | 10/23 [00:44<01:10,  5.45s/it][A
 48%|████▊     | 11/23 [00:50<01:09,  5.78s/it][A
 52%|█████▏    | 12/23 [00:57<01:06,  6.09s/it][A
 57%|█████▋    | 13/23 [01:04<01:03,  6.36s/it][A
 61%|██████    | 14/23 [01:11<00:59,  6.60s/it][A
 65%|██████▌   | 15/23 [01:18<00:53,  6.71s/it][A
 70%|██████▉   | 16/23 [01:26<00:47,  6.84s/it][A
 74%|███████▍  | 17/23 [01:33<00:41,  6.89s/it][A
 78%|███████▊  | 18/23 [01:39<00:34,  6.89s/it][A
 83%|████████▎ | 19/23 [01:46<00:27,  6.85s/it][A
 87%|████████▋ | 20/23 [01:53<00:20,  6

0.6640553503308011



  0%|          | 0/23 [00:00<?, ?it/s][A

items_city_month_cnt started



  4%|▍         | 1/23 [00:00<00:19,  1.14it/s][A
  9%|▊         | 2/23 [00:03<00:29,  1.40s/it][A
 13%|█▎        | 3/23 [00:06<00:39,  1.99s/it][A
 17%|█▋        | 4/23 [00:11<00:50,  2.66s/it][A
 22%|██▏       | 5/23 [00:15<00:57,  3.19s/it][A
 26%|██▌       | 6/23 [00:20<01:02,  3.69s/it][A
 30%|███       | 7/23 [00:25<01:06,  4.16s/it][A
 35%|███▍      | 8/23 [00:31<01:08,  4.59s/it][A
 39%|███▉      | 9/23 [00:37<01:09,  4.96s/it][A
 43%|████▎     | 10/23 [00:43<01:09,  5.33s/it][A
 48%|████▊     | 11/23 [00:49<01:07,  5.66s/it][A
 52%|█████▏    | 12/23 [00:56<01:05,  5.94s/it][A
 57%|█████▋    | 13/23 [01:03<01:03,  6.31s/it][A
 61%|██████    | 14/23 [01:10<00:59,  6.60s/it][A
 65%|██████▌   | 15/23 [01:17<00:53,  6.72s/it][A
 70%|██████▉   | 16/23 [01:24<00:47,  6.81s/it][A
 74%|███████▍  | 17/23 [01:31<00:41,  6.85s/it][A
 78%|███████▊  | 18/23 [01:38<00:34,  6.81s/it][A
 83%|████████▎ | 19/23 [01:45<00:27,  6.78s/it][A
 87%|████████▋ | 20/23 [01:51<00:20,  6

0.8134303567760267



  0%|          | 0/23 [00:00<?, ?it/s][A

days_with_sales started



  4%|▍         | 1/23 [00:00<00:20,  1.07it/s][A
  9%|▊         | 2/23 [00:03<00:30,  1.46s/it][A
 13%|█▎        | 3/23 [00:07<00:41,  2.08s/it][A
 17%|█▋        | 4/23 [00:11<00:51,  2.71s/it][A
 22%|██▏       | 5/23 [00:15<00:59,  3.28s/it][A
 26%|██▌       | 6/23 [00:21<01:04,  3.82s/it][A
 30%|███       | 7/23 [00:26<01:09,  4.32s/it][A
 35%|███▍      | 8/23 [00:31<01:09,  4.66s/it][A
 39%|███▉      | 9/23 [00:37<01:10,  5.04s/it][A
 43%|████▎     | 10/23 [00:44<01:10,  5.43s/it][A
 48%|████▊     | 11/23 [00:50<01:09,  5.77s/it][A
 52%|█████▏    | 12/23 [00:57<01:06,  6.05s/it][A
 57%|█████▋    | 13/23 [01:04<01:04,  6.41s/it][A
 61%|██████    | 14/23 [01:11<00:59,  6.63s/it][A
 65%|██████▌   | 15/23 [01:19<00:54,  6.78s/it][A
 70%|██████▉   | 16/23 [01:26<00:48,  6.87s/it][A
 74%|███████▍  | 17/23 [01:33<00:41,  6.89s/it][A
 78%|███████▊  | 18/23 [01:40<00:34,  6.92s/it][A
 83%|████████▎ | 19/23 [01:46<00:27,  6.89s/it][A
 87%|████████▋ | 20/23 [01:53<00:20,  6

0.8251324877064037



  0%|          | 0/23 [00:00<?, ?it/s][A

price_cnt started



  4%|▍         | 1/23 [00:00<00:20,  1.07it/s][A
  9%|▊         | 2/23 [00:03<00:30,  1.46s/it][A
 13%|█▎        | 3/23 [00:07<00:41,  2.07s/it][A
 17%|█▋        | 4/23 [00:11<00:52,  2.75s/it][A
 22%|██▏       | 5/23 [00:15<00:58,  3.27s/it][A
 26%|██▌       | 6/23 [00:20<01:04,  3.77s/it][A
 30%|███       | 7/23 [00:26<01:08,  4.29s/it][A
 35%|███▍      | 8/23 [00:32<01:10,  4.70s/it][A
 39%|███▉      | 9/23 [00:37<01:11,  5.08s/it][A
 43%|████▎     | 10/23 [00:44<01:11,  5.49s/it][A
 48%|████▊     | 11/23 [00:51<01:10,  5.84s/it][A
 52%|█████▏    | 12/23 [00:58<01:07,  6.17s/it][A
 57%|█████▋    | 13/23 [01:05<01:05,  6.50s/it][A
 61%|██████    | 14/23 [01:12<01:00,  6.71s/it][A
 65%|██████▌   | 15/23 [01:19<00:54,  6.85s/it][A
 70%|██████▉   | 16/23 [01:29<00:53,  7.70s/it][A
 74%|███████▍  | 17/23 [01:37<00:46,  7.71s/it][A
 78%|███████▊  | 18/23 [01:44<00:37,  7.52s/it][A
 83%|████████▎ | 19/23 [01:51<00:29,  7.35s/it][A
 87%|████████▋ | 20/23 [01:58<00:21,  7

0.7742480838027975



  0%|          | 0/23 [00:00<?, ?it/s][A

price_std started



  4%|▍         | 1/23 [00:00<00:20,  1.08it/s][A
  9%|▊         | 2/23 [00:03<00:30,  1.44s/it][A
 13%|█▎        | 3/23 [00:07<00:41,  2.06s/it][A
 17%|█▋        | 4/23 [00:11<00:51,  2.72s/it][A
 22%|██▏       | 5/23 [00:15<00:58,  3.24s/it][A
 26%|██▌       | 6/23 [00:20<01:03,  3.73s/it][A
 30%|███       | 7/23 [00:26<01:07,  4.22s/it][A
 35%|███▍      | 8/23 [00:31<01:09,  4.63s/it][A
 39%|███▉      | 9/23 [00:37<01:09,  4.99s/it][A
 43%|████▎     | 10/23 [00:43<01:09,  5.38s/it][A
 48%|████▊     | 11/23 [00:50<01:08,  5.74s/it][A
 52%|█████▏    | 12/23 [00:56<01:06,  6.02s/it][A
 57%|█████▋    | 13/23 [01:04<01:03,  6.37s/it][A
 61%|██████    | 14/23 [01:11<00:59,  6.62s/it][A
 65%|██████▌   | 15/23 [01:18<00:54,  6.77s/it][A
 70%|██████▉   | 16/23 [01:25<00:47,  6.85s/it][A
 74%|███████▍  | 17/23 [01:32<00:41,  6.89s/it][A
 78%|███████▊  | 18/23 [01:39<00:34,  6.93s/it][A
 83%|████████▎ | 19/23 [01:46<00:27,  6.93s/it][A
 87%|████████▋ | 20/23 [01:53<00:20,  6

0.8091241129866559



  0%|          | 0/23 [00:00<?, ?it/s][A

price_mean started



  4%|▍         | 1/23 [00:00<00:21,  1.02it/s][A
  9%|▊         | 2/23 [00:03<00:31,  1.49s/it][A
 13%|█▎        | 3/23 [00:07<00:41,  2.08s/it][A
 17%|█▋        | 4/23 [00:11<00:51,  2.71s/it][A
 22%|██▏       | 5/23 [00:15<00:58,  3.25s/it][A
 26%|██▌       | 6/23 [00:20<01:03,  3.73s/it][A
 30%|███       | 7/23 [00:26<01:07,  4.24s/it][A
 35%|███▍      | 8/23 [00:31<01:09,  4.65s/it][A
 39%|███▉      | 9/23 [00:37<01:11,  5.08s/it][A
 43%|████▎     | 10/23 [00:44<01:11,  5.48s/it][A
 48%|████▊     | 11/23 [00:50<01:09,  5.83s/it][A
 52%|█████▏    | 12/23 [00:57<01:07,  6.11s/it][A
 57%|█████▋    | 13/23 [01:04<01:04,  6.44s/it][A
 61%|██████    | 14/23 [01:12<00:59,  6.66s/it][A
 65%|██████▌   | 15/23 [01:19<00:54,  6.76s/it][A
 70%|██████▉   | 16/23 [01:26<00:47,  6.84s/it][A
 74%|███████▍  | 17/23 [01:33<00:41,  6.91s/it][A
 78%|███████▊  | 18/23 [01:39<00:34,  6.90s/it][A
 83%|████████▎ | 19/23 [01:46<00:27,  6.85s/it][A
 87%|████████▋ | 20/23 [01:53<00:20,  6

0.8661918390504353


### D2

In [0]:
df = df[df['date_block_num']>=12]

dms = df['date_block_num'].unique()
dms.sort()

cols = ['asofblock', 'shop_id', 'city_code', 'type_code', 'item_id']
features = [
            'item_cnt_month', 
              'shops_month_cnt', 
              'items_in_sh_month_cnt', 
              'items_type_month_cnt', 
              'items_city_month_cnt',
              'days_with_sales',
              'price_cnt',
              'price_std',
              'price_mean'
              ]

In [0]:
for feature in features:
  print(feature, 'started')
  for dm in tqdm(dms): 
    df_tmp1 = time_agg1(df, dm, feature, cols=cols, pref='_2')

    if dm == 12:
      df_total = df_tmp1
    else:
      df_total = df_total.append(df_tmp1, sort=False, ignore_index=True)

  df_total.fillna(0, inplace=True)
  X_cols = [x for x in df_total.columns if x not in cols + ['item_cnt_month'] +  ['date_block_num']]
  pc = get_pc(df_total[X_cols], n=3, feature=feature, prefix='2_')
  df_out = pd.concat([df_total[cols].astype(np.int16), pc.astype(np.float16)], axis=1)
  df_out.to_pickle('output/pc_' + '2_' + feature + '.pkl')

  0%|          | 0/23 [00:00<?, ?it/s]

item_cnt_month started


100%|██████████| 23/23 [02:56<00:00,  8.48s/it]


0.8019900225071595


  0%|          | 0/23 [00:00<?, ?it/s]

shops_month_cnt started


100%|██████████| 23/23 [02:58<00:00,  8.68s/it]


0.6640553503308065


  0%|          | 0/23 [00:00<?, ?it/s]

items_in_sh_month_cnt started


100%|██████████| 23/23 [02:59<00:00,  8.75s/it]


0.6686320079594921


  0%|          | 0/23 [00:00<?, ?it/s]

items_type_month_cnt started


100%|██████████| 23/23 [02:59<00:00,  8.85s/it]


0.6640553503308031


  0%|          | 0/23 [00:00<?, ?it/s]

items_city_month_cnt started


100%|██████████| 23/23 [03:00<00:00,  8.85s/it]


0.8134303567760216


  0%|          | 0/23 [00:00<?, ?it/s]

days_with_sales started


100%|██████████| 23/23 [03:01<00:00,  8.95s/it]


0.8251324877063659


  0%|          | 0/23 [00:00<?, ?it/s]

price_cnt started


100%|██████████| 23/23 [03:01<00:00,  8.82s/it]


0.774248083802809


  0%|          | 0/23 [00:00<?, ?it/s]

price_std started


100%|██████████| 23/23 [03:00<00:00,  8.85s/it]


0.8091241129866545


  0%|          | 0/23 [00:00<?, ?it/s]

price_mean started


100%|██████████| 23/23 [03:03<00:00,  8.99s/it]


0.8661918390504381


In [0]:
df_out.head()

Unnamed: 0,asofblock,shop_id,city_code,type_code,item_id,pc_2_price_mean1,pc_2_price_mean2,pc_2_price_mean3
0,13,2,0,1,1956,-0.600098,-0.359375,0.204712
1,13,2,0,1,1958,-0.600098,-0.359375,0.204712
2,13,2,0,1,5571,-0.600098,-0.359375,0.204712
3,13,2,0,1,5572,2.923828,-1.616211,1.651367
4,13,2,0,1,5573,1.161133,-0.987793,0.927734


## Combine variables

In [0]:
# 1
features = [
              'item_cnt_month', 
              'shops_month_cnt', 
              'items_in_sh_month_cnt', 
              'items_type_month_cnt', 
              'items_city_month_cnt',
              'days_with_sales',
              'price_cnt',
              'price_std',
              'price_mean']

cols = ['asofblock', 'shop_id', 'item_id']

for i in range(0, len(features)):
  df_tmp = pd.read_pickle('output/pc_'+features[i]+'.pkl')
  if i == 0:
    df_fin = df_tmp
  else:
    df_fin = pd.merge(df_fin, df_tmp, how='left', on=cols)

In [0]:
# 2
features = [
              'item_cnt_month', 
              'shops_month_cnt', 
              'items_in_sh_month_cnt', 
              'items_type_month_cnt', 
              'items_city_month_cnt',
              'days_with_sales',
              'price_cnt',
              'price_std',
              'price_mean']

cols = ['asofblock', 'shop_id', 'city_code', 'type_code', 'item_id']

for i in range(0, len(features)):
  df_tmp = pd.read_pickle('output/pc_' + '2_' + features [i] + '.pkl')
  if i == 0:
    df_fin_2 = df_tmp
  else:
    df_fin_2 = pd.merge(df_fin_2, df_tmp, how='left', on=cols)

In [0]:
df_fin_2.head()

Unnamed: 0,asofblock,shop_id,city_code,type_code,item_id,pc_2_item_cnt_month1,pc_2_item_cnt_month2,pc_2_item_cnt_month3,pc_2_shops_month_cnt1,pc_2_shops_month_cnt2,pc_2_shops_month_cnt3,pc_2_items_in_sh_month_cnt1,pc_2_items_in_sh_month_cnt2,pc_2_items_in_sh_month_cnt3,pc_2_items_type_month_cnt1,pc_2_items_type_month_cnt2,pc_2_items_type_month_cnt3,pc_2_items_city_month_cnt1,pc_2_items_city_month_cnt2,pc_2_items_city_month_cnt3,pc_2_days_with_sales1,pc_2_days_with_sales2,pc_2_days_with_sales3,pc_2_price_cnt1,pc_2_price_cnt2,pc_2_price_cnt3,pc_2_price_std1,pc_2_price_std2,pc_2_price_std3,pc_2_price_mean1,pc_2_price_mean2,pc_2_price_mean3
0,13,2,0,1,1956,-0.571289,-0.090881,-0.003275,0.005894,-1.854492,-0.473145,-1.112305,-2.279297,0.885742,0.005894,-1.854492,-0.473145,-1.50293,-0.30127,-0.037567,-0.641602,-0.073975,-0.00576,-0.749023,-0.122559,0.029785,-0.770996,-0.212402,0.090088,-0.600098,-0.359375,0.204712
1,13,2,0,1,1958,-0.571289,-0.090881,-0.003275,0.005894,-1.854492,-0.473145,-1.112305,-2.279297,0.885742,0.005894,-1.854492,-0.473145,-1.50293,-0.30127,-0.037567,-0.641602,-0.073975,-0.00576,-0.749023,-0.122559,0.029785,-1.054688,-0.113647,-0.058563,-0.600098,-0.359375,0.204712
2,13,2,0,1,5571,-0.571289,-0.090881,-0.003275,0.005894,-1.854492,-0.473145,-1.112305,-2.279297,0.885742,0.005894,-1.854492,-0.473145,-1.50293,-0.30127,-0.037567,-0.641602,-0.073975,-0.00576,-0.749023,-0.122559,0.029785,-0.512207,-0.302734,0.22583,-0.600098,-0.359375,0.204712
3,13,2,0,1,5572,3.693359,-1.4375,0.838867,0.005894,-1.854492,-0.473145,-1.112305,-2.279297,0.885742,0.005894,-1.854492,-0.473145,-1.50293,-0.30127,-0.037567,2.025391,-0.973633,0.506348,2.166016,-0.852539,-0.426025,0.890137,-0.791016,0.960449,2.923828,-1.616211,1.651367
4,13,2,0,1,5573,-0.571289,-0.090881,-0.003275,0.005894,-1.854492,-0.473145,-1.112305,-2.279297,0.885742,0.005894,-1.854492,-0.473145,-1.50293,-0.30127,-0.037567,-0.641602,-0.073975,-0.00576,0.566895,-0.451904,-0.176025,-0.515625,-0.301514,0.223999,1.161133,-0.987793,0.927734


In [0]:
df_merged = pd.merge(
                 df,
                 df_fin,
                 left_on=['date_block_num', 'shop_id', 'item_id'],
                 right_on=['asofblock', 'shop_id', 'item_id'],
                 how='left')\
              .merge(
                 df_fin_2,
                 left_on=['date_block_num', 'shop_id', 'city_code', 'type_code', 'item_id'],
                 right_on=['asofblock', 'shop_id', 'city_code', 'type_code', 'item_id'],
                 how='left')

In [0]:
df_merged.fillna(0, inplace=True)

In [0]:
X_cols = [
'date_block_num',
'shop_id',
'item_id',
'ID',

'pc_1_item_cnt_month1',
'pc_1_item_cnt_month2',
'pc_1_item_cnt_month3',
'pc_1_shops_month_cnt1',
'pc_1_shops_month_cnt2',
'pc_1_shops_month_cnt3',
'pc_1_items_in_sh_month_cnt1',
'pc_1_items_in_sh_month_cnt2',
'pc_1_items_in_sh_month_cnt3',
'pc_1_items_type_month_cnt1',
'pc_1_items_type_month_cnt2',
'pc_1_items_type_month_cnt3',
'pc_1_items_city_month_cnt1',
'pc_1_items_city_month_cnt2',
'pc_1_items_city_month_cnt3',
'pc_1_days_with_sales1',
'pc_1_days_with_sales2',
'pc_1_days_with_sales3',
'pc_1_price_cnt1',
'pc_1_price_cnt2',
'pc_1_price_cnt3',
'pc_1_price_std1',
'pc_1_price_std2',
'pc_1_price_std3',
'pc_1_price_mean1',
'pc_1_price_mean2',
'pc_1_price_mean3',

'pc_2_item_cnt_month1',
'pc_2_item_cnt_month2',
'pc_2_item_cnt_month3',
'pc_2_shops_month_cnt1',
'pc_2_shops_month_cnt2',
'pc_2_shops_month_cnt3',
'pc_2_items_in_sh_month_cnt1',
'pc_2_items_in_sh_month_cnt2',
'pc_2_items_in_sh_month_cnt3',
'pc_2_items_type_month_cnt1',
'pc_2_items_type_month_cnt2',
'pc_2_items_type_month_cnt3',
'pc_2_items_city_month_cnt1',
'pc_2_items_city_month_cnt2',
'pc_2_items_city_month_cnt3',
'pc_2_days_with_sales1',
'pc_2_days_with_sales2',
'pc_2_days_with_sales3',
'pc_2_price_cnt1',
'pc_2_price_cnt2',
'pc_2_price_cnt3',
'pc_2_price_std1',
'pc_2_price_std2',
'pc_2_price_std3',
'pc_2_price_mean1',
'pc_2_price_mean2',
'pc_2_price_mean3',
]

In [0]:
y_col = 'item_cnt_month'

In [0]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=30, verbose=True, min_samples_leaf=100, n_jobs=10)
rfr.fit(df_merged[X_cols], df_merged[y_col])

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 out of  30 | elapsed: 69.9min finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=100,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=30, n_jobs=10, oob_score=False,
                      random_state=None, verbose=True, warm_start=False)

In [0]:
pd.DataFrame({'f':rfr.feature_importances_, 'v': X_cols}).sort_values('f')

Unnamed: 0,f,v
17,0.000379,pc_1_items_city_month_cnt2
44,0.000405,pc_2_items_city_month_cnt2
43,0.000456,pc_2_items_city_month_cnt1
16,0.000509,pc_1_items_city_month_cnt1
14,0.000819,pc_1_items_type_month_cnt2
41,0.000901,pc_2_items_type_month_cnt2
35,0.000906,pc_2_shops_month_cnt2
8,0.000911,pc_1_shops_month_cnt2
36,0.000966,pc_2_shops_month_cnt3
9,0.001072,pc_1_shops_month_cnt3


In [0]:
rfr.score(df_merged[X_cols], df_merged[y_col])

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 out of  30 | elapsed:   20.8s finished


0.4865033361446712

In [0]:
# Features
df_merged['month'] = df_merged['date_block_num']%12
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
df_merged['days'] = df_merged['month'].map(days).astype(np.int8)

In [0]:
df_merged['is_pack'] = np.where(df_merged['item_id']==20949, 1, 0)
df_merged['is_ps_cash_in'] = np.where(df_merged['item_id']==5822, 1, 0)
df_merged['is_1c_cash_in'] = np.where(df_merged['item_id']==17717, 1, 0)

In [0]:
df_merged.to_csv('output/df_total.csv')

### Checkpoint

In [0]:
df_merged = pd.read_csv('output/df_total.csv')

### Train & Test split

In [0]:
X_cols = [
'date_block_num',
'shop_id',
'item_id',
'ID',

'pc_1_item_cnt_month1',
'pc_1_item_cnt_month2',
'pc_1_item_cnt_month3',
'pc_1_shops_month_cnt1',
'pc_1_shops_month_cnt2',
'pc_1_shops_month_cnt3',
'pc_1_items_in_sh_month_cnt1',
'pc_1_items_in_sh_month_cnt2',
'pc_1_items_in_sh_month_cnt3',
'pc_1_items_type_month_cnt1',
'pc_1_items_type_month_cnt2',
'pc_1_items_type_month_cnt3',
'pc_1_items_city_month_cnt1',
'pc_1_items_city_month_cnt2',
'pc_1_items_city_month_cnt3',
'pc_1_days_with_sales1',
'pc_1_days_with_sales2',
'pc_1_days_with_sales3',
'pc_1_price_cnt1',
'pc_1_price_cnt2',
'pc_1_price_cnt3',
'pc_1_price_std1',
'pc_1_price_std2',
'pc_1_price_std3',
'pc_1_price_mean1',
'pc_1_price_mean2',
'pc_1_price_mean3',

'pc_2_item_cnt_month1',
'pc_2_item_cnt_month2',
'pc_2_item_cnt_month3',
'pc_2_shops_month_cnt1',
'pc_2_shops_month_cnt2',
'pc_2_shops_month_cnt3',
'pc_2_items_in_sh_month_cnt1',
'pc_2_items_in_sh_month_cnt2',
'pc_2_items_in_sh_month_cnt3',
'pc_2_items_type_month_cnt1',
'pc_2_items_type_month_cnt2',
'pc_2_items_type_month_cnt3',
'pc_2_items_city_month_cnt1',
'pc_2_items_city_month_cnt2',
'pc_2_items_city_month_cnt3',
'pc_2_days_with_sales1',
'pc_2_days_with_sales2',
'pc_2_days_with_sales3',
'pc_2_price_cnt1',
'pc_2_price_cnt2',
'pc_2_price_cnt3',
'pc_2_price_std1',
'pc_2_price_std2',
'pc_2_price_std3',
'pc_2_price_mean1',
'pc_2_price_mean2',
'pc_2_price_mean3',

'month',
'days',
'is_pack',
'is_ps_cash_in',
'is_1c_cash_in'
]

In [0]:
y_col =  'item_cnt_next'

In [0]:
df_sample = df_merged.sample(round(df_merged.shape[0]*0.3))
# df_sample = df_merged

In [0]:
X_train = df_sample[(df_sample['date_block_num']<33)][X_cols]
y_train = df_sample[(df_sample['date_block_num']<33)]['item_cnt_month']
X_train.shape

(3202938, 63)

In [0]:
X_valid = df_merged[df_merged['date_block_num']==33][X_cols]
y_valid = df_merged[df_merged['date_block_num']==33]['item_cnt_month']
X_valid.shape

(238172, 63)

In [0]:
X_test = df_merged[df_merged['date_block_num']==34][X_cols]
y_test = df_merged[df_merged['date_block_num']==34]['item_cnt_month']
X_test.shape

(214200, 63)

In [0]:
del df_merged, df_sample
gc.collect()

0

In [0]:
# X_train.to_csv('output/X_train.csv')
# y_train.to_csv('output/y_train.csv')

# X_valid.to_csv('output/X_valid.csv')
# y_valid.to_csv('output/y_valid.csv')

# X_test.to_csv('output/X_test.csv')
# y_test.to_csv('output/y_test.csv')



X_train = pd.read_csv('output/X_train.csv')
y_train = pd.read_csv('output/y_train.csv')

X_valid = pd.read_csv('output/X_valid.csv')
y_valid = pd.read_csv('output/y_valid.csv')

X_test = pd.read_csv('output/X_test.csv')
y_test = pd.read_csv('output/y_test.csv')

### Predict

In [0]:
# dtr = lightgbm.LGBMRegressor(min_child_samples=20, n_estimators=1000)
# dtr.fit(X_train, y_train)

In [0]:
# pd.DataFrame({'col': list(X_train), 'FE': dtr.feature_importances_})\
#                 .sort_values('FE', ascending=False).head(10)

In [0]:
ts = time.time()
from xgboost import XGBRegressor
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.1, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

time.time() - ts

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:1.21232	validation_1-rmse:1.12937
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.19555	validation_1-rmse:1.09996
[2]	validation_0-rmse:1.18337	validation_1-rmse:1.07723
[3]	validation_0-rmse:1.16178	validation_1-rmse:1.05687
[4]	validation_0-rmse:1.14912	validation_1-rmse:1.04146
[5]	validation_0-rmse:1.13946	validation_1-rmse:1.02794
[6]	validation_0-rmse:1.12867	validation_1-rmse:1.0185
[7]	validation_0-rmse:1.11518	validation_1-rmse:1.00918
[8]	validation_0-rmse:1.10993	validation_1-rmse:1.00185
[9]	validation_0-rmse:1.1057	validation_1-rmse:0.994784
[10]	validation_0-rmse:1.10024	validation_1-rmse:0.990216
[11]	validation_0-rmse:1.09691	validation_1-rmse:0.985659
[12]	validation_0-rmse:1.09358	validation_1-rmse:0.981587
[13]	validation_0-rmse:1.09144	validation_1-rmse:0.978457
[14]	validation_0-rmse:1.08912	validation_1-rmse:0

In [0]:
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

plot_features(model, (10,14))

In [0]:
preds_train = model.predict(X_train)
print('Train:', rmse(y_train, preds_train))
# plt.scatter(x=preds_train, y=y_train)

Train: 1.075


In [0]:
preds_valid = model.predict(X_valid)
print('Valid:', rmse(y_valid, preds_valid))
# plt.scatter(x=preds_valid, y=y_valid)

Valid: 0.961


In [0]:
filename = 'xg_model_2.sav'

In [0]:
import pickle
pickle.dump(model, open('output/'+filename, 'wb'))

In [0]:
loaded_model = pickle.load(open('output/'+filename, 'rb'))

### Submission

In [0]:
X_test = df_total[df_total['date_block_num']==34][X_cols]
y_test = df_total[df_total['date_block_num']==34]['item_cnt_month']
X_test.shape

(214200, 63)

In [0]:
y_test_preds = model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({
    "ID": df_total[df_total.date_block_num==34]['ID'],
    "item_cnt_month": y_test_preds
})
submission['ID'] = submission['ID'].astype(np.int32)
submission.to_csv('submission_2.csv', index=False)

In [0]:
!git commit -am 'refine baseline'

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
