In [1]:
import os
os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim

In [3]:
df = pd.read_csv(".data/sales_train.csv")
df_items = pd.read_csv(".data/items.csv")
df_shops = pd.read_csv(".data/shops.csv")

In [4]:
df = df.merge(df_items[['item_id', 'item_category_id']], on="item_id", how="left")
df = df.reindex(columns=['date', 'date_block_num', 'shop_id', 'item_id', 'item_category_id', 'item_price', 'item_cnt_day'])

In [5]:
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = df.sort_values(by=['date', 'shop_id', 'item_id']).reset_index(drop=True)

In [6]:
valid_index = (df['date_block_num'] == df['date_block_num'].max())

df_train = df[~valid_index]#.drop(columns=["date_block_num"])
df_valid = df[valid_index]#.drop(columns=["date_block_num"])

In [7]:
uq_shops = df_shops['shop_id'].sort_values().unique()
uq_items = df_items['item_id'].sort_values().unique()
uq_categories = df_items['item_category_id'].sort_values().unique()

In [8]:
def check_arranged_index(arr: np.ndarray) -> bool:
    return (np.arange(0, arr.size) == arr).all()

In [9]:
print(check_arranged_index(uq_shops), check_arranged_index(uq_items), check_arranged_index(uq_categories))

True True True


In [10]:
pd.Series(df_train.groupby(["shop_id", "item_id", "date_block_num"])['item_cnt_day'].sum().values).value_counts()

1.0      1037474
2.0       260432
3.0       101320
4.0        52368
5.0        31392
          ...   
272.0          1
312.0          1
426.0          1
792.0          1
303.0          1
Name: count, Length: 441, dtype: int64

In [11]:
df_train.sort_values(by=['shop_id', 'item_id', 'date_block_num'])

Unnamed: 0,date,date_block_num,shop_id,item_id,item_category_id,item_price,item_cnt_day
165832,2013-02-15,1,0,30,40,265.0,2.0
169580,2013-02-16,1,0,30,40,265.0,9.0
175269,2013-02-17,1,0,30,40,265.0,4.0
180336,2013-02-18,1,0,30,40,265.0,4.0
186528,2013-02-20,1,0,30,40,265.0,2.0
...,...,...,...,...,...,...,...
2758268,2015-07-21,30,59,22164,37,699.0,1.0
1008514,2013-10-25,9,59,22167,49,299.0,1.0
1132842,2013-12-03,11,59,22167,49,299.0,1.0
1171425,2013-12-14,11,59,22167,49,299.0,1.0


In [14]:
df_train_monthly = df_train.groupby(["shop_id", "item_id", "date_block_num"])['item_cnt_day'].sum().reset_index()
df_train_monthly['item_cnt_day'] = df_train_monthly['item_cnt_day'].clip(0, 20)

In [15]:
df_train_monthly

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day
0,0,30,1,20.0
1,0,31,1,11.0
2,0,32,0,6.0
3,0,32,1,10.0
4,0,33,0,3.0
...,...,...,...,...
1577588,59,22164,27,2.0
1577589,59,22164,30,1.0
1577590,59,22167,9,1.0
1577591,59,22167,11,2.0


In [17]:
def create_series_vectors(x: pd.DataFrame, MAX: int = df_train_monthly['date_block_num'].max() + 1) -> np.ndarray:
    v = np.zeros(MAX)
    v[x['date_block_num'].values] = x['item_cnt_day'].values

    return v

In [18]:
tst = pd.DataFrame([[0, 10], [3, 20], [4, 7]], columns=['date_block_num', 'item_cnt_day'])

create_series_vectors(tst)

array([10.,  0.,  0., 20.,  7.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [22]:
df_train_monthly_series = df_train_monthly.groupby(["shop_id", "item_id"]) \
    .apply(create_series_vectors) \
    .reset_index() \
    .rename(columns={0: 'monthly_sales_array'}, inplace=False)

  df_train_monthly_series = df_train_monthly.groupby(["shop_id", "item_id"]).apply(create_series_vectors) \


In [24]:
df_train_monthly_series = df_train_monthly_series.merge(df_items[['item_id', 'item_category_id']], on="item_id", how="left")
df_train_monthly_series = df_train_monthly_series.reindex(columns=['shop_id', 'item_id', 'item_category_id', 'monthly_sales_array'])

In [25]:
df_train_monthly_series

Unnamed: 0,shop_id,item_id,item_category_id,monthly_sales_array
0,0,30,40,"[0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0,31,37,"[0.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,0,32,40,"[6.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,0,33,37,"[3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,35,40,"[1.0, 14.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...
416999,59,22154,37,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
417000,59,22155,37,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
417001,59,22162,40,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
417002,59,22164,37,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [31]:
df_valid_monthly = df_valid.groupby(["shop_id", "item_id", "date_block_num"])['item_cnt_day'].sum().reset_index()
df_valid_monthly['item_cnt_day'] = df_valid_monthly['item_cnt_day'].clip(0, 20)
df_valid_monthly = df_valid_monthly.drop(columns=["date_block_num"])

In [40]:
df_final = df_train_monthly_series.merge(df_valid_monthly, on=["shop_id", "item_id"], how="left").rename(columns={'item_cnt_day': 'y'}, inplace=False)

In [41]:
df_final = df_final[df_final['y'].notna()]

In [46]:
X = df_final.drop(columns=["y"])
y = df_final['y'].values

In [47]:
X

Unnamed: 0,shop_id,item_id,item_category_id,monthly_sales_array
6125,2,31,37,"[0.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6148,2,486,73,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6167,2,787,49,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6219,2,1075,40,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6247,2,1377,23,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
416975,59,22087,83,"[9.0, 1.0, 6.0, 0.0, 2.0, 17.0, 12.0, 7.0, 10...."
416976,59,22088,83,"[5.0, 0.0, 5.0, 3.0, 6.0, 8.0, 7.0, 4.0, 4.0, ..."
416977,59,22091,83,"[0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 7.0, 2.0, 0.0, ..."
416979,59,22100,42,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
