# Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import holoviews as hv
from holoviews import opts
import hvplot.pandas

from src import CTX, FOLDERS
from src.data import io
from src.features import COLUMNS, build_features

In [None]:
# Load the HoloViews plotting extensions for bokeh and matplotlib.
hv.extension('bokeh', 'matplotlib')

# Read data

In [None]:
# Unpack the five tables that io.read_raw returns for the FOLDERS.RAW location.
sales, test, items, categories, shops = io.read_raw(FOLDERS.RAW)

In [None]:
# Report the shape of every raw table, one line per table.
for label, frame in (
    ('Sales', sales),
    ('Test', test),
    ('Items', items),
    ('Categories', categories),
    ('Shops', shops),
):
    print(label + ' shape: ', frame.shape)

In [None]:
# Running list of findings; later cells append to it and the last cell saves it.
observations = []

# Items

In [None]:
# Preview the first rows of the items table.
items.head()

In [None]:
# Number of distinct values in each items column.
items.nunique()

In [None]:
# Count the item_id entries per item_category_id and plot the counts as bars
# with a logarithmic y-axis.
items_by_category = (
    items
    .groupby('item_category_id')[['item_id']]
    .count()
)
hv.Bars(items_by_category).opts(
    ylabel='Item Count (log)',
    logy=True,
    width=1200,
)

In [None]:
# Record the finding from the items-per-category counts ("handful" was misspelled
# in the saved observation text).
observations += ['Tens of categories with very few items, a handful of categories with thousands of items.']

# Categories

In [None]:
# Preview the first rows of the categories table.
categories.head()

In [None]:
# Number of distinct values in each categories column.
categories.nunique()

In [None]:
# Show all categories as a table, ordered by item_category_name.
hv.Table(categories.sort_values(by='item_category_name'))

In [None]:
# Record the findings from inspecting the category names.
observations.extend([
    'First word of category name a good candidate for super-category grouping of categories (split on dash and space)',
    '(Categories already sorted on category name)',
])

In [None]:
# Replace categories with the result of build_features.add_super_category and
# display it. NOTE(review): presumably this adds the super_category_name column
# used by the heatmaps further down — confirm in build_features.
categories = build_features.add_super_category(categories)
hv.Table(categories)

# Shops

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# Number of distinct values in each shops column.
shops.nunique()

In [None]:
# Show all shops as a table, ordered by shop_name.
hv.Table(shops.sort_values(by='shop_name'))

In [None]:
# Record the findings from inspecting the shop names.
observations.extend([
    'First word of shop name a good candidate for super-category grouping of shops (split on dash and space)',
    '(Shops already sorted on shop name)',
])

In [None]:
# Replace shops with the result of build_features.add_super_shop and display it.
# NOTE(review): presumably this adds the super_shop_name column used by the
# heatmaps further down — confirm in build_features.
shops = build_features.add_super_shop(shops)
hv.Table(shops)

# Sales and Train data

In [None]:
# Preview the first rows of the raw sales table.
sales.head()

In [None]:
# Record the evaluation rule for the task (stated here, not derived from the data).
observations += ['Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range.']

In [None]:
# Derive sales_rolled_up from the raw sales via build_features.rollup_and_clip_sales
# and preview it. NOTE(review): the helper name suggests a monthly roll-up with
# clipping of the target — confirm in build_features.
sales_rolled_up = build_features.rollup_and_clip_sales(sales)
sales_rolled_up.head()

In [None]:
# Build the training frame by passing the rolled-up sales together with the
# shops, items and categories tables to build_features.enrich, then preview it.
train = build_features.enrich(sales_rolled_up, shops, items, categories)
train.head()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Summarise the per-column boolean null mask to check for missing values.
train.isnull().describe()

In [None]:
# Record the findings from the null check on the training data.
observations.extend([
    'All shop, item, and category references are valid in training data.',
    'Training data does not have explicit NULLs',
])

In [None]:
# Descriptive statistics for the training columns.
train.describe()

In [None]:
# Bar chart of how often each item_cnt_month value occurs in the training data.
hv.Bars(
    train.item_cnt_month.value_counts()
).opts(
    xlabel='item_cnt_month',
    ylabel='value_count',
    xrotation=90,
    logx=True,
    width=1200,
)

# Test data

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Record what has to be predicted for each test row.
observations += ['For each ID in the test set (shop_id/item_id combination), one must predict a total number of sales (item_cnt_month) for that date_block_num (34).']

In [None]:
# Number of distinct COLUMNS.KEYS combinations in each set. len() of the grouped
# sizes counts one entry per group; DataFrame.size (used previously) counts
# cells, i.e. rows x columns, and therefore overstated the key counts.
print('Train keys:', len(train.groupby(COLUMNS.KEYS).size()))
print('Test keys:', len(test.groupby(COLUMNS.KEYS).size()))

# Left-merge so that test rows without a matching train key end up with a null
# item_cnt_month, then count null vs. non-null rows.
# NOTE(review): if train holds several rows per key (e.g. one per
# date_block_num), each matched test row is repeated once per train row, so the
# counts below are per merged row rather than per test key — confirm.
train_test_merged = test.merge(train, on=COLUMNS.KEYS, how='left')
train_test_merged.isnull().item_cnt_month.value_counts()

In [None]:
# Record the share of test keys that have no training data.
# NOTE(review): this ratio is read from the null counts of the merged frame in
# the previous cell; if train has several rows per key those counts are per
# merged row, not per test key — confirm the figure.
observations += ['One in seven test data keys do not have entries in the training data.']

# Visualizations

In [None]:
def nansum(a, **kwargs):
    """Sum ``a`` while ignoring NaNs, yielding NaN when every element is NaN.

    Any keyword arguments are forwarded unchanged to ``np.nansum``. The
    all-NaN check looks at the whole input, regardless of those arguments.
    """
    if np.isnan(a).all():
        # Nothing but NaNs (or nothing at all): report "no data", not a sum.
        return np.nan
    return np.nansum(a, **kwargs)

## Sales per time period and shop heatmap

In [None]:
# Heatmap of item_cnt_month over date_block_num (x) and shop_name (y), with
# nansum passed as the reduce function and a logarithmic colour scale.
train.hvplot.heatmap(
    'date_block_num',
    'shop_name',
    'item_cnt_month',
    title='item_cnt_month pr. date_block_num and shop_name',
    reduce_function=nansum,
    logz=True,
    flip_yaxis=True,
    width=1200,
    height=800,
)

In [None]:
# Heatmap of item_cnt_month over date_block_num (x) and super_shop_name (y),
# with nansum passed as the reduce function and a logarithmic colour scale.
train.hvplot.heatmap(
    'date_block_num',
    'super_shop_name',
    'item_cnt_month',
    title='item_cnt_month pr. date_block_num and super_shop_name',
    reduce_function=nansum,
    logz=True,
    flip_yaxis=True,
    width=1200,
    height=800,
)

## Sales per time period and category heatmap

In [None]:
# Heatmap of item_cnt_month over date_block_num (x) and item_category_name (y),
# with nansum passed as the reduce function and a logarithmic colour scale.
train.hvplot.heatmap(
    'date_block_num',
    'item_category_name',
    'item_cnt_month',
    title='item_cnt_month pr. date_block_num and item_category_name',
    reduce_function=nansum,
    logz=True,
    flip_yaxis=True,
    width=1200,
    height=800,
)

In [None]:
# Heatmap of item_cnt_month over date_block_num (x) and super_category_name (y),
# with nansum passed as the reduce function and a logarithmic colour scale.
train.hvplot.heatmap(
    'date_block_num',
    'super_category_name',
    'item_cnt_month',
    title='item_cnt_month pr. date_block_num and super_category_name',
    reduce_function=nansum,
    logz=True,
    flip_yaxis=True,
    width=1200,
    height=800,
)

# Observations

In [None]:
# Put the collected observations into a one-column frame and hand it to
# io.save_data under the processed folder, named with the CTX prefix.
observations_frame = pd.DataFrame(observations, columns=['Observation'])
io.save_data(FOLDERS.PROCESSED, CTX + 'observations', observations_frame)