# Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import gc
import os
import pandas as pd
import numpy as np
import holoviews as hv

pd.set_option('display.max_columns', 500)

from src import CTX, SEED, FOLDERS
from src.features import COLUMNS
import src.data as data
import src.features as features

In [None]:
# Select Bokeh as the HoloViews plotting backend for this notebook.
hv.extension('bokeh')

In [None]:
# Load the 'observations' artifact from the processed folder and display it.
# NOTE(review): the empty DataFrame is passed as the third argument of
# data.load_data; its exact role is not visible here — confirm in src.data.
observations = data.load_data(FOLDERS.PROCESSED, CTX + 'observations', pd.DataFrame())
observations

# Read raw data

In [None]:
# Read the five raw tables (sales, test, items, categories, shops) from the
# raw-data folder.
sales, test, items, categories, shops = data.read_raw(FOLDERS.RAW)

In [None]:
# TODO: this cell narrows both frames to rows with shop_id <= 10.
# NOTE(review): the TODO markers suggest the filter is temporary — confirm
# and remove it before running on the full data.
sales = sales.loc[sales['shop_id'] <= 10]
test = test.loc[test['shop_id'] <= 10]
# TODO: end of the filtered section

In [None]:
# Build the training frame from the raw sales and preview its first rows.
# NOTE(review): judging by the helper's name it aggregates and clips the
# sales — confirm the details in src.features.
train = features.rollup_and_clip_sales(sales)
train.head()

In [None]:
# Highest date_block_num present in the training frame; the test rows are
# later stamped with this value + 1.
max_train_date_block_num = train['date_block_num'].max()
max_train_date_block_num

In [None]:
# Lag offsets handed to features.create_lags; max(lags) is also the cutoff
# below which date_block_num rows are discarded after lagging.
lags = [1, 2, 3, 4, 5, 12]

# Create grid

In [None]:
# `tqdm.tqdm_notebook` is a deprecated entry point (it emits a
# TqdmDeprecationWarning); `tqdm.notebook.tqdm` is the supported one. It is
# imported under the old name because later cells pass `tqdm_notebook` on to
# other helpers.
from tqdm.notebook import tqdm as tqdm_notebook

# Stack the training rows (monthly count renamed to 'target') on top of the
# test rows, which are stamped with the block number right after the last
# training block and lose their 'ID' column; then build the grid over the
# key and time columns.
sales_grid = features.create_grid(
    pd.concat(
        [
            train.rename(columns={'item_cnt_month': 'target'}),
            test.assign(date_block_num=max_train_date_block_num + 1).drop(columns=['ID']),
        ],
        ignore_index=True,
        sort=False,
    ),
    COLUMNS.KEYS_AND_TIME,
    tqdm_notebook,
)
print(len(sales_grid))
sales_grid.head()

In [None]:
# Persist the grid to the interim folder, then drop the in-memory copy and
# run a garbage-collection pass before the next stage.
data.save_data(FOLDERS.INTERIM, CTX + 'grid', sales_grid)
del sales_grid
gc.collect()

# Mean encode

In [None]:
# Reload the grid that was saved to the interim folder.
sales_grid = data.load_data(FOLDERS.INTERIM, CTX + 'grid', pd.DataFrame())

# Enrich the grid with the shop, item and category tables; shops and
# categories first pass through their add_super_* helpers.
enriched = features.enrich(
    sales_grid,
    features.add_super_shop(shops),
    items,
    features.add_super_category(categories),
)

# Mean-encode, then discard every column whose label matches the regex 'name'.
mean_encoded = features.mean_encode(enriched)
name_columns = mean_encoded.filter(regex='name').columns
mean_encoded = mean_encoded.drop(columns=name_columns)
mean_encoded.head()

In [None]:
# Persist the mean-encoded frame to the interim folder, then release the three
# intermediate frames and run a garbage-collection pass.
data.save_data(FOLDERS.INTERIM, CTX + 'mean_encoded', mean_encoded)
del sales_grid, enriched, mean_encoded
gc.collect()

# Create lags

In [None]:
# Reload the mean-encoded frame that was saved to the interim folder.
mean_encoded = data.load_data(FOLDERS.INTERIM, CTX + 'mean_encoded', pd.DataFrame())

In [None]:
# Key columns handed to create_lags alongside the lag offsets.
lag_key_columns = COLUMNS.KEYS_AND_TIME + COLUMNS.DERIVED_KEYS

# create_lags returns the lagged frame plus a collection of column names that
# a later cell drops before fitting the mapper.
lagged_data, to_drop_cols = features.create_lags(
    mean_encoded,
    lag_key_columns,
    lags,
    tqdm_notebook,
)
lagged_data.head()

In [None]:
# Wrap the to-drop names in a single-column frame ('col_name') so they can be
# saved like the other artifacts, and display them in sorted order.
to_drop_cols = pd.DataFrame(
    to_drop_cols,
    columns=['col_name'],
)
to_drop_cols.sort_values('col_name')

In [None]:
# The mean-encoded frame is no longer needed once the lags exist; release it
# and run a garbage-collection pass.
del mean_encoded
gc.collect()

In [None]:
# Add a 'month' column (date_block_num modulo 12), keep only rows whose
# date_block_num is at least the largest lag, and renumber the index from 0.
# NOTE(review): presumably the earlier blocks are dropped because they cannot
# have a full lag history — confirm.
lagged_data = (
    lagged_data
    .assign(month=lambda frame: frame['date_block_num'] % 12)
    .loc[lambda frame: frame['date_block_num'] >= max(lags)]
    .reset_index(drop=True)
)
lagged_data.head()

In [None]:
# Persist the lagged frame and the list of columns to drop to the interim
# folder, then release both and run a garbage-collection pass.
data.save_data(FOLDERS.INTERIM, CTX + 'lagged_data', lagged_data)
data.save_data(FOLDERS.INTERIM, CTX + 'to_drop_cols', to_drop_cols)
del lagged_data, to_drop_cols
gc.collect()

# One-hot encode and normalize

In [None]:
# Reload the lagged frame and the to-drop column list saved to the interim
# folder.
lagged_data = data.load_data(FOLDERS.INTERIM, CTX + 'lagged_data', pd.DataFrame())
to_drop_cols = data.load_data(FOLDERS.INTERIM, CTX + 'to_drop_cols', pd.DataFrame())

In [None]:
# Numeric features are all columns of lagged_data except the to-drop columns,
# the key/time columns, the derived keys and 'item_category_id'; the result
# is sorted by name.
non_numeric_columns = (
    set(to_drop_cols['col_name'].values)
    | set(COLUMNS.KEYS_AND_TIME)
    | set(COLUMNS.DERIVED_KEYS)
    | {'item_category_id'}
)
numeric_features = sorted(set(lagged_data.columns.values) - non_numeric_columns)
#numeric_features = []
print(numeric_features)

In [None]:
# Categorical features: 'shop_id' followed by the derived key columns.
categorical_features = ['shop_id', *COLUMNS.DERIVED_KEYS]
#categorical_features = []
print(categorical_features)

In [None]:
# Build the mapper for the chosen categorical and numeric features, fit it on
# lagged_data without the to-drop columns, and cast the result to float32.
mapper = features.create_mapper_sklearn_pandas_contrib(
    categorical_features,
    numeric_features,
)
mapped_data = (
    mapper
    .fit_transform(lagged_data.drop(columns=to_drop_cols['col_name'].values))
    .astype(np.float32)
)

In [None]:
# Record the mapper's output column names in a one-column frame, save it to
# the processed folder, and display the names in sorted order.
feature_names = pd.DataFrame(
    mapper.transformed_names_,
    columns=['feature_name'],
)
data.save_data(FOLDERS.PROCESSED, CTX + 'feature_names', feature_names)
feature_names['feature_name'].sort_values()

In [None]:
# Sanity check: print a hand-computed expected column count next to the actual
# shape and dtype of the mapped matrix.
# NOTE(review): the four id columns below are hard-coded rather than derived
# from categorical_features — confirm they stay in sync.
one_hot_sources = ['shop_id', 'super_shop_id', 'item_category_id', 'super_category_id']
num_columns_to_add = sum(lagged_data[col].nunique() for col in one_hot_sources)
expected_width = (
    len(lagged_data.columns)
    - len(to_drop_cols)
    - len(categorical_features)
    + num_columns_to_add
)
print(expected_width)
print(mapped_data.shape)
print(mapped_data.dtype)

In [None]:
# Boolean row masks over lagged_data (despite the "_indices" names): training
# rows are those up to and including the last training block, test rows are
# those in the block right after it.
train_indices = lagged_data['date_block_num'].le(max_train_date_block_num)
test_indices = lagged_data['date_block_num'].eq(max_train_date_block_num + 1)

In [None]:
# Split the mapped matrix into train and test rows with the boolean masks.
X_train = mapped_data[train_indices]
#np.random.shuffle(X_train)
X_test = mapped_data[test_indices]

# Training labels: the 'target' column of the training rows as a 1-D array.
target_col = 'target'
y_train = lagged_data.loc[train_indices, target_col].values

In [None]:
# date_block_num of every training row, kept as a single-column frame.
dates_train = lagged_data[['date_block_num']].loc[train_indices]

# (shop_id, item_id) keys of the test rows.
predictions = lagged_data[['shop_id', 'item_id']].loc[test_indices]

In [None]:
# Persist the final artifacts to the processed folder: the train/test feature
# matrices, the training labels, the training date blocks, and the test-row
# keys.
data.save_data(FOLDERS.PROCESSED, CTX + 'X_train', X_train)
data.save_data(FOLDERS.PROCESSED, CTX + 'X_test', X_test)
data.save_data(FOLDERS.PROCESSED, CTX + 'y_train', y_train)
data.save_data(FOLDERS.PROCESSED, CTX + 'dates_train', dates_train)
data.save_data(FOLDERS.PROCESSED, CTX + 'predictions', predictions)

In [None]:
# Everything needed downstream has been saved; release the two large objects
# and run a garbage-collection pass.
del lagged_data, mapped_data
gc.collect()