# Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Standard library
import gc
import os

# Third-party
import numpy as np
import pandas as pd

# Project-local package (src/)
from src import CTX, SEED, FOLDERS
from src.data import io
from src.features import COLUMNS, build_features

In [None]:
# Load the stored 'observations' artifact from the processed-data folder and
# display it as the cell output.
# NOTE(review): the empty DataFrame passed last is presumably a fallback or
# type hint for io.load_data — confirm against src.data.io.
observations = io.load_data(
    FOLDERS.PROCESSED,
    CTX + 'observations',
    pd.DataFrame(),
)
observations

# Read raw data

In [None]:
# Read all raw tables from the raw-data folder in a single call. The result is
# unpacked positionally, so the order (sales, test, items, categories, shops)
# has to match what io.read_raw returns.
sales, test, items, categories, shops = io.read_raw(FOLDERS.RAW)

In [None]:
### TODO
# Keep only the rows with shop_id <= 10 in both the sales and the test frame.
# NOTE(review): the TODO markers suggest this is a temporary development
# subset — confirm it should be removed before a full run.
sales = sales.loc[sales['shop_id'].le(10)]
test = test.loc[test['shop_id'].le(10)]
### TODO

In [None]:
# Build the training frame from the raw sales records and preview it.
# NOTE(review): going by the helper's name this aggregates and clips sales
# counts; the result carries 'item_cnt_month' and 'date_block_num' columns
# (both are used further down) — confirm details in src.features.build_features.
train = build_features.rollup_and_clip_sales(sales)
train.head()

# Create grid

In [None]:
# Highest date_block_num present in the training data. The test rows are
# assigned this value + 1 when the grid is built, and the same boundary is
# used later to split the encoded data back into train and test.
max_train_date_block_num = train['date_block_num'].max()
max_train_date_block_num

In [None]:
# Progress-bar factory handed to the feature builders (also used by the
# "Create lags" cell). `tqdm.tqdm_notebook` is a deprecated alias that emits a
# warning on every call, so prefer `tqdm.notebook.tqdm` and keep the old name
# bound for the cells that reference it. The fallback covers older tqdm
# releases that do not ship the `tqdm.notebook` submodule.
try:
    from tqdm.notebook import tqdm as tqdm_notebook
except ImportError:
    from tqdm import tqdm_notebook

# Stack the training rows and the test rows into one frame and build the grid:
#   * train: 'item_cnt_month' is renamed to 'target';
#   * test:  placed in the block right after the last training block, with its
#            'ID' column dropped (test rows therefore have no 'target' value).
# The concatenated frame is passed inline so no extra reference to it is kept
# alive in the notebook namespace.
all_data = build_features.create_grid(
    pd.concat(
        [
            train.rename(columns={'item_cnt_month': 'target'}),
            test.assign(date_block_num=max_train_date_block_num + 1).drop(columns=['ID']),
        ],
        ignore_index=True,
        sort=False,
    ),
    shops, items, categories,
    COLUMNS.KEYS_AND_TIME,
    tqdm_notebook)
print(len(all_data))
all_data.head()

In [None]:
# Checkpoint the grid in the interim folder, then unbind the in-memory frame
# and run a garbage-collection pass before the next stage starts.
io.save_data(FOLDERS.INTERIM, CTX + 'grid', all_data)
del all_data
gc.collect()

# Create lags

In [None]:
# Reload the grid checkpoint written at the end of the "Create grid" section.
# NOTE(review): the empty DataFrame argument is presumably a fallback or type
# hint for io.load_data — confirm against src.data.io.
all_data = io.load_data(FOLDERS.INTERIM, CTX + 'grid', pd.DataFrame())

In [None]:
# Build lag features on top of the grid. The helper returns two things: the
# lagged frame and a collection of column names that are dropped before
# encoding (see the "Encode and normalize" section).
# The key list is the time/key columns followed by the derived keys.
# `tqdm_notebook` is imported in the grid-creation cell, so that cell must
# have been run in this kernel session.
lagged_data, to_drop_cols = build_features.create_lags(
    all_data,
    COLUMNS.KEYS_AND_TIME + COLUMNS.DERIVED_KEYS,
    tqdm_notebook)
lagged_data.head()

In [None]:
# Wrap the column names returned by create_lags in a one-column DataFrame
# ('col_name') so they can be saved and reloaded through io.save_data /
# io.load_data like the other artifacts.
to_drop_cols = pd.DataFrame(to_drop_cols, columns=['col_name'])
to_drop_cols.head()

In [None]:
# The grid is no longer needed once the lagged frame exists: unbind it and
# run a garbage-collection pass.
del all_data
gc.collect()

In [None]:
# Checkpoint both outputs of the lag stage in the interim folder, then unbind
# them and run a garbage-collection pass. They are reloaded at the start of
# the "Encode and normalize" section.
io.save_data(FOLDERS.INTERIM, CTX + 'lagged_data', lagged_data)
io.save_data(FOLDERS.INTERIM, CTX + 'to_drop_cols', to_drop_cols)
del lagged_data, to_drop_cols
gc.collect()

# Encode and normalize

In [None]:
# Reload the lag-stage checkpoints: the lagged frame and the one-column
# ('col_name') frame listing the columns to drop before encoding.
# NOTE(review): the empty DataFrame argument is presumably a fallback or type
# hint for io.load_data — confirm against src.data.io.
lagged_data = io.load_data(FOLDERS.INTERIM, CTX + 'lagged_data', pd.DataFrame())
to_drop_cols = io.load_data(FOLDERS.INTERIM, CTX + 'to_drop_cols', pd.DataFrame())

In [None]:
# Numeric features are whatever columns remain in lagged_data after removing:
#   * the columns flagged for dropping by the lag stage,
#   * the key/time columns and the derived keys,
#   * 'item_category_id'.
# Sorting makes the feature order deterministic (set iteration order is not).
# NOTE(review): 'target' ends up in this list unless it is among the flagged
# columns — verify that to_drop_cols covers it.
numeric_features = sorted(
    set(lagged_data.columns)
    - set(to_drop_cols.col_name)
    - set(COLUMNS.KEYS_AND_TIME)
    - set(COLUMNS.DERIVED_KEYS)
    - {'item_category_id'}
)
print(numeric_features)

In [None]:
# Categorical features: the shop identifier followed by every derived key.
categorical_features = ['shop_id', *COLUMNS.DERIVED_KEYS]
print(categorical_features)

In [None]:
# Build the feature mapper for the chosen categorical/numeric columns, fit it
# on the lagged data (minus the columns flagged for dropping) and encode the
# whole frame in one pass. The result is cast to float32.
# NOTE(review): fit_transform presumably returns a NumPy array with one row
# per input row, in the same order — the later boolean-mask split relies on it.
mapper = build_features.create_mapper(categorical_features, numeric_features)
mapped_data = (
    mapper
    .fit_transform(lagged_data.drop(columns=to_drop_cols.col_name.values))
    .astype(np.float32)
)
# To inspect the generated feature names, evaluate: mapper.transformed_names_

In [None]:
# Sanity check: show the dtype of the encoded matrix (it was cast to float32
# in the previous cell).
mapped_data.dtype

In [None]:
# Boolean row masks over lagged_data:
#   * train: every block up to and including the last training block;
#   * test:  the single block right after it (assigned when the grid was built).
# NOTE: max_train_date_block_num is defined in the "Create grid" section and
# is not part of any checkpoint, so that cell must have been run in this
# kernel session.
train_indices = lagged_data['date_block_num'].le(max_train_date_block_num)
test_indices = lagged_data['date_block_num'].eq(max_train_date_block_num + 1)

In [None]:
# Training labels: the 'target' column of the training rows, flattened to 1-D.
target_col = 'target'
y_train = np.ravel(lagged_data.loc[train_indices, [target_col]].values)

# Split the encoded matrix with the masks computed on lagged_data. This relies
# on mapped_data having one row per lagged_data row, in the same order.
X_train = mapped_data[train_indices]
X_test = mapped_data[test_indices]

In [None]:
# Time block of every training row, kept as a one-column frame and saved with
# the other processed artifacts.
dates_train = lagged_data.loc[train_indices, ['date_block_num']]

# (shop_id, item_id) key pairs of the test rows, stored under the name
# 'predictions'.
predictions = lagged_data.loc[test_indices, ['shop_id', 'item_id']]

In [None]:
# Persist every model-ready artifact in the processed-data folder:
# the encoded train/test matrices, the training labels, the time block of
# each training row, and the test-row key pairs.
io.save_data(FOLDERS.PROCESSED, CTX + 'X_train', X_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'X_test', X_test)
io.save_data(FOLDERS.PROCESSED, CTX + 'y_train', y_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'dates_train', dates_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'predictions', predictions)

In [None]:
# Everything has been saved: unbind the two large intermediates and run a
# garbage-collection pass.
del lagged_data, mapped_data
gc.collect()

In [None]:
# Final visual check of the training matrix. The `del` in the previous cell
# only unbound the names lagged_data and mapped_data; X_train is still bound.
X_train