# Imports

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import gc
import os
import pandas as pd

from src import CTX, SEED, FOLDERS
from src.features import COLUMNS, build_features

In [5]:
from src.data import io
observations = io.load_data(FOLDERS.PROCESSED, CTX + 'observations', pd.DataFrame())
observations

Loading ../data/processed/CC-Labs-hv_observations.h5


Unnamed: 0,Observation
0,"Tens of categories with very few items, a hand..."
1,First word of category name a good candidate f...
2,(Categories already sorted on category name)
3,First word of shop name a good candidate for s...
4,(Shops already sorted on shop name)
5,Submissions are evaluated by root mean squared...
6,"All shop, item, and category references are va..."
7,Training data does not have explicit NULLs
8,For each ID in the test set (shop_id/item_id c...
9,One in seven test data keys do not have entrie...


# Read raw data

In [None]:
from src.data import io
sales, test, items, categories, shops = io.read_raw(FOLDERS.RAW)

In [None]:
categories = build_features.add_super_category(categories)
shops = build_features.add_super_shop(shops)
train = build_features.enrich_sales(
    build_features.rollup_and_clip_sales(sales),
    shops, items, categories)
train.head()

# Create grid

In [None]:
max_train_date_block_num = train.date_block_num.max()
max_train_date_block_num

In [None]:
from tqdm import tqdm_notebook
all_data = build_features.create_grid(
    pd.concat([
        train.rename(columns={'item_cnt_month' : 'target'}).drop(columns=['shop_name', 'super_shop_name', 'item_name', 'item_category_name', 'super_category_name']),
        test.assign(date_block_num=max_train_date_block_num+1).drop(columns=['ID'])],
        ignore_index=True, sort=False),
    COLUMNS.KEYS_AND_TIME,
    tqdm_notebook)
print(len(all_data))
all_data.head()

In [None]:
from src.data import io
io.save_data(FOLDERS.INTERIM, CTX + 'grid', all_data)
del all_data
gc.collect()

# Create lags

In [10]:
from src.data import io
all_data = io.load_data(FOLDERS.INTERIM, CTX + 'grid', pd.DataFrame())

Loading ../data/interim/CC-Labs-hv_grid.h5


In [11]:
lagged_data, to_drop_cols = build_features.create_lags(
    all_data,
    COLUMNS.KEYS_AND_TIME + COLUMNS.DERIVED_KEYS,
    tqdm_notebook,
    shift_range = [1, 2])
lagged_data.head()

NameError: name 'tqdm_notebook' is not defined

In [None]:
to_drop_cols = pd.DataFrame(to_drop_cols, columns=['col_name'])
to_drop_cols.head()

In [None]:
from src.data import io
io.save_data(FOLDERS.INTERIM, CTX + 'lagged_data', lagged_data)
io.save_data(FOLDERS.PROCESSED, CTX + 'to_drop_cols', to_drop_cols)
del lagged_data, to_drop_cols
gc.collect()

In [7]:
del all_data
gc.collect()

48

# Encode and normalize

In [8]:
lagged_data = io.load_data(FOLDERS.INTERIM, CTX + 'lagged_data', pd.DataFrame())
to_drop_cols = io.load_data(FOLDERS.INTERIM, CTX + 'to_drop_cols', pd.DataFrame())

Loading ../data/interim/CC-Labs-hv_lagged_data.h5
Loading ../data/interim/CC-Labs-hv_to_drop_cols.h5


FileNotFoundError: File ../data/interim/CC-Labs-hv_to_drop_cols.h5 does not exist

In [None]:
numeric_features = sorted(list(set(list(lagged_data.columns.values)) 
                               - set(to_drop_cols) - set(COLUMNS.KEYS_AND_TIME) - set(COLUMNS.DERIVED_KEYS) - set(['item_category_id'])))
print(numeric_features)

In [None]:
#categorical_features = list(set(index_cols + ['item_category_id']) - set(['date_block_num']))
categorical_features = ['shop_id'] + COLUMNS.DERIVED_KEYS
print(categorical_features)

In [None]:
mapper = build_features.create_mapper(categorical_features, numeric_features)
mapped_data = mapper.fit_transform(lagged_data.drop(to_drop_cols, axis=1)).astype(np.float32)
#mapper.transformed_names_

In [None]:
#train = lagged_data.loc[lagged_data.date_block_num <= max_train_date_block_num]
#test_lagged = lagged_data.loc[lagged_data.date_block_num == max_train_date_block_num + 1]
train_indices = lagged_data.date_block_num <= max_train_date_block_num
test_indices = lagged_data.date_block_num == max_train_date_block_num + 1

In [None]:
X_train = mapped_data[train_indices]
X_test = mapped_data[test_indices]

target_col = 'target'
y_train = lagged_data.loc[train_indices, [target_col]].values.ravel()

In [None]:
dates_train=lagged_data.loc[train_indices,['date_block_num']]
#dates_train.head()
predictions = lagged_data.loc[test_indices, ['shop_id', 'item_id']]
#predictions.head()

In [None]:
from src.data import io
io.save_data(FOLDERS.PROCESSED, CTX + 'X_train', X_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'X_test', X_test)
io.save_data(FOLDERS.PROCESSED, CTX + 'y_train', y_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'dates_train', dates_train)
io.save_data(FOLDERS.PROCESSED, CTX + 'predictions', predictions)

In [None]:
del lagged_data
del mapped_data
gc.collect()