In [1]:
import pandas as pd
import numpy as np

In [44]:
# Interim lookup tables; the "appearence" columns are parsed as datetimes so
# they can later be subtracted from sale dates.
shops = pd.read_csv(
    '../data/interim/shops_appearence.csv',
    parse_dates=['shop_appearence'],
)
items = pd.read_csv(
    '../data/interim/items_appearence.csv',
    parse_dates=['item_appearence'],
)
item_cat = pd.read_csv('../data/interim/item_categories_global.csv')

# Daily sales records plus the pre-computed sales pivot (its monthly columns
# are renamed to item_count_<month> further down).
sales = pd.read_csv(
    '../data/interim/sales_train_etl.csv',
    parse_dates=['date'],
)
sales_pivot = pd.read_csv('../data/interim/sales_pivot.csv')

# Registry of lookup tables; more entries are added as features are built.
datasets = dict(shops=shops, items=items, item_categories=item_cat)

In [45]:
# Pre-split daily sales. The file names suggest 2013-06..2015-06 for training
# and 2015-07 for validation.
train_data = pd.read_csv(
    '../data/interim/train_2013-6_to_2015-6.csv',
    parse_dates=['date'],
)
val_data = pd.read_csv(
    '../data/interim/val_2015-7.csv',
    parse_dates=['date'],
)

In [9]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-06-16,5,30,11496,399.0,1.0
1,2013-06-14,5,30,11244,149.0,1.0
2,2013-06-06,5,30,11388,898.85,2.0
3,2013-06-15,5,30,11249,399.0,1.0
4,2013-06-13,5,30,8081,299.0,1.0


# New features

* how many unique items the shop sells in the current month
* how many shops sell this particular item in the current month
* what the item's median price was last month
* how many items in total were sold in the shop last month

In [49]:
# Number of distinct items each shop sold in each month.
unique_items_in_shop = (
    sales
    .groupby(['shop_id', 'date_block_num'])['item_id']
    .nunique()
    .rename('items_selling')
    .reset_index()
)

# Register the table for the merge stage and persist it alongside the other
# interim files.
datasets['unique_items_in_shop'] = unique_items_in_shop
unique_items_in_shop.to_csv(
    '../data/interim/unique_items_in_shop.csv',
    index=False,
)

In [50]:
# Number of distinct shops that sold each item in each month.
unique_shops_sell_item = (
    sales
    .groupby(['item_id', 'date_block_num'])['shop_id']
    .nunique()
    .rename('sold_in_shops')
    .reset_index()
)

# Register the table for the merge stage and persist it alongside the other
# interim files.
datasets['unique_shops_sell_item'] = unique_shops_sell_item
unique_shops_sell_item.to_csv(
    '../data/interim/unique_shops_sell_item.csv',
    index=False,
)

In [51]:
# Median price of every (month, item, shop) combination that has sales.
price_month = (
    sales
    .groupby(['date_block_num', 'item_id', 'shop_id'])
    .agg({'item_price': 'median'})
    .reset_index()
)

# One row per (item, shop), one column per month; months without sales get 0.
# The integer month labels 0..33 become price_0..price_33.
price_pivot = (
    pd.pivot_table(
        price_month,
        values='item_price',
        index=['item_id', 'shop_id'],
        columns=['date_block_num'],
    )
    .fillna(0)
    .reset_index()
    .rename(columns={month: f'price_{month}' for month in range(34)})
)

In [52]:
# Total number of units each shop sold in each month.
shop_sales_month = (
    sales
    .groupby(['date_block_num', 'shop_id'])
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_sold_month'})
)

# One row per shop, one column per month; months without sales get 0.
# The integer month labels 0..33 become total_count_0..total_count_33.
shop_total_sales_pivot = (
    pd.pivot_table(
        shop_sales_month,
        values='item_sold_month',
        index=['shop_id'],
        columns=['date_block_num'],
    )
    .fillna(0)
    .reset_index()
    .rename(columns={month: f'total_count_{month}' for month in range(34)})
)

In [56]:
# The monthly columns of sales_pivot are looked up by their string labels
# ('0', '1', ...), hence the str() keys.
sales_pivot = sales_pivot.rename(
    columns={str(month): f'item_count_{month}' for month in range(34)}
)

# Combine per-(item, shop) sales counts with shop-level totals and
# per-(item, shop) prices into a single wide lookup table, then persist it.
pivot = (
    sales_pivot
    .merge(shop_total_sales_pivot, on=['shop_id'])
    .merge(price_pivot, on=['item_id', 'shop_id'])
)
pivot.to_csv('../data/interim/total_pivot.csv', index=False)

(418199, 36)
(418199, 104)


In [59]:
# Expose the wide monthly pivot to the merge stage under the 'lags' key.
datasets.update(lags=pivot)

# Incorporating new features into the feature extraction pipeline

In [61]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [64]:
class MonthFromDate(BaseEstimator, TransformerMixin):
    """Append a cyclical (sine/cosine) encoding of the month.

    The month number is derived from ``date_block_num`` as
    ``date_block_num % 12 + 1`` and mapped onto the unit circle, so that
    month 12 and month 1 end up next to each other.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``month_sin`` and ``month_cos`` added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a numeric ``date_block_num`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` itself is left untouched.
        """
        month = X['date_block_num'] % 12 + 1
        angle = 2 * np.pi * month / 12.0

        # Work on a copy so the caller's frame is not modified as a side
        # effect of calling transform().
        out = X.copy()
        out['month_sin'] = np.sin(angle)
        out['month_cos'] = np.cos(angle)
        return out
    
class ShopItemAge(BaseEstimator, TransformerMixin):
    """Replace raw dates with the age of the shop and of the item.

    Ages are the number of days between ``date`` and the respective
    ``*_appearence`` column, divided by ``DAYS_PER_MONTH``.
    """

    # Nominal month length used to convert day differences into months.
    DAYS_PER_MONTH = 30

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with age columns instead of the date columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain datetime columns ``date``, ``shop_appearence`` and
            ``item_appearence``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            ``X`` without the three date columns and with
            ``shop_age_month`` and ``item_age_month`` added. ``X`` itself is
            left untouched.
        """
        shop_age = (X['date'] - X['shop_appearence']).dt.days / self.DAYS_PER_MONTH
        item_age = (X['date'] - X['item_appearence']).dt.days / self.DAYS_PER_MONTH

        # drop() returns a new frame, so the assignments below never touch
        # the caller's data.
        out = X.drop(['shop_appearence', 'item_appearence', 'date'], axis=1)
        out['shop_age_month'] = shop_age
        out['item_age_month'] = item_age
        return out
    
class MonthlySales(BaseEstimator, TransformerMixin):
    """Collapse daily sales rows into one row per (month, item, shop)."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Sum ``item_cnt_day`` and keep the latest ``date`` per group.

        The summed count is returned under the name ``item_cnt_month``.
        """
        group_keys = ['date_block_num', 'item_id', 'shop_id']
        aggregations = {'item_cnt_day': 'sum', 'date': 'max'}

        monthly = X.groupby(group_keys).agg(aggregations).reset_index()
        return monthly.rename(columns={'item_cnt_day': 'item_cnt_month'})

class MergeTables(BaseEstimator, TransformerMixin):
    """Join the monthly sales frame with all lookup tables.

    Parameters
    ----------
    lookup_tables : dict of str -> pandas.DataFrame
        Must provide the keys ``'items'``, ``'shops'``,
        ``'item_categories'``, ``'unique_items_in_shop'``,
        ``'unique_shops_sell_item'`` and ``'lags'``.
    """

    def __init__(self, lookup_tables):
        # scikit-learn's get_params()/clone()/repr read every constructor
        # argument back from an attribute of the same name, so the dict has
        # to be stored as ``lookup_tables``.
        self.lookup_tables = lookup_tables

        # Per-table attributes kept for backward compatibility; they also
        # make a missing table fail at construction time with a KeyError.
        self.items = lookup_tables['items']
        self.shops = lookup_tables['shops']
        self.item_cat = lookup_tables['item_categories']
        self.items_in_shop = lookup_tables['unique_items_in_shop']
        self.shops_sell_item = lookup_tables['unique_shops_sell_item']
        self.lags = lookup_tables['lags']

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` joined with the lookup tables.

        All joins except the last use ``merge``'s default join type, so rows
        without a match in a lookup table are dropped. The ``'lags'`` table
        is left-joined, which keeps unmatched rows with missing values.
        The descriptive text columns are removed from the result.
        """
        # Read from the stored parameter so set_params(lookup_tables=...)
        # takes effect.
        tables = self.lookup_tables

        merged = X.merge(tables['items'], on='item_id')
        merged = merged.merge(tables['unique_shops_sell_item'], on=['date_block_num', 'item_id'])
        merged = merged.merge(tables['item_categories'], on='item_category_id')
        merged = merged.merge(tables['shops'], on='shop_id')
        merged = merged.merge(tables['unique_items_in_shop'], on=['date_block_num', 'shop_id'])
        merged = merged.merge(tables['lags'], on=['item_id', 'shop_id'], how='left')

        return merged.drop(
            ['item_name', 'shop_name', 'item_category_name', 'city', 'global_cat'],
            axis=1,
        )

# Stage order matters: the daily rows are aggregated first, the lookup tables
# are merged in before the ages step, which needs the appearance dates.
extraction_steps = [
    ("montly_sales", MonthlySales()),
    ("month_feat", MonthFromDate()),
    ("merge", MergeTables(datasets)),
    ("ages", ShopItemAge()),
]

feature_extraction_pipeline = Pipeline(extraction_steps)

In [82]:
class GetLagSales(BaseEstimator, TransformerMixin):
    """Add the item's sales count from ``lag`` months ago for each lag.

    Parameters
    ----------
    lags : iterable of int
        Month offsets; each produces one ``item_sales_lag_<lag>`` column.
    """

    def __init__(self, lags):
        self.lags = lags

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the lagged sales columns added.

        The reference month is read from the first row of
        ``date_block_num`` only, so ``X`` must be non-empty and should hold
        a single month. Lags that reach before month 0 are filled with 0.
        ``X`` itself is left untouched.
        """
        date_block = X['date_block_num'].iloc[0]

        # Copy so the caller's frame is not modified as a side effect.
        out = X.copy()
        for lag in self.lags:
            source_month = date_block - lag
            if source_month >= 0:
                out[f'item_sales_lag_{lag}'] = X[f'item_count_{int(source_month)}']
            else:
                out[f'item_sales_lag_{lag}'] = 0
        return out
    
class GetLagPrice(BaseEstimator, TransformerMixin):
    """Add the price from ``lag`` months ago and drop the raw monthly prices.

    Parameters
    ----------
    lags : iterable of int
        Month offsets; each produces one ``price_lag_<lag>`` column.
    """

    def __init__(self, lags):
        self.lags = lags

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with lagged prices instead of ``price_<n>``.

        The reference month is read from the first row of
        ``date_block_num`` only, so ``X`` must be non-empty and should hold
        a single month. Lags that reach before month 0 are filled with 0.
        ``X`` itself is left untouched.
        """
        date_block = X['date_block_num'].iloc[0]

        # Find every raw monthly column by name instead of assuming exactly
        # 34 months: fewer months would raise a KeyError, and extra months
        # would otherwise stay in the output as features.
        prefix = 'price_'
        monthly_cols = [
            col for col in X.columns
            if isinstance(col, str)
            and col.startswith(prefix)
            and col[len(prefix):].isdigit()
        ]

        # drop() returns a new frame; lag values are read from the original.
        out = X.drop(monthly_cols, axis=1)
        for lag in self.lags:
            source_month = date_block - lag
            if source_month >= 0:
                out[f'price_lag_{lag}'] = X[f'price_{int(source_month)}']
            else:
                out[f'price_lag_{lag}'] = 0
        return out
    
class GetLagTotalSales(BaseEstimator, TransformerMixin):
    """Add the shop's total sales from ``lag`` months ago; drop raw totals.

    Parameters
    ----------
    lags : iterable of int
        Month offsets; each produces one ``total_sales_lag_<lag>`` column.
    """

    def __init__(self, lags):
        self.lags = lags

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with lagged totals instead of ``total_count_<n>``.

        The reference month is read from the first row of
        ``date_block_num`` only, so ``X`` must be non-empty and should hold
        a single month. Lags that reach before month 0 are filled with 0.
        ``X`` itself is left untouched.
        """
        date_block = X['date_block_num'].iloc[0]

        # Find every raw monthly column by name instead of assuming exactly
        # 34 months: fewer months would raise a KeyError, and extra months
        # would otherwise stay in the output as features.
        prefix = 'total_count_'
        monthly_cols = [
            col for col in X.columns
            if isinstance(col, str)
            and col.startswith(prefix)
            and col[len(prefix):].isdigit()
        ]

        # drop() returns a new frame; lag values are read from the original.
        out = X.drop(monthly_cols, axis=1)
        for lag in self.lags:
            source_month = date_block - lag
            if source_month >= 0:
                out[f'total_sales_lag_{lag}'] = X[f'total_count_{int(source_month)}']
            else:
                out[f'total_sales_lag_{lag}'] = 0
        return out
    
class GetSumOfSales(BaseEstimator, TransformerMixin):
    """Add rolling sums of the item's sales over the previous ``lag`` months.

    Parameters
    ----------
    lags : iterable of int
        Window lengths in months; each produces one ``sum_sales_lag_<lag>``
        column.
    """

    def __init__(self, lags):
        self.lags = lags

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the window sums added.

        The raw ``item_count_<n>`` columns and ``date_block_num`` are
        removed from the result. The reference month is read from the first
        row of ``date_block_num`` only, so ``X`` must be non-empty and
        should hold a single month. Months before month 0 are left out of
        the window. ``X`` itself is left untouched.
        """
        date_block = X['date_block_num'].iloc[0]

        # Find every raw monthly column by name instead of assuming exactly
        # 34 months: fewer months would raise a KeyError, and extra months
        # would otherwise stay in the output as features.
        prefix = 'item_count_'
        monthly_cols = [
            col for col in X.columns
            if isinstance(col, str)
            and col.startswith(prefix)
            and col[len(prefix):].isdigit()
        ]

        # drop() returns a new frame; the sums are computed from the original.
        out = X.drop(monthly_cols + ['date_block_num'], axis=1)
        for lag in self.lags:
            cols_to_sum = [
                f'item_count_{int(date_block - i)}'
                for i in range(1, lag + 1)
                if date_block - i >= 0
            ]
            out[f'sum_sales_lag_{lag}'] = X.loc[:, cols_to_sum].sum(axis=1)
        return out
    
    
# Order matters: GetSumOfSales runs last because it removes the
# item_count_<n> columns and date_block_num that the earlier steps read.
lag_steps = [
    ("past_month_sales", GetLagSales([1, 2, 3, 6, 12])),
    ("last_month_price", GetLagPrice([1])),
    ("last_month_total_sales", GetLagTotalSales([1, 3])),
    ("summed_sales", GetSumOfSales([6, 12])),
]

lag_pipeline = Pipeline(lag_steps)

In [83]:
# Run the extraction stages on the daily training rows: monthly aggregation,
# month encoding, lookup-table merge, and shop/item ages.
train = feature_extraction_pipeline.fit_transform(train_data)

((1184316, 121), Index(['date_block_num', 'item_id', 'shop_id', 'item_cnt_month', 'date',
       'month_sin', 'month_cos', 'item_name', 'item_category_id',
       'item_appearence',
       ...
       'price_24', 'price_25', 'price_26', 'price_27', 'price_28', 'price_29',
       'price_30', 'price_31', 'price_32', 'price_33'],
      dtype='object', length=121), False)


In [84]:
# The lag steps read the reference month from the first row of the frame they
# receive, so every month is processed separately and the pieces are stacked
# back together afterwards.
train_per_month = []
for block in train['date_block_num'].unique():
    month_rows = train.loc[train['date_block_num'] == block].copy()
    train_per_month.append(lag_pipeline.fit_transform(month_rows))

train = pd.concat(train_per_month, axis=0)

In [86]:
# Preview the first five rows of the final training feature table.
train.head(n=5)

Unnamed: 0,item_id,shop_id,item_cnt_month,month_sin,month_cos,item_category_id,sold_in_shops,global_cat_id,city_id,items_selling,...,item_sales_lag_1,item_sales_lag_2,item_sales_lag_3,item_sales_lag_6,item_sales_lag_12,price_lag_1,total_sales_lag_1,total_sales_lag_3,sum_sales_lag_6,sum_sales_lag_12
0,27,25,1.0,1.224647e-16,-1.0,19,2,5,15,3150,...,0.0,0.0,0.0,0.0,0.0,0.0,6866.0,9743.0,1.0,1.0
1,1397,25,1.0,1.224647e-16,-1.0,19,5,5,15,3150,...,0.0,0.0,0.0,0.0,0.0,0.0,6866.0,9743.0,0.0,0.0
2,1407,25,6.0,1.224647e-16,-1.0,19,20,5,15,3150,...,2.0,0.0,2.0,0.0,0.0,1299.0,6866.0,9743.0,17.0,17.0
3,1461,25,8.0,1.224647e-16,-1.0,19,38,5,15,3150,...,4.0,5.0,4.0,0.0,0.0,2499.0,6866.0,9743.0,13.0,13.0
4,1467,25,1.0,1.224647e-16,-1.0,19,25,5,15,3150,...,1.0,0.0,0.0,0.0,0.0,898.5,6866.0,9743.0,4.0,4.0
