In [51]:
import pandas as pd
import numpy as np
from itertools import product
import gc
from sklearn.preprocessing import LabelEncoder
import sys

In [52]:
data_path = '..\\Data\\'
# Read all files
sales            = pd.read_csv(data_path + 'sales_train.csv')
items            = pd.read_csv(data_path + 'items.csv')
item_categories  = pd.read_csv(data_path + 'item_categories_translated.csv')
shops            = pd.read_csv(data_path + 'shops_translated.csv')
test             = pd.read_csv(data_path + 'test.csv')

In [53]:
def downcast(df):

    '''
    Changes column types in the dataframe:  
        > 'bool'    type to 'int8'
        > 'float64' type to 'float16'
        > 'int64'   type to 'int16'
    '''

    # Select columns to downcast
    float_cols = [x for x in df if df[x].dtype == 'float64']
    int_cols   = [x for x in df if df[x].dtype == 'int64']
    bool_cols  = [x for x in df if df[x].dtype == 'bool']
    # Downcast
    df[bool_cols] = df[bool_cols].astype(np.int8)
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols]   = df[int_cols].astype(np.int16)
    
    return df

### Shop's dataframe preprocessing



In [54]:
shops['city'] = shops['shop_name'].apply(lambda x: x.replace('!','').split()[0])

In [55]:
# Encoding our city feature
label_encoder = LabelEncoder()
shops['city'] = label_encoder.fit_transform(shops['city'])

### Item's dataframe preprocessing

In [56]:
# Create a feature with the 1st-ever-sale date for every item
items['first_sale_date'] = sales.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items['first_sale_date'] = items['first_sale_date'].fillna(34)

### Item_categories' dataframe preprocessing

In [57]:
# Create a major feature
item_categories['major'] = item_categories['item_category_name'].apply(lambda x: x.split()[0])  

In [58]:
# Replace with 'etc' if category count is less than 5
item_categories['major'] = item_categories['major'].apply(lambda x: x if len(item_categories[item_categories['major']==x]) >= 5 else 'other')

In [59]:
# Encoding our major variable
label_encoder = LabelEncoder()
item_categories['major'] = label_encoder.fit_transform(item_categories['major'])

### Sale's dataframe preprocessing

In [60]:
# Outliers removal
sales = sales[(sales['item_price'] > 0) & (sales['item_price'] < 50000)]
sales = sales[(sales['item_cnt_day'] > 0) & (sales['item_cnt_day'] < 1000)]

In [61]:
# As we saw on EDA some shops are duplicated w/distinct shop_id and shop_name
print(f'> {shops.iloc[0,0]} (id: {shops.iloc[0,1]})         <--->  \
{shops.iloc[57,0]} (id: {shops.iloc[57,1]})\n\
> {shops.iloc[1,0]} (id: {shops.iloc[1,1]}) <--->  \
{shops.iloc[58,0]} (id: {shops.iloc[58,1]})\n\
> {shops.iloc[10,0]} (id: {shops.iloc[10,1]})             <--->  \
{shops.iloc[11,0]} (id: {shops.iloc[11,1]})\n\
> {shops.iloc[39,0]} (id: {shops.iloc[39,1]})       <--->  \
{shops.iloc[40,0]} (id: {shops.iloc[40,1]})\n')

> 0 (id: ! Yakutsk Ordzhonikidze, 56 fran)         <--->  57 (id: Yakutsk Ordzhonikidze, 56)
> 1 (id: ! Yakutsk shopping center "Central" Fran) <--->  58 (id: Yakutsk shopping center "Central")
> 10 (id: Zhukovsky st. Chkalova 39m?)             <--->  11 (id: Zhukovsky st. Chkalova 39m²)
> 39 (id: Rostnone TRK "Megacentr Horizont")       <--->  40 (id: Rostov on the Don TRK "Megcenter Horizon" island)



In [62]:
# We correct this cases
sales['shop_id'] = sales['shop_id'].replace({0: 57, 1: 58, 11: 10, 40: 39})
test['shop_id'] = test['shop_id'].replace({0: 57, 1: 58, 11: 10, 40: 39})

In [63]:
# Some of the shops that are in the sales dataframe are not in the Test set. 
# We Leak to improve predictions
shops_on_test = test['shop_id'].unique()
sales = sales[sales['shop_id'].isin(shops_on_test)]

### Preprocessing

As seen in the EDA, Test set is a result of all posible combinations of shops + items for a given month (a cartesian product), we'll mimic that approach in our train set for every month

In [64]:
index_cols = ['date_block_num','shop_id', 'item_id']

# We get the cartesian product for every month in the train set
combinations = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops     = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items     = sales[sales['date_block_num']==block_num]['item_id'].unique()
    combinations.append(np.array(list(product([block_num],cur_shops, cur_items))))

In [65]:
# Turn the combinations into a pandas dataframe
combinations = pd.DataFrame(np.vstack(combinations), columns = index_cols)

# Generate aggregated features on item_cnt_day and item_price
gb = sales.groupby(index_cols).agg({'item_cnt_day':'sum', 'item_price':'mean'})
gb = gb.reset_index()
gb.rename(columns={'item_cnt_day':'item_cnt_month',\
                   'item_price':'item_price_mean'}, inplace=True)

# Join the data to the combinations
df = pd.merge(combinations,gb,how='left',on=index_cols)

# Generate a feature for the number of items sold
gb = sales.groupby(index_cols).agg({'item_cnt_day': 'count'})
gb = gb.reset_index()
gb = gb.rename(columns={'item_cnt_day': 'item_count'})

df = pd.merge(df, gb, on=index_cols, how='left')


del combinations, gb, sales
gc.collect();

### Consolidate a full Dataframe

In [66]:
# Lets append to the DF the Test data
test['date_block_num'] = 34
df = pd.concat([df, test.drop('ID',axis=1)], ignore_index=True,\
               keys=index_cols)
df.fillna(0,inplace=True)

In [67]:
shops.drop(['shop_name','Unnamed: 0'],axis=1,inplace=True)
item_categories.drop(['item_category_name','Unnamed: 0'],axis=1,inplace=True)
items.drop(['item_name'],axis=1,inplace=True)

# get the final merged df
df = df.merge(shops, on='shop_id', how='left')
df = df.merge(items, on='item_id', how='left')
df = df.merge(item_categories, on='item_category_id', how='left')


df = downcast(df)

del shops, items, item_categories
gc.collect();

## Feature Engineering

### Mean Encoding

In [68]:
def mean_encodings(df, features):

    '''
    Create mean encodings on item_cnt_month based on the features passed
    '''

    temporal = df.groupby(features).agg({'item_cnt_month': 'mean'})
    temporal.reset_index(inplace=True)
    names = features + ['mean_sales']
    temporal.rename(columns={'item_cnt_month':'_'.join(names)}, inplace=True)

    #Mege with the dataset
    df = pd.merge(df, temporal, on=features, how='left')
    df = downcast(df)

    del temporal
    gc.collect();

    return df

In [69]:
df = mean_encodings(df,['date_block_num','item_id'])
df = mean_encodings(df,['date_block_num','item_id','city'])
df = mean_encodings(df,['date_block_num','shop_id','item_category_id'])

### Create Lag Features

In [70]:
def lag_features(df, index_features,lag_feature):

    '''
    Create lag features for -1,-2,-3 months on the index_features passed
    '''

    df_temp = df[index_features + [lag_feature]].copy() 

    # Create lag features
    for month in range(1, 4):
        lag_name = lag_feature +'_lag' + str(month)
        df_temp.columns = index_features + [lag_name]

        # Add 1 to date_block_num to merge df_temp with the passed df
        df_temp['date_block_num'] += month

        # Merge df with df_temp based on idx_feature
        df = df.merge(df_temp.drop_duplicates(),on=index_features,how='left')

        # Fillna with 0 
        df[lag_name] = df[lag_name].fillna(0)

        df = downcast(df)


    del df_temp
    gc.collect()
  
    return df

In [71]:
#Features to lag
features = ['item_price_mean','item_cnt_month','item_count', 'date_block_num_item_id_mean_sales',\
                                                                  'date_block_num_item_id_city_mean_sales']

for feat in features:
    df = lag_features(df, ['date_block_num', 'shop_id', 'item_id'], feat)

df = lag_features(df, ['date_block_num', 'shop_id', 'item_category_id'],
      'date_block_num_shop_id_item_category_id_mean_sales')

In [72]:
# Remove data from the first 2 months as they will have no lagged data
df = df.drop(df[df['date_block_num'] < 3].index)

### Final Feature Engineering

In [73]:
# Generate item_cnt_month mean from the last 3 months 
df['3_Months_Mean'] = df[['item_cnt_month_lag1', 'item_cnt_month_lag2', 'item_cnt_month_lag3']].mean(axis=1)

As we saw in the EDA file, the distribution from the Train set shows that the item_cnt_day is concentrated arround in the range [0,20]<br>
> We are going to clip this values respecting this insight (increasing our prediction score +10%)

In [74]:
clip = ['item_cnt_month', 'item_cnt_month_lag1',
       'item_cnt_month_lag2', 'item_cnt_month_lag3', 'item_count_lag1',
       'item_count_lag2', 'item_count_lag3',
       'date_block_num_item_id_mean_sales_lag1',
       'date_block_num_item_id_mean_sales_lag2',
       'date_block_num_item_id_mean_sales_lag3',
       'date_block_num_item_id_city_mean_sales_lag1',
       'date_block_num_item_id_city_mean_sales_lag2',
       'date_block_num_item_id_city_mean_sales_lag3',
       'date_block_num_shop_id_item_category_id_mean_sales_lag1',
       'date_block_num_shop_id_item_category_id_mean_sales_lag2',
       'date_block_num_shop_id_item_category_id_mean_sales_lag3','3_Months_Mean']

for col in clip:
    df[col] = df[col].clip(0,20)

In [75]:
# Generate percentage evolution from month to month
df['evolution_1'] = df['item_cnt_month_lag1']/df['item_cnt_month_lag2']
df['evolution_1'] = df['evolution_1'].replace([np.inf, -np.inf],np.nan).fillna(0)
df['evolution_2'] = df['item_cnt_month_lag2']/df['item_cnt_month_lag3']
df['evolution_2'] = df['evolution_2'].replace([np.inf, -np.inf],np.nan).fillna(0)

In [76]:
# Generate a binary future for the first_month of every item
df['first_month'] = df['first_sale_date'] == df['date_block_num']

In [77]:
# Generate a permanence feature
df['permanence'] = df['date_block_num'] - df['first_sale_date']

In [78]:
# Generate the n° of month of every instance
df['month'] = df['date_block_num']%12

In [79]:
# Drop the columns from that bring data from the current month and make final
# dataframe preprocessing
toDrop = ['item_price_mean', 'item_count',\
          'date_block_num_shop_id_item_category_id_mean_sales',\
          'date_block_num_item_id_city_mean_sales',\
          'date_block_num_item_id_mean_sales','first_sale_date']

df.drop(toDrop, axis=1, inplace=True)
df = downcast(df)
df.rename(columns={'item_cnt_month':'target'}, inplace=True)

In [80]:
df.to_pickle(data_path + 'df_processed.pkl')