In [44]:
import pandas as pd
import numpy as np

from utils import shop_city_mapping

---

# Loading Files

---

### Train Data

In [45]:
train_raw = pd.read_csv('data/sales_train.csv')
items = pd.read_csv('data/items_en.csv')
categories = pd.read_csv('data/item_categories_en.csv')
shops = pd.read_csv('data/shops_en.csv')

### Test Data

In [46]:
test = pd.read_csv('data/test.csv')
test.shape

(214200, 3)

In [47]:
train_raw

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


---

# Cleaning Data





---

### Shops

We observed that shop 10 & 11 are referring to the same shop. Let's regroup them under shop ID 10

In [48]:
train_raw.loc[train_raw['shop_id']==11,'shop_id'] = 10

### Items

We know that some items sold might be returned and thus recorded as a negative sale <br>
We should decide later on if and how to deal with that <br>
At this stage we will only remove the few cases we observed with aggregated negative volume since they are not in the test set


In [49]:
train_raw.shape

(2935849, 6)

In [50]:
item_stats = train_raw.groupby('item_id').agg({'item_cnt_day':'sum'})
item_to_remove = item_stats[item_stats['item_cnt_day']<0].index.to_list()
train_raw = train_raw.loc[~train_raw['item_id'].isin(item_to_remove)]

In [51]:
train_raw.shape

(2935838, 6)

We also noticed in the EDA that there is one occurence of item with a negative price. Let's fix it.

In [52]:
train_raw.loc[train_raw['item_price']==-1,'item_price'] = 2499

---

# Grouping data to match prediction scope

We will group data by month - item - shop

---

In [53]:
train_raw.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [54]:
train = train_raw.groupby(['date_block_num','item_id','shop_id']).agg({'item_cnt_day':'sum', 'item_price':'mean'}).reset_index()
train.shape

(1609116, 5)

In [55]:
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price
0,0,19,25,1.0,28.0
1,0,27,1,1.0,1890.0
2,0,27,2,1.0,2499.0
3,0,27,10,1.0,1890.0
4,0,27,19,1.0,2499.0


---

# Gathering all data

---

In [56]:
train = train.merge(items, on = 'item_id', how = 'left').merge(shops, on='shop_id', how = 'left').merge(categories, on = 'item_category_id', how ='left')
train.shape

(1609116, 12)

In [57]:
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_name,item_category_id,item_name_en,shop_name,shop_name_en,item_category_name,item_category_name_en
0,0,19,25,1.0,28.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,/Gold collection M/F-72,"Москва ТРК ""Атриум""",Moscow - Atrium,Кино - DVD,Movie - DVD
1,0,27,1,1.0,1890.0,"007 Legends [PS3, русская версия]",19,"007 legends [PS3, Russian version]","!Якутск ТЦ ""Центральный"" фран",Yakutsk - Central Mall,Игры - PS3,Games- PS3
2,0,27,2,1.0,2499.0,"007 Legends [PS3, русская версия]",19,"007 legends [PS3, Russian version]","Адыгея ТЦ ""Мега""",Adygea - Mega Mall,Игры - PS3,Games- PS3
3,0,27,10,1.0,1890.0,"007 Legends [PS3, русская версия]",19,"007 legends [PS3, Russian version]",Жуковский ул. Чкалова 39м?,Moscow - Jukovski Airport,Игры - PS3,Games- PS3
4,0,27,19,1.0,2499.0,"007 Legends [PS3, русская версия]",19,"007 legends [PS3, Russian version]","Курск ТЦ ""Пушкинский""",Kursk - Pushkinsky Mall,Игры - PS3,Games- PS3


---

# Feature Engineering

There are ways to approach this problem:
- We could approach it as pure time-series issue where and use a statistical model or multiple (let's say at item category level) to forecast sales volume globally using trends and seasonality. And then figure a way to distribute the result over shops and items.
- A second approach is to use a supervised machine-learning approach where our target is the current month volume and features include item, shop, category etc... It will not capture trends as well as the first approach but it will provide results with the required granularity

We will start with the second approach

---

###  Item Recency

Number of months since first observed sale for an item

In [58]:
train = train.sort_values(by=['item_id','date_block_num'])
item_first_month = train.drop_duplicates('item_id', keep = 'first')[['item_id','date_block_num']]
item_first_month.columns = ['item_id','item_first_month']
train = train.merge(item_first_month, on = 'item_id', how = 'left')
train['item_Recency'] = train['date_block_num'] - train['item_first_month']

###  Month of the year
We know there is a seasonality so we can capture it with month variable

In [59]:
train['month'] = train['date_block_num'].mod(12)+1
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_name,item_category_id,item_name_en,shop_name,shop_name_en,item_category_name,item_category_name_en,item_first_month,item_Recency,month
0,20,0,54,1.0,58.0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,!In the power of obsession (plast.) D,"Химки ТЦ ""Мега""",Moscow - Mega Khimki,Кино - DVD,Movie - DVD,20,0,9
1,15,1,55,2.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,0,4
2,18,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,3,7
3,19,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,4,8
4,20,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,5,9


###  Shop City

In [60]:
train['shop_city'] = train['shop_name_en'].str.extract('(.*) -')
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_name,item_category_id,item_name_en,shop_name,shop_name_en,item_category_name,item_category_name_en,item_first_month,item_Recency,month,shop_city
0,20,0,54,1.0,58.0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,!In the power of obsession (plast.) D,"Химки ТЦ ""Мега""",Moscow - Mega Khimki,Кино - DVD,Movie - DVD,20,0,9,Moscow
1,15,1,55,2.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,0,4,Internet
2,18,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,3,7,Internet
3,19,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,4,8,Internet
4,20,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,5,9,Internet


###  Window Data

We want to capture information of the preceeding month

In [61]:
def get_info_previous_month(df):
    data = df.sort_values(by = ['item_id','shop_id','date_block_num'])
    data['item_id_prev'] = data['item_id'].shift(1)
    data['has_previous_data'] = (data['item_id_prev'] == data['item_id']).astype(int)
    data.loc[data['has_previous_data'] == 1, 'item_cnt_day_prev'] = data['item_cnt_day']  
    data.loc[data['has_previous_data'] == 1, 'item_price_prev'] = data['item_price']
    data.loc[data['has_previous_data'] == 0, 'item_cnt_day_prev'] = 0  
    data.loc[data['has_previous_data'] == 0, 'item_price_prev'] = 0
    return data

In [62]:
train =  get_info_previous_month(train)
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_name,item_category_id,item_name_en,shop_name,shop_name_en,item_category_name,item_category_name_en,item_first_month,item_Recency,month,shop_city,item_id_prev,has_previous_data,item_cnt_day_prev,item_price_prev
0,20,0,54,1.0,58.0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,!In the power of obsession (plast.) D,"Химки ТЦ ""Мега""",Moscow - Mega Khimki,Кино - DVD,Movie - DVD,20,0,9,Moscow,,0,0.0,0.0
1,15,1,55,2.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,0,4,Internet,0.0,0,0.0,0.0
2,18,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,3,7,Internet,1.0,1,1.0,4490.0
3,19,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,4,8,Internet,1.0,1,1.0,4490.0
4,20,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Number),15,5,9,Internet,1.0,1,1.0,4490.0


###  Encoding

- We will need to encode the city variable
- Also, we have three id type variables which are categorical (item_id, shop_id and item_category_id) -> These may need encoding as well, depending on the model chosen, as they could be confused with ordinal data.

In [63]:
train['city_code'] = train['shop_city'].map(shop_city_mapping)
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_name,item_category_id,item_name_en,shop_name,shop_name_en,...,item_category_name_en,item_first_month,item_Recency,month,shop_city,item_id_prev,has_previous_data,item_cnt_day_prev,item_price_prev,city_code
0,20,0,54,1.0,58.0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,!In the power of obsession (plast.) D,"Химки ТЦ ""Мега""",Moscow - Mega Khimki,...,Movie - DVD,20,0,9,Moscow,,0,0.0,0.0,1
1,15,1,55,2.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,...,Programs - For Home and Office (Number),15,0,4,Internet,0.0,0,0.0,0.0,5
2,18,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,...,Programs - For Home and Office (Number),15,3,7,Internet,1.0,1,1.0,4490.0,5
3,19,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,...,Programs - For Home and Office (Number),15,4,8,Internet,1.0,1,1.0,4490.0,5
4,20,1,55,1.0,4490.0,!ABBYY FineReader 12 Professional Edition Full...,76,! Abbyy Finereader 12 Professional Edition Ful...,Цифровой склад 1С-Онлайн,Internet - Digital Warehouse 1C Online,...,Programs - For Home and Office (Number),15,5,9,Internet,1.0,1,1.0,4490.0,5


###  Filtering

- Removing unneeded variables

In [64]:
col = ['date_block_num','item_id','shop_id','item_cnt_day','item_price','item_category_id','item_Recency','month','item_cnt_day_prev','item_price_prev','city_code']
train = train[col]

In [65]:
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,item_price,item_category_id,item_Recency,month,item_cnt_day_prev,item_price_prev,city_code
0,20,0,54,1.0,58.0,40,0,9,0.0,0.0,1
1,15,1,55,2.0,4490.0,76,0,4,0.0,0.0,5
2,18,1,55,1.0,4490.0,76,3,7,1.0,4490.0,5
3,19,1,55,1.0,4490.0,76,4,8,1.0,4490.0,5
4,20,1,55,1.0,4490.0,76,5,9,1.0,4490.0,5


###  Exporting New Train Set

- Removing unneeded variables

In [66]:
train.to_csv("data/train_processed.csv", index=False)