In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from googletrans import Translator
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN

from utils import plot_quantiles, plot_histograms, get_quantiles_from_values, categories_english, shops_english

In [2]:
# initialize_VPN(save=1)

In [3]:
settings = {
    "opsys": "Windows", "command": ["nordvpn", "-c", "-g"], 
    "settings": ['belgium', 'netherlands', 'germany', 'spain', 'france','United_States', 'Canada', 'Brazil', 'Argentina', 'Mexico', 'Chile', 'Costa_Rica', 'Australia'],
    "original_ip": "2a01:e0a:a4d:beb0:5944:c457:3e8f:b912",
    "cwd_path": "C:/Program Files/NordVPN"
           }

---

# Loading Data

---

### Main Data File

In [4]:
data = pd.read_csv('data/sales_train.csv')
data.shape

(2935849, 6)

In [5]:
data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


### Items Data

In [6]:
items = pd.read_csv('data/items.csv')
items.shape

(22170, 3)

In [7]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


### Item Categories Data

In [6]:
item_categories = pd.read_csv('data/item_categories.csv')
item_categories.shape

(84, 2)

In [7]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


### Shops Data

In [8]:
shops = pd.read_csv('data/shops.csv')
shops.shape

(60, 2)

In [9]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


<div class="alert alert-block alert-info">
- Dataset has 2.9 millions sales entries and 6 columns <br>
- There are 22k unique items referenced <br>
- There are 84 unique categories of items referenced <br>
- Products are distributed across 60 shops <br>

### Test Set

In [10]:
test = pd.read_csv('data/test.csv')
test.shape

(214200, 3)

In [11]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


---

# Data Exploration

Definitions :
- Counts : number of distinct items with at least one unit sold on a day
- Volumes : Counts * nb of units sold (item_cnt_day ) 
- Sales : Volumes * Price





---

### Missing Data

In [12]:
data.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

<div class="alert alert-block alert-info">
Dataset has no missing data

### Data Types

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


<div class="alert alert-block alert-info">
Dataset contains: <br>
- one datetime field<br>
- 3 integer ID fields<br>
- 2 float fields<br>

### Merge Data

In [14]:
data_enriched = data.merge(items, on = 'item_id', how ='left').merge(item_categories, on = 'item_category_id', how = 'left').merge(shops, on = 'shop_id', how = 'left')
data_enriched.shape

(2935849, 10)

In [15]:
data_enriched.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум"""


### Translate Name Fields

In [16]:
f = open('item_name_translation.json')
items_english = json.load(f)

In [17]:
start_time = time.time()
# translate category names 
data_enriched['item_category_name_en'] = data_enriched['item_category_name'].replace(categories_english, regex=True)
# translate shop names 
data_enriched['shop_name_en'] = data_enriched['shop_name'].replace(shops_english, regex=True)
# translate item names 
data_enriched['item_name_en'] = data_enriched['item_name'].map(items_english)
# adding a sales variable
data_enriched['sales'] = data_enriched['item_price'] * data_enriched['item_cnt_day']
print("{} seconds".format(time.time() - start_time))

380.9697074890137 seconds


In [22]:
data_enriched[data_enriched['item_name']=='! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D']

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name,item_category_name_en,shop_name_en,item_name_en,sales


In [20]:
items_english

{'! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D': '!In the power of obsession (plast.) D',
 '!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]': '! Abbyy Finereader 12 Professional Edition Full [PC, digital version]',
 '***В ЛУЧАХ СЛАВЫ (UNV) D': '*** In the rays of glory (unv) D',
 '***ГОЛУБАЯ ВОЛНА (Univ) D': '*** Blue wave (univ) D',
 '***КОРОБКА (СТЕКЛО) D': '*** Box (glass) D',
 '***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ (UNI) D': '*** New American graffiti (UNI) D',
 '***УДАР ПО ВОРОТАМ (UNI) D': '*** Back on goal (UNI) D',
 '***УДАР ПО ВОРОТАМ-2 (UNI) D': '*** Blow on goal-2 (UNI) D',
 '***ЧАЙ С МУССОЛИНИ D': '*** Tea with Mussolini D',
 '***ШУГАРЛЭНДСКИЙ ЭКСПРЕСС (UNI) D': '*** Shugarland express (UNI) D',
 '*ЗА ГРАНЬЮ СМЕРТИ D': '*Behind the facet of death d',
 '*ЛИНИЯ СМЕРТИ D': '*Death line D',
 '*МИХЕЙ И ДЖУМАНДЖИ Сука любовь': '*Micah and Jumanji Bitch Love',
 '*СПАСАЯ ЭМИЛИ D': '*Saving Emily D',
 '*ЧОКНУТЫЙ ПРОФЕССОР /МАГИЯ/ D': '*Film Professor / Magic / D',
 '//АДРЕНАЛИН: 

In [19]:
data_enriched.head(50)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name,item_category_name_en,shop_name_en,item_name_en,sales
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Movie - Blu-Ray,Yaroslavl - Altair,,999.0
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",Music - Vinyl,Moscow - Atrium,,899.0
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",Music - Vinyl,Moscow - Atrium,,-899.0
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",Music - Vinyl,Moscow - Atrium,,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум""",Music - CD Branded Production,Moscow - Atrium,,1099.0
5,10.01.2013,0,25,2564,349.0,1.0,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео,"Москва ТРК ""Атриум""",Music - Music Video,Moscow - Atrium,,349.0
6,02.01.2013,0,25,2565,549.0,1.0,DEEP PURPLE Stormbringer (фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум""",Music - CD Branded Production,Moscow - Atrium,,549.0
7,04.01.2013,0,25,2572,239.0,1.0,DEFTONES Koi No Yokan,55,Музыка - CD локального производства,"Москва ТРК ""Атриум""",Music - CD Local Production,Moscow - Atrium,,239.0
8,11.01.2013,0,25,2572,299.0,1.0,DEFTONES Koi No Yokan,55,Музыка - CD локального производства,"Москва ТРК ""Атриум""",Music - CD Local Production,Moscow - Atrium,,299.0
9,03.01.2013,0,25,2573,299.0,3.0,DEL REY LANA Born To Die,55,Музыка - CD локального производства,"Москва ТРК ""Атриум""",Music - CD Local Production,Moscow - Atrium,,897.0


### Dates

In [None]:
data['date'] = pd.to_datetime(data['date'], format = "%d.%m.%Y")
data['date_m'] = data['date'].dt.strftime('%Y-%m')
data['date_d'] = data['date'].dt.strftime('%Y-%m-%d')

In [None]:
print(data['date'].min())
print(data['date'].max())

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,3))
# Yearly Counts
data['date'].dt.year.value_counts().plot.bar(ax = ax1)
ax1.set_title('Yearly Counts')
# Yearly Volumes
data.groupby(data['date'].dt.year).agg({'item_cnt_day':'sum'}).plot.bar(ax = ax2)
ax2.set_title('Yearly Volumes')
ax2.legend().set_visible(False)
ax2.set(xlabel=None)
data.groupby(data['date'].dt.year).agg({'sales':'sum'}).plot.bar(ax = ax3)
# Yearly Sales
ax3.set_title('Yearly Sales')
ax3.legend().set_visible(False)
ax3.set(xlabel=None)

<div class="alert alert-block alert-info">
- Dates range from Jan 2013 up to October 2015 <br>
- Volume trend is globally decreasing with time <br>
- Sales trend is not aligned with volume trend and has peak in 2014. <br>
- This decorrelation between Sales and Volumes could be explained either by a difference in the products sold or by a price evolution

In [None]:
monthly_volumes = data.groupby(data['date_m']).agg({'item_cnt_day':'sum'})
monthly_sales = data.groupby(data['date_m']).agg({'sales':'sum'})

plot_histograms(monthly_volumes, monthly_sales, "Monthly Volumes", "Monthly Sales")

<div class="alert alert-block alert-info">
We can notice a seasonality with peak at year end <br>

In [None]:
top_volumes = data.groupby(data['date_d']).agg({'item_cnt_day':'sum'}).sort_values(by = 'item_cnt_day', ascending = False).head(50)
top_sales = data.groupby(data['date_d']).agg({'sales':'sum'}).sort_values(by = 'sales', ascending = False).head(50)

plot_histograms(top_volumes, top_sales, "Top 50 Volume Dates", "Top 50 Sales Dates", share_axis = False, height = 8)

<div class="alert alert-block alert-info">
Peak dates are mostly last days of the year<br>
29-11-2013 was an extreme high in sales

In [None]:
bottom_volumes = data.groupby(data['date_d']).agg({'item_cnt_day':'sum'}).sort_values(by = 'item_cnt_day', ascending = False).tail(50)
bottom_sales = data.groupby(data['date_d']).agg({'sales':'sum'}).sort_values(by = 'sales', ascending = False).tail(50)

plot_histograms(bottom_volumes, bottom_sales, "Bottom 50 Volume Dates", "Bottom 50 Sales Dates", share_axis = False, height = 8)

### Shops

In [None]:
data['shop_id'].nunique()

In [None]:
shop_volumes = data.groupby('shop_id').agg({'item_cnt_day':'sum'})
shop_sales = data.groupby('shop_id').agg({'sales':'sum'})

plot_histograms(shop_volumes, shop_sales, "Shops Global Volumes", "Shops Global Sales")

<div class="alert alert-block alert-info">
All 60 referenced shops are present in sales data <br>
There are strong discrepancies of volumes and sales between shops <br>

In [None]:
shop_rank_volumes = data.groupby('shop_id').agg({'item_cnt_day':'sum'}).sort_values(by ='item_cnt_day', ascending=False)
shop_rank_sales = data.groupby('shop_id').agg({'sales':'sum'}).sort_values(by ='sales', ascending=False)

plot_histograms(shop_rank_volumes, shop_rank_sales, "Shops Rank - Global Volumes", "Shops Rank - Global Sales", share_axis = False)

<div class="alert alert-block alert-info">
We can notice some difference in the ranks in volume and sales <br>
It could be explained either by <br>
    - a difference in the products sold<br>
    - a difference in the pricing <br>
    - both

In [None]:
monthly_shop = data.groupby(['shop_id','date_block_num']).agg({'item_cnt_day':'sum','sales':'sum'})

shop_rank_monthly_volumes = monthly_shop.groupby('shop_id').agg({'item_cnt_day':'mean'}).sort_values(by ='item_cnt_day', ascending=False)
shop_rank_monthly_sales = monthly_shop.groupby('shop_id').agg({'sales':'mean'}).sort_values(by ='sales', ascending=False)

plot_histograms(shop_rank_monthly_volumes, shop_rank_monthly_sales, "Shops Rank - Monthly Volumes", "Shops Rank - Monthly Sales", share_axis = False)

<div class="alert alert-block alert-info">
Looking at the average monthly figures we can observe a few shops going up in the rank (shops 0,1,9 for instance) <br>
This is probably due to a difference in the number of months with sales between shops <br>

In [None]:
f, ax = plt.subplots(figsize=(15,3))

# Shop Volumes
data.groupby(['shop_id']).agg({"date_block_num" : "nunique"}).plot.bar(ax=ax)
ax.legend().set_visible(False)
ax.set_title('Nb of sales months per Shop')

In [None]:
data.groupby(['shop_id']).agg({"date_block_num" : "nunique"})['date_block_num'].value_counts().sort_index(ascending = False)

<div class="alert alert-block alert-info">
Only half of the shops have sales records in all 34 months of observations <br>
10 shops have less than 10 months of data <br>
5 shops have less than 2 months of data <br>

#### Shops Market Share

In [None]:
shop_volumes

In [None]:
shop_volumes = shop_volumes.sort_values('item_cnt_day',ascending = False)
shop_volumes['volume_pct'] = round(shop_volumes['item_cnt_day'] / data['item_cnt_day'].sum()*100,2)
shop_volumes['volume_cum_pct'] = shop_volumes['volume_pct'].cumsum()
shop_volumes.head(20)

In [None]:
shop_sales = shop_sales.sort_values('sales',ascending = False)
shop_sales['sales_pct'] = round(shop_sales['sales'] / data['sales'].sum()*100,2)
shop_sales['sales_cum_pct'] = shop_sales['sales_pct'].cumsum()
shop_sales.head(20)

### Items 

In [None]:
print(data['item_id'].nunique())
print(items['item_id'].nunique())
print(test['item_id'].nunique())

In [None]:
all_items = set(items['item_id'].to_list())
data_items = set(data['item_id'].to_list())
missing = [item for item in all_items if item not in data_items]
len(missing)

In [None]:
test_items = set(test['item_id'].to_list())
print(len([a for a in test_items if a in missing]))
print(len(missing) / items.shape[0])

<div class="alert alert-block alert-info">
The test set contains 363 items that are not present in training set <br>
This represents 1.6% of all referenced items <br>
We'll need a solution for that

In [None]:
item_stats = data.groupby('item_id').agg({'item_cnt_day':'sum','sales':'sum','shop_id':'nunique','date_block_num':'nunique', 'item_price':'mean'}).reset_index()

#### Volume

In [None]:
item_stats['item_cnt_day'].describe().astype(int)

<div class="alert alert-block alert-info">
mean volume is 167 <br>
median volume is 33 <br>
max volume is 187,642

In [None]:
plot_quantiles(item_stats,'item_cnt_day', title='Item Volumes Quantiles')

In [None]:
values = [1, 10, 100, 1000, 2000]
get_quantiles_from_values(item_stats, 'item_cnt_day', values, 'volume')

In [None]:
item_stats.iloc[item_stats.item_cnt_day.idxmax()]

<div class="alert alert-block alert-info">
Item ID 20949 is the best selling Item in volume with 187,642 units sold for a total sales amount of 928,863 dollars  <br>
It is a cheap item with an average price of 4.91$ <br>
This item has sales record in 31 of the 34 observation months and was sold in 53 shops.

In [None]:
bug_id = item_stats[item_stats['item_cnt_day']<1]['item_id'].to_list()
data[data['item_id'].isin(bug_id)].sort_values(by = 'item_id')

<div class="alert alert-block alert-info">
Some items have negative number of units sold  <br>
At this point it is hard to understand how to interpret these figures <br>
In several cases we have +1 and -1 which could be a way to cancel a sale <br>
We'll dig further on this when analyzing the item_cnt_day field

#### Sales

In [None]:
plot_quantiles(item_stats, 'sales', title='Item Sales Quantiles')

In [None]:
values = [100, 1000, 10000, 100000, 1000000]
get_quantiles_from_values(item_stats, 'sales', values, 'sales')

In [None]:
item_stats[item_stats['sales']>100000000]

<div class="alert alert-block alert-info">
Item ID 6675 is the best selling Item in sales with 10,289 units sold for a total sales amount of 219M dollars  <br>
It is an expensive item with an average price of 22,113$ <br>
This item has sales record in 24 of the 34 observation months and was sold in 532 shops.

#### Shops presence per item

In [None]:
plot_quantiles(item_stats, 'shop_id', title='Number of shops per item - Quantiles')

In [None]:
values = [1, 3, 5, 10, 30, 40, 50]
get_quantiles_from_values(item_stats, 'shop_id', values, 'number of shops')

### Items Price

In [None]:
print(data['item_price'].nunique())
print(data['item_id'].nunique())

##### Before doing price statistics we will keep only one occurence of the same product price

In [None]:
price = data.drop_duplicates(['item_id','item_price'])
price.shape

In [None]:
price['item_price'].describe().astype(int)

<div class="alert alert-block alert-info">
mean price is 1,000 dollars <br>
median price is 399 dollars <br>
max price is around 300,000 dollars

In [None]:
plot_quantiles(price, 'item_price', title='Item Price Quantiles')

In [None]:
values = [50, 100, 500, 1000, 5000]
get_quantiles_from_values(price, 'item_price', values, 'price')

In [None]:
price[price['item_price']<0]

In [None]:
data[(data['item_id']==2973) & (data['shop_id']==32)]

<div class="alert alert-block alert-info">
1 item has negative price but we can correct this price by looking at the other sales for the same product in the same shop <br>
It is a bit tricky as the prior sale price was 2499 and the next sale price after was 1249 (looks like a 50% discount) <br>
We will use the original price which is the most frequent: 2499 <br>

In [None]:
data.loc[data['item_price']==-1,'item_price'] = 2499

In [None]:
# number of different price per item
price.head()

In [None]:
item_price = price.groupby('item_id').agg({'item_price':'nunique'}).reset_index()

In [None]:
item_price

In [None]:
plot_quantiles(item_price, 'item_price', title='Number of different prices for an item')

In [None]:
values = [1,2,3,5, 10, 20 ,50]
get_quantiles_from_values(item_price, 'item_price', values, 'nb of Prices')

<div class="alert alert-block alert-info">
half of the items have less than 3 different prices <br>
89% have less than 10 prices<br>
1% of the items have more than 45 different prices <br>
Price variation could be influenced by the following factors:<br>
- Date of the sale (with underlying economical context)<br>
- Shop of the sale <br>
- Potential discounts applied <br><br>

Let's dig further into this

In [None]:
item_shop_price = price.groupby(['item_id','shop_id']).agg({'item_price':'nunique'}).reset_index()

In [None]:
plot_quantiles(item_shop_price, 'item_price', title='Number of different prices for a couple shop - item')

In [None]:
values = [1,2,3, 5, 10]
get_quantiles_from_values(item_shop_price, 'item_price', values, 'nb of Prices')

<div class="alert alert-block alert-info">
Looking at item price per shop, 95% of the item have less than 3 different prices<br>
Price variance is mostly explained by shops <br>

In [None]:
high_variation = item_shop_price[item_shop_price['item_price']>20]

In [None]:
high_variation['item_id'].value_counts()

In [None]:
item_outlier = price[price['item_id']==17717]
item_outlier['item_price'].describe()

In [None]:
17717 in test['item_id']

<div class="alert alert-block alert-info">
Looking at items with high price variation on same shop (above 20), we can notice that one item is standing out (item ID 17717) <br>
This ID is present in the test set so we can't remove it <br>
Price for this item ranges from 9 to 16790 which is very strange

In [None]:
high_variation['shop_id'].value_counts()

In [None]:
shop_outlier = item_shop_price[item_shop_price['shop_id']==12].sort_values('item_price')
shop_outlier = shop_outlier[shop_outlier['item_price']>20]
shop_outlier

In [None]:
for item in [11365, 11369, 11370, 11371, 11372, 11373, 13753, 13754]:
    temp = price[price['item_id']==item]['item_price']
    print(item, int(temp.min()),  int(temp.median()),  int(temp.max()) )

In [None]:
for item in [11365, 11369, 11370, 11371, 11372, 11373, 13753, 13754]:
    print (item in test['item_id'])

<div class="alert alert-block alert-info">
Looking at shop with high price variation on given items (above 20), one shop is standing out (shop ID 12) <br>
This shop has 8 exclusive products with high variation<br>
These items are present in the test set so we can't remove them either <br>

### Item Categories

In [None]:
item_categories['item_category_name_en'] = item_categories['item_category_name'].replace(categories_english, regex=True)
items.item_category_id.nunique()

In [None]:
items = items.merge(item_categories, on = 'item_category_id', how = 'left' )
items.head()

In [None]:
categories_share = round(pd.DataFrame(items.item_category_id.value_counts(normalize = True))*100,2)
categories_share['cumulative_pct'] = categories_share['item_category_id'].cumsum()
categories_share.rename(columns = {'item_category_id':'items_pct'})
categories_share = categories_share.merge(item_categories, left_index = True, right_on = 'item_category_id', how='left')
categories_share.head(20)

<div class="alert alert-block alert-info">
There are 84 categories of items <br>
Category ID 40 (Movie - DVD) represents 22% of the referenced items <br>
Top 5 categories represent 50% of the reference items <br>
top 20 categories cover 80% of the referenced items

In [None]:
data_enriched = data.merge(items, on = 'item_id', how ='left')
data_enriched.head()

In [None]:
categories_volumes = data_enriched.groupby('item_category_id').agg({'item_cnt_day':'sum'})
categories_sales = data_enriched.groupby('item_category_id').agg({'sales':'sum'})

plot_histograms(categories_volumes, categories_sales, "Global Volumes by Item Category", "Global Sales by Item Category")

In [None]:
categories_volumes = categories_volumes.sort_values('item_cnt_day',ascending = False)
categories_volumes['volume_pct'] = round(categories_volumes['item_cnt_day'] / data_enriched['item_cnt_day'].sum()*100,2)
categories_volumes['volume_cum_pct'] = categories_volumes['volume_pct'].cumsum()
categories_volumes = categories_volumes.merge(item_categories, left_index = True, right_on = 'item_category_id', how = 'left')
categories_volumes.head(20)

<div class="alert alert-block alert-info">
Category ID 40 (Movie - DVD) represents 18% of the volume of sold items<br>
Top 5 categories Includes Movies (DVD & BluRay), Games (PC & PS3) and Local Music CDs and it represent 52% of the volume of sold items <br>
top 20 categories cover 84% of the referenced items

In [None]:
categories_sales = categories_sales.sort_values('sales',ascending = False)
categories_sales['sales_pct'] = round(categories_sales['sales'] / data_enriched['sales'].sum()*100,2)
categories_sales['sales_cum_pct'] = categories_sales['sales_pct'].cumsum()
categories_sales = categories_sales.merge(item_categories, left_index = True, right_on = 'item_category_id', how = 'left')
categories_sales.head(20)

<div class="alert alert-block alert-info">
Category ID 19 (Games - PS3) represents 12% of the total amount of sales<br>
Top 5 categories are Games categories and represent 46% of the sales <br>
top 20 categories cover 80% of the referenced items

In [None]:
def plot_monthly(df: pd.DataFrame, title: str, variable: str = 'item_cnt_day', segment: str , width: int = 15, height: int = 6) -> None:
    """Plotting monthly."""
    fig, ax = plt.subplots(figsize=(width, height))
    # Volume Dimension
    volume.plot.bar(ax=ax1)
    ax1.set_title(title1)
    ax1.legend().set_visible(False)
    ax1.set(xlabel=None)
    # Sales Dimension
    if share_axis:
        sales.plot.bar(ax=ax2, sharex=ax1)
    else:
        sales.plot.bar(ax=ax2)
    ax2.set_title(title2)
    ax2.legend().set_visible(False)
    ax2.set(xlabel=None)


In [None]:
monthly_categories = data_enriched.groupby(['date_block_num','item_category_id']).agg({'item_cnt_day':'sum','sales':'sum','item_id':'nunique'}).reset_index()


In [None]:
categories_sales 

In [None]:
top_categories = categories_volumes['item_category_id'].head(5).to_list()

monthly_categories_top = monthly_categories[monthly_categories['item_category_id'].isin(top_categories)]

fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(x = "date_block_num", y = "item_cnt_day", hue='item_category_id', data = monthly_categories_top)
plt.show()