In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

# First lets do some exploration

In [2]:
cat_df=pd.read_csv("competitive-data-science-predict-future-sales/item_categories.csv")
items_df=pd.read_csv("competitive-data-science-predict-future-sales/items.csv")
sales_df=pd.read_csv("competitive-data-science-predict-future-sales/sales_train.csv")
shops_df=pd.read_csv("competitive-data-science-predict-future-sales/shops.csv")

In [3]:
cat_df.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [4]:
shops_df.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


First thing we notice from the categories data frame is that it will not be useful while making analysis and predictions as we don't really get much value from the names of the items. At least not as it is, we might try to break it down on different brands to maybe do a better analysis but this does not seem particulalry useful in the small amount of products for each brand. Similarly for the shops, it would maybe only be useful if we consider different ways of clustering like city or size or any information like this but only with the shop name does not seem to hold particularly useful.

In [5]:
items_df.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


This list will be useful to consider to see if maybe making predictions on some different categories is more useful than outside of its category. It seems more direct to maybe add the column item_category_id to the sales_df to have ease of access to this information.

In [6]:
sales_df["date"]=pd.to_datetime(sales_df["date"])
sales_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


This table gives us information of sales divided by product, day and shop we will be able to use this to estimate sales of every item we are interested on. Lets also add the information of categories we would also like to know.

In [7]:
cat_dict={}
for i in range(len(items_df)):
    cat_dict[items_df["item_id"][i].item()]=items_df["item_category_id"][i].item()
sales_df["item_category_id"]=sales_df["item_id"].map(cat_dict)
sales_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,2013-02-01,0,59,22154,999.0,1.0,37
1,2013-03-01,0,25,2552,899.0,1.0,58
2,2013-05-01,0,25,2552,899.0,-1.0,58
3,2013-06-01,0,25,2554,1709.05,1.0,58
4,2013-01-15,0,25,2555,1099.0,1.0,56


In [8]:
sales_pmth=sales_df.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day":"sum", "item_price":"min", "item_category_id":"min"})
sales_pmth=sales_pmth.rename(columns={"item_cnt_day":"item_cnt_month"})
sales_pmth=sales_pmth.reset_index()
sales_pmth.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,item_category_id
0,0,0,32,6.0,221.0,40
1,0,0,33,3.0,347.0,37
2,0,0,35,1.0,247.0,40
3,0,0,43,1.0,221.0,40
4,0,0,51,2.0,127.0,57


# Let us now compare the distribution for each item where we compare sales for all items, all shops or categories, etc.

We can do this by doing some t-tests to see if we expect the distributions to be different. For this let us first define a function to perform the t-test.

In [9]:
def t_test(dist1, dist2, p_t=0.05):
    """
    This function will take two distributions (represented by their dataframe) and will perform a t-test to test the hypothesis that our distributions are actually the same.
    
    dist Dataframe     representing a distribution
    dist2 Dataframe    representing another distribution we want to compare
    p_t float          the p-value use as a treshold we want to use
    
    returns (rejected,p) (Bool, float)     Rejected is a bool that says if we reject or not our hypothesis that these are the same distribution and p is its p-value
    """
    from scipy.stats import norm
    rejected=False
    
    if len(dist2)==0:
        return (rejected, 0)
    
    avg1=dist1.mean()
    var1=dist1.var()
    n1=len(dist1)
    
    avg2=dist2.mean()
    var2=dist2.var()
    n2=len(dist2)
    
    s=(1/n1+1/n2)*((n1-1)*var1+(n2-1)*var2)/(n1+n2-2)
    
    t=(avg1-avg2)/np.sqrt(s)
    
    p=1-2*np.abs(norm.cdf(t)-1/2)
    
    if p<p_t:
        rejected=True
        
    return(rejected, p)
    

In [10]:
# population versus category
items_pmth=sales_pmth.groupby("date_block_num").mean().reset_index()["item_cnt_month"]
cat_pmth=sales_pmth.groupby(["date_block_num", "item_category_id"]).mean().reset_index()[["item_category_id","item_cnt_month"]]

t_test(items_pmth, cat_pmth[cat_pmth["item_category_id"]==2]["item_cnt_month"])

(True, 1.9526602557107253e-12)

In [11]:
#This code will print any category where the initial hypothesis is not rejected. (Meaning those where we expect it to have the same distribution as the general item.)
for i in range(len(cat_df)):
    if t_test(items_pmth, cat_pmth[cat_pmth["item_category_id"]==i]["item_cnt_month"])[0]==False:
        print(i)

6
10
19
23
33
36
50
51
52
69
70
74


In [12]:
#Repeat for different shops
shop_pmth=sales_pmth.groupby(["date_block_num", "shop_id"]).mean().reset_index()[["shop_id","item_cnt_month"]]

t_test(items_pmth, shop_pmth[shop_pmth["shop_id"]==0]["item_cnt_month"])

(False, 0.3232721474052701)

In [13]:
for i in range(len(cat_df)):
    if t_test(items_pmth, shop_pmth[shop_pmth["shop_id"]==i]["item_cnt_month"])[0]==False:
        print(i)

0
1
11
15
18
24
36
40
43
57
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


As a hypothesis I think the reason we have some categories and some shops for which the hypothesis is rejected is because they have a small sample size. I will test this by also seeing the length of their dataframe.

In [14]:
for i in range(len(cat_df)):
    if t_test(items_pmth, cat_pmth[cat_pmth["item_category_id"]==i]["item_cnt_month"])[0]==False:
        print(i, len(cat_pmth[cat_pmth["item_category_id"]==i]["item_cnt_month"]))

6 34
10 1
19 34
23 34
33 34
36 3
50 1
51 1
52 1
69 34
70 34
74 13


In [15]:
for i in range(len(cat_df)):
    if t_test(items_pmth, shop_pmth[shop_pmth["shop_id"]==i]["item_cnt_month"])[0]==False:
        print(i,len(shop_pmth[shop_pmth["shop_id"]==i]["item_cnt_month"]))

0 2
1 2
11 1
15 34
18 34
24 34
36 1
40 11
43 25
57 32
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 0
81 0
82 0
83 0


Analyzing our data it seems that for the most part our hypothesis is right and most examples that are considered not significant are because we have fewer data points like 1 or 2 (or even 0) but this is not the case for all. However I will consider all of these categories and shops as one for the objective of making predictions. 

In [16]:
sales_pmth

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,item_category_id
0,0,0,32,6.0,221.0,40
1,0,0,33,3.0,347.0,37
2,0,0,35,1.0,247.0,40
3,0,0,43,1.0,221.0,40
4,0,0,51,2.0,127.0,57
...,...,...,...,...,...,...
1609119,33,59,22087,6.0,119.0,83
1609120,33,59,22088,2.0,119.0,83
1609121,33,59,22091,1.0,179.0,83
1609122,33,59,22100,1.0,629.0,42


In [17]:
new_cat_dict={}
new_shop_dict={}
for i in range(len(cat_df)):
    if t_test(items_pmth, cat_pmth[cat_pmth["item_category_id"]==i]["item_cnt_month"])[0]==False:
        new_cat_dict[i]=len(cat_df)+1
    else:
        new_cat_dict[i]=i
        
    if t_test(items_pmth, shop_pmth[shop_pmth["shop_id"]==i]["item_cnt_month"])[0]==False:
        new_shop_dict[i]=len(shops_df)+1
    else:
        new_shop_dict[i]=i

sales_pmth["item_category_id"]=sales_pmth["item_category_id"].map(new_cat_dict)
sales_pmth["shop_id"]=sales_pmth["shop_id"].map(new_shop_dict)
#sales_pmth.drop("item_price")

sales_pmth.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,item_category_id
0,0,61,32,6.0,221.0,40
1,0,61,33,3.0,347.0,37
2,0,61,35,1.0,247.0,40
3,0,61,43,1.0,221.0,40
4,0,61,51,2.0,127.0,57


# Let us now do a RNN that predicts the number of sales in a month. We will do this by predicting by category and then having a residual model that predicts for each individual value.

In [55]:
cat_df=sales_pmth.groupby(["date_block_num","item_category_id"]).agg({"item_cnt_month":"sum"}).reset_index()
cat_df=cat_df.set_index(['date_block_num','item_category_id']).unstack(fill_value=0.0).stack().reset_index()
cat_df.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month
0,0,0,1.0
1,0,1,1.0
2,0,2,1390.0
3,0,3,440.0
4,0,4,251.0


In [43]:
x_cat=[]
y_cat=[]
x_cat_test=[]
for data in list(cat_df.groupby("item_category_id")):
    one_cat=data[1]["item_cnt_month"]
    x_cat_test.append(list(one_cat.iloc[22:34]))
    for i in range(34-12-1): #we use #months-12-1
        x_cat.append(list(one_cat.iloc[i:i+12]))
        y_cat.append(one_cat.iloc[i+12])

x_cat=np.reshape(np.array(x_cat), (np.shape(x_cat)[0],np.shape(x_cat)[1],1))
x_cat_test=np.reshape(np.array(x_cat_test),(np.shape(x_cat_test)[0],np.shape(x_cat_test)[1],1))
y_cat=np.reshape(np.array(y_cat),(np.shape(y_cat)[0],1))

In [44]:
np.shape(x_cat)

(1533, 12, 1)

In [22]:
item_df=sales_pmth.groupby(["date_block_num","item_id"]).agg({"item_cnt_month":"sum"}).reset_index()
item_df=item_df.set_index(['date_block_num','item_id']).unstack(fill_value=0.0).stack().reset_index()
item_df

Unnamed: 0,date_block_num,item_id,item_cnt_month
0,0,0,0.0
1,0,1,0.0
2,0,2,0.0
3,0,3,0.0
4,0,4,0.0
...,...,...,...
741433,33,22165,0.0
741434,33,22166,11.0
741435,33,22167,37.0
741436,33,22168,0.0


In [63]:
x_item=[]
y_item=[]
x_item_test=[]
for data in list(item_df.groupby("item_id")):
    one_item=data[1]["item_cnt_month"]
    x_item_test.append(list(one_item.iloc[22:34]))
    for i in range(34-12-1): #we use #months-12-1
        x_item.append(list(one_item.iloc[i:i+12]))
        y_item.append(one_item.iloc[i+12])

x_item=np.reshape(np.array(x_item), (np.shape(x_item)[0],np.shape(x_item)[1],1))
x_item_test=np.reshape(np.array(x_item_test),(np.shape(x_item_test)[0],np.shape(x_item_test)[1],1))
y_item=np.reshape(np.array(y_item),(np.shape(y_item)[0],1))

In [64]:
np.shape(x_item)

(457947, 12, 1)

In [88]:
model_cat=tf.keras.Sequential((tf.keras.layers.GRU(4), tf.keras.layers.Dense(6, activation="relu"), tf.keras.layers.Dense(4, activation="relu"),  tf.keras.layers.Dense(1, activation="relu")))
model_cat.compile(loss="mse", optimizer="adam")
model_cat.fit(x_cat,y_cat, batch_size=8, epochs=100, validation_split=.1, callbacks=[tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


<tensorflow.python.keras.callbacks.History at 0x1066c7a7f48>

In [93]:
model_item=tf.keras.Sequential((tf.keras.layers.GRU(2, return_sequences=True), tf.keras.layers.GRU(4), tf.keras.layers.Dense(4, activation="relu"), tf.keras.layers.Dense(6, activation="relu"),  tf.keras.layers.Dense(1)))
model_item.compile(loss="mse", optimizer="adam")
model_cat.fit(x_item, y_item-model_cat(x_item), batch_size=64, epochs=50, validation_split=.1, callbacks=[tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

KeyboardInterrupt: 

In [74]:
model_cat(x_item_test)+model_item(x_item_test)

<tf.Tensor: shape=(21807, 1), dtype=float32, numpy=
array([[  -7.366337 ],
       [  -7.366337 ],
       [  -7.366502 ],
       ...,
       [-170.57222  ],
       [  -7.3661633],
       [  -7.3661613]], dtype=float32)>

In [51]:
test=pd.read_csv("competitive-data-science-predict-future-sales/test.csv", index_col=0)
test


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268
...,...,...
214195,45,18454
214196,45,16188
214197,45,15757
214198,45,19648
