In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
% matplotlib inline
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [6]:
## Data Source: https://www.kaggle.com/manjeetsingh/retaildataset
# read the ['Features data set.csv', 'sales data-set.csv', 'stores data-set.csv'] from the /data folder
import os
print("current directory is : " + os.getcwd()) 
print('the /data folder contains:') 
print(os.listdir("./data"))

stores = pd.read_csv('./data/stores data-set.csv')
features = pd.read_csv('./data/Features data set.csv')
sales = pd.read_csv('./data/sales data-set.csv')

## I don't intend to use so instead of replacing na-s with 0 I better drop them all
markdown_cols = features.filter(like='MarkDown').columns
for column in markdown_cols:
     features = features.drop(column,axis=1)

# On features Fill na gaps forward on CPI and Unemployment, Data seems sorted by date already
# https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
features['CPI'] = features['CPI'].fillna(method='pad')
features['Unemployment'] = features['Unemployment'].fillna(method='pad')

current directory is : C:\Users\User\projects\retail_kaggle
the /data folder contains:
['Features data set.csv', 'sales data-set.csv', 'stores data-set.csv']


In [7]:
# merge left all tables, use only keys from left frame, similar to a SQL left outer join; preserve key order.
    
retail = pd.merge(sales, features, how = 'left', on = ['Store', 'Date', 'IsHoliday'])
retail = pd.merge(retail, stores, how = 'left', on = ['Store'])

# format date in datetime, we need it to extract Year and Year week later to split the data in train, test
retail['Date'] = pd.to_datetime(retail['Date'])
retail['Year'] = pd.DatetimeIndex(retail['Date']).year
retail['Month'] = pd.DatetimeIndex(retail['Date']).month

retail['Year-Week'] = retail['Date'].dt.strftime('%Y-%U')

# I noticed there are Sales figures that are negative, cleanup:
retail = retail[retail['Weekly_Sales']>=0]

# turn IsHoliday into an Integer, useful later for numeric computations
retail['IsHoliday'] = retail['IsHoliday'].astype(int)

# Sort by Date as the data is not ordered by Date
retail = retail.sort_values(by=['Date'])

# checkout the dataset we're going to use further
retail.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Size,Year,Month
count,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0
mean,22.195477,44.242771,16030.329773,0.07034,60.090474,3.360888,171.212152,7.960077,136749.569176,2010.968443,6.478501
std,12.787213,30.507197,22728.500149,0.25572,18.44826,0.458523,39.16228,1.863873,60992.688568,0.796893,3.324688
min,1.0,1.0,0.0,0.0,-2.06,2.472,126.064,3.879,34875.0,2010.0,1.0
25%,11.0,18.0,2117.56,0.0,46.68,2.933,132.022667,6.891,93638.0,2010.0,4.0
50%,22.0,37.0,7659.09,0.0,62.09,3.452,182.350989,7.866,140167.0,2011.0,6.0
75%,33.0,74.0,20268.38,0.0,74.28,3.738,212.445487,8.567,202505.0,2012.0,9.0
max,45.0,99.0,693099.36,1.0,100.14,4.468,227.232807,14.313,219622.0,2012.0,12.0


In [8]:
# Lag data, retail data is already ordered by Date
for i in range(1,9):
    c_name = 'Sales_Lag'+ str(i)
    retail[c_name] = retail.groupby(['Store','Dept'])['Weekly_Sales'].shift(-i)
    
retail.query('Store==1 & Dept==1').head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Type,...,Month,Year-Week,Sales_Lag1,Sales_Lag2,Sales_Lag3,Sales_Lag4,Sales_Lag5,Sales_Lag6,Sales_Lag7,Sales_Lag8
34,1,1,2010-01-10,20094.19,0,71.89,2.603,211.671989,7.838,A,...,1,2010-02,57258.43,16333.14,41595.55,19403.54,16241.78,22517.56,22136.64,26229.21
8,1,1,2010-02-04,57258.43,0,62.27,2.719,210.82045,7.808,A,...,2,2010-05,16333.14,41595.55,19403.54,16241.78,22517.56,22136.64,26229.21,17558.09
21,1,1,2010-02-07,16333.14,0,80.91,2.669,211.223533,7.787,A,...,2,2010-06,41595.55,19403.54,16241.78,22517.56,22136.64,26229.21,17558.09,17596.96
2,1,1,2010-02-19,41595.55,0,39.93,2.514,211.289143,8.106,A,...,2,2010-07,19403.54,16241.78,22517.56,22136.64,26229.21,17558.09,17596.96,16145.35
3,1,1,2010-02-26,19403.54,0,46.63,2.561,211.319643,8.106,A,...,2,2010-08,16241.78,22517.56,22136.64,26229.21,17558.09,17596.96,16145.35,16555.11


In [34]:

from IPython.display import clear_output

results_df = []
columns = ['Pred_Dt','Store','Dept','Sales_Lag1','Sales_Lag2','Sales_Lag3','Sales_Lag4','Sales_Lag5','Sales_Lag6','Sales_Lag7','Sales_Lag8']
results_df = pd.DataFrame(columns=columns) 
predict_date = '2012-30'
c = 0
store_list = retail['Store'].unique()
for c_store in store_list:
    dept_list=retail[retail['Store']==c_store]['Dept'].unique()
    for c_dept in dept_list:
        tmp_pred = []
        for i in range(1,9):
            c_name = 'Sales_Lag'+ str(i)
            my_cols = ['Date','Year-Week','Temperature','Fuel_Price','CPI','Unemployment','Size', 'Dept', 'IsHoliday']
            X_cols = ['Temperature','Fuel_Price','CPI','Unemployment','Size', 'Dept', 'IsHoliday']
            my_cols.append(c_name)
            retail_tmp = retail.query('Store=={store} & Dept=={dept}'.format(store=c_store, dept=c_dept))[my_cols]      
            retail_tmp = retail_tmp.dropna()
            # retail_tmp.set_index(retail_tmp['Date'], inplace=True) 
            X = retail_tmp[X_cols]
            y = retail_tmp[c_name]

            c += 1
            clear_output(wait=True)
            print("Iteration " + str(c) + " Data: " + str(c_store)+ '/' + str(c_dept)  + str(retail_tmp.shape), flush=True)
            
    
            if  X.count()[0]>20:
                #model = RandomForestRegressor(n_estimators=50, criterion='mse', random_state=42, n_jobs=-1)
                model = DecisionTreeRegressor(max_depth=8)
                model.fit(X,y)
                y_pred = model.predict(X)

                # print(mean_squared_error(y, y_pred))
                # print("R^2: {0:.4f}".format(r2_score(y, y_pred)))
          
                
                # generate predictions for one 
                pred_one = retail_tmp[retail_tmp['Year-Week']==predict_date][X_cols]

 
                if pred_one.count()[0]>0: 
                    y_test_pred = model.predict(pred_one)
                   # print(y_test_pred)
               
                else:
                        y_test_pred =0
            else:
                y_test_pred =0
            
            tmp_pred.append(float(y_test_pred))    
                
        results_df = results_df.append({"Pred_Dt":predict_date,"Store": c_store,'Dept':c_dept,
                                        'Sales_Lag1':tmp_pred[0],
                                        'Sales_Lag2':tmp_pred[1],
                                        'Sales_Lag3':tmp_pred[2],
                                        'Sales_Lag4':tmp_pred[3],
                                        'Sales_Lag5':tmp_pred[4],
                                        'Sales_Lag6':tmp_pred[5],
                                        'Sales_Lag7':tmp_pred[6],
                                        'Sales_Lag8':tmp_pred[7]}
                                       , ignore_index=True)
        # print('On' + str(predict_date) + 'the predicted ' + str(c_name) + ' for Store: '+ str(c_store) + " dept " + str(c_dept) + " is:" + str(float(y_test_pred)))



Iteration 26584 Data: 19/39(1, 10)


In [35]:
Sales_col = results_df.columns[results_df.columns.str.contains(pat = 'Lag')]
results_df.set_index('Store')
tmp_res= results_df[Sales_col].groupby(results_df.index).sum(axis=1).sum(axis=1)

In [36]:
print('Prediction for each store for the next 8 weeks, as of year-week: ' + str(predict_date))
print(tmp_res.astype(int))

Prediction for each store for the next 8 weeks, as of year-week: 2012-30
0        262904
1         56704
2        634612
3        180732
4        301122
5          4895
6        123174
7         35229
8        410337
9        773830
10       322670
11       285238
12       245167
13        44291
14         3599
15        64071
16       256346
17        24034
18       172933
19        75288
20       724066
21        63366
22        92131
23        36102
24        37719
25          390
26        25614
27        23987
28       514612
29      1219329
         ...   
3293     149274
3294     204205
3295     136131
3296      32835
3297     180876
3298     216525
3299     339189
3300     338580
3301     198102
3302          0
3303       5241
3304      16968
3305      63717
3306      99760
3307      33832
3308      42746
3309      99145
3310      54155
3311      57375
3312      51541
3313     210330
3314          0
3315          0
3316          0
3317          0
3318       2618
3319          0

In [12]:
print('Prediction for the entire network for the next 8 weeks, as of year-week: ' + str(predict_date))
print(tmp_res.astype(int).sum(axis=0))

Prediction for the entire network for the next 8 weeks, as of year-week: 2012-30
64857645
