In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [3]:
## Data Source: https://www.kaggle.com/manjeetsingh/retaildataset
# Read 'Features data set.csv', 'sales data-set.csv' and 'stores data-set.csv'
# from the ./data folder (relative to the current working directory).
import os
print("current directory is : " + os.getcwd())
print('the /data folder contains:')
print(os.listdir("./data"))

stores = pd.read_csv('./data/stores data-set.csv')
features = pd.read_csv('./data/Features data set.csv')
sales = pd.read_csv('./data/sales data-set.csv')

# The MarkDown* columns are not used anywhere below, so rather than filling
# their missing values they are dropped altogether (one drop call, not one per column).
markdown_cols = features.filter(like='MarkDown').columns
features = features.drop(columns=markdown_cols)

# Forward-fill the gaps in CPI and Unemployment: every missing value takes the
# last value seen on an earlier row.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
# `.ffill()` replaces `fillna(method='pad')`, which is deprecated in current pandas.
# NOTE(review): this fills in row order across the whole frame; it assumes the
# file is ordered so that the preceding row belongs to the same store - confirm.
features[['CPI', 'Unemployment']] = features[['CPI', 'Unemployment']].ffill()

current directory is : C:\Users\Dragos\projects\retail_kaggle
the /data folder contains:
['Features data set.csv', 'sales data-set.csv', 'stores data-set.csv']


In [None]:
# Left-join the three tables (SQL LEFT OUTER JOIN semantics): every sales row is
# kept, and feature / store attributes are attached where the keys match.
retail = pd.merge(sales, features, how='left', on=['Store', 'Date', 'IsHoliday'])
retail = pd.merge(retail, stores, how='left', on=['Store'])

# Parse Date into datetime; Year, Month and Year-Week are derived from it.
# NOTE(review): the date format is inferred by pandas here. If the CSV stores
# day-first dates, pass an explicit `format=` / `dayfirst=True` - confirm against the raw file.
retail['Date'] = pd.to_datetime(retail['Date'])
retail['Year'] = retail['Date'].dt.year
retail['Month'] = retail['Date'].dt.month

# '%U' is the week-of-year number with Sunday as the first day of the week.
retail['Year-Week'] = retail['Date'].dt.strftime('%Y-%U')

# Some Weekly_Sales figures are negative; drop those rows.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
retail = retail[retail['Weekly_Sales'] >= 0].copy()

# Turn IsHoliday into an integer so it can be used as a numeric feature.
retail['IsHoliday'] = retail['IsHoliday'].astype(int)

# Sort by Date; the merged data is not ordered by Date.
retail = retail.sort_values(by=['Date'])

# Summary statistics of the dataset used from here on.
retail.describe()

In [None]:
# Add the columns Sales_Lag1 .. Sales_Lag8.
# Within each (Store, Dept) group, shift(-k) places on every row the
# Weekly_Sales value found k rows further down. Because `retail` is sorted by
# Date, that is the value from k rows *later* in time (despite the "Lag" name);
# the last k rows of each group get NaN.
sales_per_series = retail.groupby(['Store', 'Dept'])['Weekly_Sales']
for step in range(1, 9):
    retail['Sales_Lag{}'.format(step)] = sales_per_series.shift(-step)

retail.query('Store==1 & Dept==1').head()

In [None]:
# NOTE(review): set_trace is not used in this cell; it looks like a debugging
# leftover and is only kept so that nothing relying on the name breaks.
from IPython.core.debugger import set_trace

# Year-week label (same '%Y-%U' format as retail['Year-Week']) to predict from.
predict_date = '2012-30'

# Target columns, one per horizon, and the layout of the results table.
lag_cols = ['Sales_Lag' + str(i) for i in range(1, 9)]
columns = ['Pred_Dt', 'Store', 'Dept'] + lag_cols

# Model inputs. 'Dept' (and 'Size' per store) is constant inside a single
# Store/Dept subset; it is kept to leave the fitted model unchanged.
X_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'Dept', 'IsHoliday']
my_cols = ['Date', 'Year-Week'] + X_cols


def _predict_target(series_frame, target_col, feature_cols, predict_week, min_rows=20):
    """Fit a linear regression for one target column and predict one week.

    Parameters
    ----------
    series_frame : pandas.DataFrame
        Rows of a single Store/Dept combination.
    target_col : str
        Name of the column to regress on (one of the Sales_Lag* columns).
    feature_cols : list of str
        Columns used as model inputs.
    predict_week : str
        Value of the 'Year-Week' column identifying the row to predict for.
    min_rows : int
        A model is only fitted when more than this many complete rows exist.

    Returns
    -------
    float
        The prediction for `predict_week`, or 0.0 when there are too few
        complete rows or no complete row exists for `predict_week`.
    """
    needed = ['Date', 'Year-Week'] + list(feature_cols) + [target_col]
    train = series_frame[needed].dropna()
    if len(train) <= min_rows:
        return 0.0

    model = LinearRegression()
    model.fit(train[feature_cols], train[target_col])

    # NOTE: the row to predict is taken from the same NaN-free rows the model
    # was fitted on, so this is an in-sample prediction.
    pred_one = train.loc[train['Year-Week'] == predict_week, feature_cols]
    if pred_one.empty:
        return 0.0
    # predict() returns an array; take its first element explicitly instead of
    # calling float() on the whole array.
    return float(model.predict(pred_one)[0])


# One record per Store/Dept; the DataFrame is built once at the end instead of
# being grown row by row (DataFrame.append no longer exists in pandas >= 2.0).
records = []
for c_store in retail['Store'].unique():
    store_frame = retail[retail['Store'] == c_store]
    for c_dept in store_frame['Dept'].unique():
        series_frame = store_frame[store_frame['Dept'] == c_dept]
        record = {'Pred_Dt': predict_date, 'Store': c_store, 'Dept': c_dept}
        for c_name in lag_cols:
            record[c_name] = _predict_target(series_frame, c_name, X_cols, predict_date)
        records.append(record)

results_df = pd.DataFrame(records, columns=columns)



In [None]:
# Columns holding the predictions: every column whose name contains 'Lag'.
Sales_col = results_df.columns[results_df.columns.str.contains(pat='Lag')]

# Total predicted sales per store: first add up all departments of a store
# (groupby + sum), then add up the prediction columns (sum over axis=1).
# GroupBy.sum() takes no `axis` argument, so it is called without one.
# astype(float) makes sure the values are summed as numbers even if the
# columns arrive with object dtype.
tmp_res = (
    results_df[Sales_col]
    .astype(float)
    .groupby(results_df['Store'])
    .sum()
    .sum(axis=1)
)

In [None]:
# Show the per-store totals held in tmp_res, truncated to integers.
store_header = 'Prediction for each store for the next 8 weeks, as of year-week: {}'.format(predict_date)
print(store_header)
print(tmp_res.astype(int))

In [None]:
# Show the grand total: the integer-truncated per-store totals added together.
print('Prediction for the entire network for the next 8 weeks, as of year-week: {}'.format(predict_date))
network_total = tmp_res.astype(int).sum()
print(network_total)