In [1]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pickle
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [2]:
# Read the linked sales/weather table; the "date" column becomes a parsed index.
sales = pd.read_csv(
    "../../Dataset/FINAL_LinkedCleanSalesWeatherWithEncoding.csv",
    parse_dates=True,
    index_col="date",
)

In [3]:
# Preview the first five rows of the loaded table.
sales.head(n=5)

Unnamed: 0_level_0,station_nbr,item_nbr,units,tmax,tmin,depart,dewpoint,wetbulb,heat,cool,...,smoke,widespread_dust,sandstorm,squall,freezing,shallow,partial,patches,blowing,vicinity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01,1,1,0,52.0,31.0,,36.0,40.0,23.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-01-01,1,2,0,52.0,31.0,,36.0,40.0,23.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-01-01,1,3,0,52.0,31.0,,36.0,40.0,23.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-01-01,1,4,0,52.0,31.0,,36.0,40.0,23.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-01-01,1,5,0,52.0,31.0,,36.0,40.0,23.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Total units sold per item, largest totals first (top five shown).
(
    sales
    .groupby("item_nbr")["units"]
    .sum()
    .sort_values(ascending=False)
    .head()
)

item_nbr
45    1005111
9      916615
5      846662
44     577193
16     226772
Name: units, dtype: int64

In [5]:
# functions
def saveFile(model, filename):
    """Pickle ``model`` to the file at ``filename``.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or path-like
        Destination path; an existing file is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
    
def loadFile(filename):
    """Return the object unpickled from the file at ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path to a file previously written with ``pickle.dump``.

    Returns
    -------
    object
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only call this on files from a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def predict_sales(target_itemSales, sales_items, model):
    """Add a non-negative integer ``units_p`` prediction column, in place.

    ``target_itemSales`` is modified in place and nothing is returned: it
    gains a ``units_p`` column and its ``date`` column is parsed to datetimes
    and becomes the index.

    Parameters
    ----------
    target_itemSales : pandas.DataFrame
        Frame with a ``date`` column; mutated by this call.
    sales_items : pandas.DataFrame
        Input passed unchanged to ``model.predict``.
    model : object
        Any object exposing ``predict(sales_items)``.
    """
    # The prediction is wrapped in a DataFrame and assigned as a column, so it
    # is aligned to target_itemSales by index label; this must happen before
    # the index is switched to "date" below.
    target_itemSales["units_p"] = pd.DataFrame(model.predict(sales_items))
    target_itemSales["date"] = pd.to_datetime(target_itemSales["date"])
    target_itemSales.set_index("date", inplace=True)

    # Assign the cleaned column back explicitly. A chained
    # `frame.units_p.fillna(0, inplace=True)` acts on a temporary object and
    # does not update the frame under pandas Copy-on-Write.
    target_itemSales["units_p"] = (
        target_itemSales["units_p"]
        .fillna(0)           # rows without a prediction count as zero units
        .astype(np.int64)    # truncates toward zero
        .clip(lower=0)       # negative predictions are clamped to zero
    )
    
def create_chart(df, title):
    """Render an interactive plotly chart of actual vs. predicted units.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the x values and whose ``units`` and
        ``units_p`` columns supply the two plotted series.
    title : str
        Chart title.

    The figure is displayed via ``iplot``; nothing is returned.
    """
    # Actual sales are drawn as individual markers.
    trace1 = go.Scatter(
        x = df.index,
        y = df["units"],
        mode = 'markers',
        name = 'units sold'
    )

    # Predictions are drawn as a connected line. The valid plotly mode flag is
    # 'lines' (plural); 'line' is not an accepted value.
    trace2 = go.Scatter(
        x = df.index,
        y = df["units_p"],
        mode = 'lines',
        name = 'predict'
    )

    layout= go.Layout(
        title= title,
        xaxis= dict(
            title= 'Date',
        ),
        yaxis=dict(
            title= 'Units Sold',
        ),
        showlegend= True
    )

    data = [trace1, trace2]
    fig= go.Figure(data=data, layout=layout)
    iplot(fig)

In [6]:
# Load the pickled regression results table from disk.
regression = loadFile(filename="../../Regression Analysis/OptRegressionMSE")

In [7]:
# Preview the first five rows of the regression results.
regression.head(n=5)

Unnamed: 0_level_0,model,selection,rsquared_adj,MSE
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,<statsmodels.regression.linear_model.Regressio...,backward,0.048489,0.130139
2,<statsmodels.regression.linear_model.Regressio...,backward,0.061097,0.918617
3,<statsmodels.regression.linear_model.Regressio...,backward,0.08902,0.075939
4,<statsmodels.regression.linear_model.Regressio...,backward,0.007371,0.026946
5,<statsmodels.regression.linear_model.Regressio...,forward,0.176935,3611.746326


In [8]:
# Look up the stored model object for items 45, 9 and 5 (row label = item_nbr).
model_45 = regression.loc[45, 'model']
model_9 = regression.loc[9, 'model']
model_5 = regression.loc[5, 'model']

In [9]:
# Keep only the rows belonging to each item of interest.
sales_items_45 = sales[sales["item_nbr"] == 45]
sales_items_9 = sales[sales["item_nbr"] == 9]
sales_items_5 = sales[sales["item_nbr"] == 5]

In [10]:
# Extract just the "units" column (still indexed by date) for each item.
target_itemSales_45 = sales_items_45['units']
target_itemSales_9 = sales_items_9['units']
target_itemSales_5 = sales_items_5['units']

In [11]:
# Move the date index back into a regular "date" column.
# Feature frames first ...
sales_items_45 = sales_items_45.reset_index()
sales_items_9 = sales_items_9.reset_index()
sales_items_5 = sales_items_5.reset_index()

# ... then the unit series, which become two-column frames (date, units).
target_itemSales_45 = target_itemSales_45.reset_index()
target_itemSales_9 = target_itemSales_9.reset_index()
target_itemSales_5 = target_itemSales_5.reset_index()


In [12]:
# Add the predicted "units_p" column to each target frame (modified in place).
predict_sales(
    target_itemSales=target_itemSales_45,
    sales_items=sales_items_45,
    model=model_45,
)
predict_sales(
    target_itemSales=target_itemSales_9,
    sales_items=sales_items_9,
    model=model_9,
)
predict_sales(
    target_itemSales=target_itemSales_5,
    sales_items=sales_items_5,
    model=model_5,
)

In [13]:
# Monthly totals of actual vs. predicted units for item 45.
create_chart(
    df=target_itemSales_45.resample("M").sum(),
    title='Monthly Sale Prediction for Item 45',
)

In [14]:
# Monthly totals of actual vs. predicted units for item 9.
create_chart(
    df=target_itemSales_9.resample("M").sum(),
    title='Monthly Sale Prediction for Item 9',
)

In [15]:
# Monthly totals of actual vs. predicted units for item 5.
create_chart(
    df=target_itemSales_5.resample("M").sum(),
    title='Monthly Sale Prediction for Item 5',
)

In [16]:
# Yearly totals of actual vs. predicted units for item 45.
create_chart(
    df=target_itemSales_45.resample("A").sum(),
    title='Yearly Sale Prediction for Item 45',
)

In [17]:
# Yearly totals of actual vs. predicted units for item 9.
create_chart(
    df=target_itemSales_9.resample("A").sum(),
    title='Yearly Sale Prediction for Item 9',
)

In [18]:
# Yearly totals of actual vs. predicted units for item 5.
create_chart(
    df=target_itemSales_5.resample("A").sum(),
    title='Yearly Sale Prediction for Item 5',
)