## World Wide Products!

This week, we will be looking at forecasting models using a sample dataset that deals with demand forecasting. 

In [1]:
# import
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv("../data/external/Historical Product Demand.csv")
df.dropna()
df.head(5)

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500


In [29]:
# convert date to datetime
import datetime
df['Date'] = pd.to_datetime(df['Date'])
df['Order_Demand'] = pd.to_numeric(df['Order_Demand'], errors='coerce')
df.head(5)

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012-07-27,100.0
1,Product_0979,Whse_J,Category_028,2012-01-19,500.0
2,Product_0979,Whse_J,Category_028,2012-02-03,500.0
3,Product_0979,Whse_J,Category_028,2012-02-09,500.0
4,Product_0979,Whse_J,Category_028,2012-03-02,500.0


In [30]:
# create new year and month features 
df['Year'], df['Month'] = df['Date'].dt.year, df['Date'].dt.month
df.head(5)

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Year,Month
0,Product_0993,Whse_J,Category_028,2012-07-27,100.0,2012.0,7.0
1,Product_0979,Whse_J,Category_028,2012-01-19,500.0,2012.0,1.0
2,Product_0979,Whse_J,Category_028,2012-02-03,500.0,2012.0,2.0
3,Product_0979,Whse_J,Category_028,2012-02-09,500.0,2012.0,2.0
4,Product_0979,Whse_J,Category_028,2012-03-02,500.0,2012.0,3.0


In [31]:
# want to understand the scope of the data we are looking at
df.shape

(1048575, 7)

In [32]:
# this dataset is too huge too look at. let's take a 100,000 entry sample
s = df.sample(random_state = 1, frac = 0.01, replace=True)
s.head(5)

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Year,Month
128037,Product_1107,Whse_A,Category_006,2012-04-12,200.0,2012.0,4.0
491755,Product_1453,Whse_J,Category_019,2014-10-30,10000.0,2014.0,10.0
470924,Product_0976,Whse_J,Category_028,2014-04-14,500.0,2014.0,4.0
791624,Product_0033,Whse_J,Category_005,2015-08-27,500.0,2015.0,8.0
491263,Product_0606,Whse_J,Category_001,2014-11-13,250.0,2014.0,11.0


In [33]:
# look at plots of our features compared to demand
import matplotlib.pyplot as plt
df.sort_values(by=['Product_Code'], inplace=True)
df.set_index(keys=['Product_Code'], drop = False, inplace = True)
products = df['Product_Code'].unique().tolist()

dfs = []
for product in products:
    temp = df.loc[df.Product_Code == product]
    dfs.append(temp)

In [35]:
# look at some naive stats
len(dfs)

2160

In [37]:
# add all-time mean demand for each product
for d in dfs:
    m = d["Order_Demand"].mean()
    d['Mean_Demand'] = m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
# let's look at a random product
tester = dfs[1]

In [43]:
# split data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
decision = tester[['Order_Demand']]
features = tester[['Product_Code','Warehouse', 'Product_Category', 'Date', 'Year', 'Month', 'Mean_Demand']]
train, test, train_d, test_d = train_test_split(features,
                                                decision,
                                                test_size = 0.2,
                                                random_state = 14)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
