In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from fbprophet import Prophet

In [2]:
# function to run Prophet and return the forecast
def ProphetForecast(df_input, forecast_end='2016-1-31', floor=-2):
    """Fit a Prophet model on one item's history and forecast up to ``forecast_end``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        History with a datetime ``ds`` column and the target ``y``.  The model
        is built with ``growth='logistic'``, for which Prophet expects a
        ``cap`` column (and a ``floor`` column when a floor is used) on the
        history as well; the calling loop in this notebook adds both.
    forecast_end : str or datetime-like, default '2016-1-31'
        Last calendar day to include in the forecast.
    floor : float, default -2
        Saturating minimum written to the future frame, in the same units
        as ``y``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Prophet.predict`` for the history dates plus
        the generated future dates.
    """
    m = Prophet(daily_seasonality=False, yearly_seasonality=True,
                seasonality_mode='multiplicative', growth='logistic')
    m.fit(df_input)

    # Number of daily steps between the item's last observed day and the
    # requested end date.  Clamp at zero so a history that already reaches
    # past forecast_end does not produce a negative period count.
    last_observed = df_input['ds'].max()
    days_to_predict = max((pd.to_datetime(forecast_end) - last_observed).days, 0)
    future = m.make_future_dataframe(periods=days_to_predict)

    # The notebook feeds y through np.log (natural log), so the default floor
    # of -2 corresponds to exp(-2) ~= 0.135 units per day (not 10**-2 = 0.01).
    future['floor'] = floor
    # Saturating maximum: the largest value seen in the history.
    future['cap'] = df_input['y'].max()
    return m.predict(future)

In [3]:
# Load the raw sales records, derive a datetime column `ds` from the
# day-first `date` strings, and keep only rows with a positive daily count
# (zero or negative item counts are dropped).
df_sales = (
    pd.read_csv('sales_train_v2.csv')
    .assign(ds=lambda raw: pd.to_datetime(raw['date'], dayfirst=True))
    .loc[lambda sales: sales['item_cnt_day'] > 0]
)


# Create dataframe of most sold items

In [4]:
# Rank items by total units sold (largest first) and keep only those whose
# total exceeds 3000 units; these are the items that get forecast.
df_sales_totalitem = (
    df_sales[df_sales['item_cnt_day'] > 0]
    .groupby(['item_id'])['item_cnt_day']
    .sum()
    .reset_index(name='item_cnt_total')
    .sort_values(by='item_cnt_total', ascending=False)
    .loc[lambda totals: totals['item_cnt_total'] > 3000]
)

# Loop starts here: forecast each top-selling item with Prophet

In [5]:
# Forecast every top-selling item with Prophet and gather the per-day
# forecasts (converted back to unit counts) into `yhat`: one row per date,
# one column per item, the column name being the item id as a string.
nov_start = pd.to_datetime('2015-11-1')
nov_end = pd.to_datetime('2015-11-30')

# Aggregate once: units sold per (item, day) for the items of interest,
# instead of re-scanning all of df_sales on every loop iteration.
daily_sales = (
    df_sales.loc[df_sales['item_id'].isin(df_sales_totalitem['item_id'])]
    .groupby(['item_id', 'ds'])['item_cnt_day']
    .sum()
)

# Per-item forecast series keyed by item id.  Kept outside the loop so that
# partial results survive if the (slow) loop is interrupted.
item_forecasts = {}

# itertuples keeps each column's own dtype; iterrows upcasts the whole row to
# float, which is why item ids used to print as e.g. "20949.0".
for item in df_sales_totalitem.itertuples(index=False):
    item_id = int(item.item_id)

    # create df for prophet: total sold per day for this item
    # NOTE(review): only days with at least one positive sale are present, so
    # no-sale days are missing from the history rather than recorded as 0 --
    # confirm this is the intended modelling choice.
    df_fbprophet = daily_sales.loc[item_id].reset_index(name='y')

    # convert to (natural) log space
    df_fbprophet['y'] = np.log(df_fbprophet['y'])

    # np.log is the natural log, so a floor of -2 means exp(-2) ~= 0.135
    # units per day (not 10**-2 = 0.01)
    df_fbprophet['floor'] = -2
    df_fbprophet['cap'] = df_fbprophet['y'].max()
    forecast = ProphetForecast(df_fbprophet)

    # back-transform the forecast from log space to unit counts, indexed by date
    units = np.exp(forecast.set_index('ds')['yhat'])
    item_forecasts[str(item_id)] = units

    # forecast for the whole month of November
    in_november = (units.index >= nov_start) & (units.index <= nov_end)
    print('item_id: ' + str(item_id) + ', total items:' + str(item.item_cnt_total), end="")
    print(', Nov 2015 forecast: ' + str(int(round(units[in_november].sum()))))

# Assemble the wide frame in one step (outer-aligned on date) rather than
# merging inside the loop, which would re-copy the growing frame every time.
if item_forecasts:
    yhat = pd.concat(item_forecasts, axis=1).rename_axis('ds').reset_index()
else:
    yhat = pd.DataFrame({'ds': pd.Series(dtype='datetime64[ns]')})


INFO:numexpr.utils:NumExpr defaulting to 2 threads.


item_id: 20949.0, total items:187660.0, Nov 2015 forecast: 3079
item_id: 2808.0, total items:17255.0, Nov 2015 forecast: 40
item_id: 3732.0, total items:16676.0, Nov 2015 forecast: 45
item_id: 17717.0, total items:15830.0, Nov 2015 forecast: 733
item_id: 5822.0, total items:14522.0, Nov 2015 forecast: 235
item_id: 3734.0, total items:11733.0, Nov 2015 forecast: 45
item_id: 6675.0, total items:10315.0, Nov 2015 forecast: 50
item_id: 3731.0, total items:10105.0, Nov 2015 forecast: 51
item_id: 1855.0, total items:10041.0, Nov 2015 forecast: 37
item_id: 16787.0, total items:9255.0, Nov 2015 forecast: 25
item_id: 7856.0, total items:9016.0, Nov 2015 forecast: 62
item_id: 4181.0, total items:8872.0, Nov 2015 forecast: 198
item_id: 3331.0, total items:8278.0, Nov 2015 forecast: 36
item_id: 2445.0, total items:7800.0, Nov 2015 forecast: 45
item_id: 2308.0, total items:7598.0, Nov 2015 forecast: 148
item_id: 4870.0, total items:7574.0, Nov 2015 forecast: 93
item_id: 6738.0, total items:7357.0, 



item_id: 4249.0, total items:3597.0, Nov 2015 forecast: 62
item_id: 3335.0, total items:3592.0, Nov 2015 forecast: 27
item_id: 10298.0, total items:3567.0, Nov 2015 forecast: 64
item_id: 1556.0, total items:3548.0, Nov 2015 forecast: 9
item_id: 2252.0, total items:3546.0, Nov 2015 forecast: 74
item_id: 16227.0, total items:3544.0, Nov 2015 forecast: 77
item_id: 4806.0, total items:3536.0, Nov 2015 forecast: 133
item_id: 21404.0, total items:3528.0, Nov 2015 forecast: 40
item_id: 16169.0, total items:3502.0, Nov 2015 forecast: 81
item_id: 1511.0, total items:3498.0, Nov 2015 forecast: 2
item_id: 6502.0, total items:3495.0, Nov 2015 forecast: 8
item_id: 22087.0, total items:3448.0, Nov 2015 forecast: 62
item_id: 21440.0, total items:3406.0, Nov 2015 forecast: 36
item_id: 7834.0, total items:3363.0, Nov 2015 forecast: 26
item_id: 7172.0, total items:3355.0, Nov 2015 forecast: 58
item_id: 12168.0, total items:3298.0, Nov 2015 forecast: 7
item_id: 1555.0, total items:3202.0, Nov 2015 foreca

KeyboardInterrupt: 

In [None]:
#fig, ax = plt.subplots(figsize=(15,8))
#ax = yhat.plot(ax=ax, x='ds', y='17717',  color='r',   legend=False, grid=True)
#ax = yhat.plot(ax=ax, x='ds', y='20949',  color='b',   legend=False, grid=True)

In [None]:
# Display the rows of yhat dated from 1 to 30 November 2015 (both ends included)
yhat[yhat['ds'].between(pd.to_datetime('2015-11-1'), pd.to_datetime('2015-11-30'))]