# Forecasting sales of mini-courses using prophet

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm
# Tracking time
from time import time

# Prophet model for time series forecast
from prophet import Prophet

In [None]:
# Load the train, test and sample-submission CSVs (read in that order)
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Users\DELL\Kaggle_data\train.csv',
        r'C:\Users\DELL\Kaggle_data\test.csv',
        r'C:\Users\DELL\Kaggle_data\sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data
train.head()

In [None]:
# Default matplotlib figure size (width, height) for every plot created afterwards
plt.rcParams["figure.figsize"] = (9,6)

In [None]:
# Tag each (country, store, product) series with a 1-based id.
# sort=False numbers the groups in order of first appearance in the frame.
train['unique_id'] = 1 + train.groupby(by=['country', 'store', 'product'], sort=False).ngroup()
# Display only: the re-indexed frame is not assigned back, so `train` keeps
# its `date` column; the view shows that dates repeat across series.
train.set_index(keys='date')

In [None]:
# Drop the columns that are not needed to build the Prophet models
train = train.drop(columns=['id', 'country', 'store', 'product'])
train.head()

In [None]:
# Prophet expects the timestamp column to be named `ds` and the target `y`
train = train.rename(columns={'date': 'ds', 'num_sold': 'y'})
train.head()

In [None]:
# Check the dataset: print the column dtypes and non-null counts
train.info()

In [None]:
# Parse the `ds` column into datetime values, then re-check the dtypes
train = train.assign(ds=pd.to_datetime(train['ds']))
train.info()

In [None]:
# Split the training frame into one group per series id
groups_by_uniqueID = train.groupby(by='unique_id')
# Show the group keys present in the dataframe
groups_by_uniqueID.groups.keys()

In [None]:
# Show the first rows of every unique_id group
groups_by_uniqueID.head()

In [None]:
# Collect the series ids (the group keys) into a plain list
unique_id_list = list(groups_by_uniqueID.groups)

In [None]:
# define function to train and forecast for specific horizon.

def train_and_forecast(group, periods=365):
    """Fit a Prophet model on one series and forecast ``periods`` steps ahead.

    Args:
        group: DataFrame holding a single series. It is passed to
            ``Prophet.fit`` as-is, and the first value of its ``unique_id``
            column labels every row of the output.
        periods: Number of future periods passed to
            ``make_future_dataframe``. Defaults to 365, the horizon that was
            previously hard-coded.

    Returns:
        DataFrame with the columns ``ds``, ``unique_id``, ``yhat``,
        ``yhat_upper`` and ``yhat_lower``, one row per date in the frame
        produced by ``make_future_dataframe``.
    """
    # Initiate and fit the model
    model = Prophet()
    model.fit(group)

    # Build the prediction dates and predict on them
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)

    # Label the frame returned by predict() directly (not a column slice of
    # it), so the assignment never targets a derived view.
    forecast['unique_id'] = group['unique_id'].iloc[0]

    # Return the forecasted results
    return forecast[['ds', 'unique_id', 'yhat', 'yhat_upper', 'yhat_lower']]

In [None]:
# Start time
start_time = time()
# Collect one forecast frame per series and concatenate once at the end;
# concatenating inside the loop re-copies every earlier row on each pass.
forecasts = []
# Loop through each series id
for unique_id in unique_id_list:
    # Get the data for this series
    group = groups_by_uniqueID.get_group(unique_id)
    # Make forecast
    forecast = train_and_forecast(group)
    # Keep the forecast for the final concatenation
    forecasts.append(forecast)
# pd.concat raises on an empty list, so fall back to an empty frame
for_loop_forecast = pd.concat(forecasts) if forecasts else pd.DataFrame()
print('The time used for the for-loop forecast is ', time()-start_time)
# Take a look at the data
for_loop_forecast.head()

In [None]:
# Earliest and latest `ds` present in the combined forecast
min(for_loop_forecast['ds']), max(for_loop_forecast['ds'])

In [None]:
# Keep only the forecast rows dated 2022-01-01 or later
sales_forecast = for_loop_forecast.loc[for_loop_forecast['ds'] >= '2022-01-01']
sales_forecast.head()

In [None]:
# Print the column dtypes and non-null counts of the filtered forecast
sales_forecast.info()

In [None]:
# Work on a real copy: plain assignment would only alias `test`, so the
# column changes below would silently modify the original frame too.
test_copy = test.copy()
test_copy['date'] = pd.to_datetime(test_copy['date'])
# Create a unique id for each (country, store, product) group, starting from 1.
# NOTE(review): these ids line up with the training ids only if the series
# first appear in the same order in both files - confirm against the data.
test_copy['unique_id'] = test_copy.groupby(['country', 'store', 'product'], sort = False).ngroup()+1
test_copy.info()

In [None]:
# Left-join from the test rows so every test `id` appears exactly once per
# matching forecast. Joining from the forecast side could add rows with a
# missing `id` and silently drop test rows that have no forecast.
final = test_copy.merge(sales_forecast, how = 'left', left_on = ['unique_id', 'date'], right_on = ['unique_id', 'ds'])
final.head()

In [None]:
# Build the submission frame: one row per test id, ordered by id
submission = final[['id', 'yhat']].sort_values('id').reset_index(drop = True)
submission.rename(columns={"yhat": "num_sold"}, inplace = True)
# Round to the nearest whole unit before casting; a bare astype(int)
# truncates toward zero (e.g. 9.99 -> 9).
submission['num_sold'] = submission['num_sold'].round().astype(int)
submission.head()

In [None]:
# Print the column dtypes and non-null counts of the submission frame
submission.info()

In [None]:
# Write the submission to submission.csv without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)