In [None]:
# Importing required libraries
from dateutil.parser import parse
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima

# Todo: To be removed
import seaborn as sns
import plotly.express as px
import chart_studio.plotly as ply
import cufflinks as cf
import matplotlib.pyplot as plt
%matplotlib inline

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)


In [None]:
pip install cufflinks 

In [None]:
# Importing CSV Dataset
#file = r"C:\Users\plahare\Downloads\BeerWineLiquor.csv"
df = pd.read_csv("BeerWineLiquor.csv")


# Setting Target col and Date col

In [None]:
# Dropdown for selecting target column for forecasting and date column 
target_col = 'beer'
ts_col = 'date'
# Drop down

# Set Index

In [None]:
# Changing date to datetime and set it as an index
def setIndex(data, date_col=None):
    """Parse the date column as datetime and make it the index of ``data``.

    The frame is modified in place and its first rows are printed.
    Re-running the function on a frame that was already indexed is safe: the
    existing index is simply re-parsed as datetime instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the date column (or already indexed by it).
    date_col : str, optional
        Name of the date column. Defaults to the notebook-level ``ts_col``.

    Raises
    ------
    KeyError
        If the column is neither a column nor the index name of ``data``.
    """
    column = ts_col if date_col is None else date_col

    if column in data.columns:
        data[column] = pd.to_datetime(data[column])
        data.set_index(column, inplace=True)
    elif data.index.name == column:
        # The cell was executed before: the column already is the index.
        data.index = pd.to_datetime(data.index)
    else:
        raise KeyError(column)

    print(data.head())
    return

In [None]:
setIndex(df)

# Shape of Data / Rows & Cols

In [None]:
# displaying rows and columns
def Shape_df(data):
    """Print how many rows and columns ``data`` has (from ``data.shape``)."""
    row_template = 'No of rows :{}'
    column_template = 'No of Columns:{}'
    dimensions = data.shape
    n_rows, n_columns = dimensions[0], dimensions[1]
    print(row_template.format(n_rows))
    print(column_template.format(n_columns))

In [None]:
Shape_df(df)

# Head & Tail

In [None]:
# giving choice to user to display head or tail 
# Todo: Remove choice
def display_head_tail(data, choice='Head'):
    """Return the first or last rows of ``data``.

    ``choice`` must be ``'Head'`` or ``'Tail'``; any other value yields a
    dictionary carrying an error message instead of a frame.
    """
    if choice == 'Head':
        return data.head()
    if choice == 'Tail':
        return data.tail()
    return {"message": "Invalid choice."}

In [None]:
display_head_tail(df)
display_head_tail(df, 'Head')
display_head_tail(df, 'Tail')


# Describe function

In [None]:
# display descriptive statistics 
def Describe_data(df):
    """Return the descriptive statistics produced by ``df.describe()``."""
    summary = df.describe()
    return summary

In [None]:
Describe_data(df)

# Resampling: Continuous / Discontinuous data

In [None]:
#df= df.asfreq(pd.infer_freq(df.index))

In [None]:
# Resampling Function 

# TODO: resampling of non-continuous data is still being worked on by the team; this function only reports continuity for now.
def check_Continuity(data):
    """Report whether the index of ``data`` has an inferable frequency.

    Prints the inferred frequency alias when ``pd.infer_freq`` finds one,
    otherwise prints that the data is non-continuous.
    """
    inferred = pd.infer_freq(data.index)
    if inferred is not None:
        print("This is continuous data ")
        print(inferred)
        return
    # No regular frequency could be inferred (resampling hook goes here).
    print("This is non-continuous data")

In [None]:
#check_Continuity(df)

# Null Value Treatment

## List of columns having null values

In [None]:
#This functions creates a dictionary where columns are keys and values are percentage of null values present in that column 
def null_list(df):
    """Map every column that contains nulls to its percentage of null values.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    dict
        ``{column: null percentage}`` restricted to columns with at least one
        null, or ``{"message": ...}`` when no column has nulls.
    """
    # Count nulls once for the whole frame instead of once per column.
    null_counts = df.isnull().sum()
    with_nulls = null_counts[null_counts > 0]

    if with_nulls.empty:
        return {"message": "This dataset doesn't have any null values, kindly proceed with the EDA."}

    return (with_nulls * 100 / len(df)).to_dict()

In [None]:
null_list(df)

## Graph to display percentage of null values

In [None]:
#for plotting the null values. this function plots graph of columns in the x-axis and its percentage of null values in the y-axis
def graph(df):
    """Draw a bar chart of the percentage of null values per column.

    Columns are placed on the x axis and their null percentage on the y axis.
    The function returns whatever ``plt.show()`` returns.
    """
    null_percentage = (df.isnull().sum() * 100) / len(df)
    x = np.array(df.columns)
    y = np.array(null_percentage)

    plt.figure(figsize=(12, 10))
    plt.bar(x, y)

    # Rotate the column names so long labels do not overlap.
    plt.xticks(rotation=70)

    # These labels used to sit in a dict literal that was never applied.
    plt.xlabel('Columns')
    plt.ylabel('Percentage')
    plt.title('Percentage of null values present in each column')

    return plt.show()

In [None]:
graph(df)

## Null Value Treatment

In [None]:
# Takes dataframe as an input and returns a dictionary with column names and null %
def get_null_percentages(df):
    """Return ``{column: percentage of null values}`` for every column of ``df``.

    The null counts are computed once for the whole frame rather than once
    per column.
    """
    return (df.isnull().sum() * 100 / len(df)).to_dict()

In [None]:
# Takes dataframe and column name as an input.
def drop_rows(df, col_name):
    """Remove, in place, every row of ``df`` whose ``col_name`` value is null.

    Returns the result of the in-place ``dropna`` call (``None``).
    """
    outcome = df.dropna(axis=0, how="any", subset=[col_name], inplace=True)
    return outcome


In [None]:
# Takes dataframe and column name as an input.
def drop_cols(df, col_name):
    """Remove the column ``col_name`` from ``df`` in place.

    Returns the result of the in-place ``drop`` call (``None``).
    """
    outcome = df.drop(columns=[col_name], inplace=True)
    return outcome


In [None]:
# Takes dataframe, column name, and impute method as an input.
def impute(df, col_name, impute_method='interpolation'):
    """Fill the missing values of one column of ``df`` in place.

    Text/object columns are filled with their most frequent value. Other
    columns are handled as follows:

    * if the column starts or ends with a missing value, or has two or more
      consecutive missing values, linear interpolation (both directions) is
      always used, whatever ``impute_method`` says;
    * otherwise ``impute_method`` decides: ``'interpolation'`` (default),
      ``'locf'`` (last observation carried forward) or ``'nocb'`` (next
      observation carried backward). The name is matched case-insensitively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col_name : str
        Column to impute.
    impute_method : str, default 'interpolation'

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``impute_method`` has to be applied but is not a supported name.
    """
    column = df[col_name]

    if column.dtype == object or pd.api.types.is_string_dtype(column):
        modes = column.mode()
        # An all-null column has no mode; there is nothing to fill it with.
        if not modes.empty:
            df[col_name] = column.fillna(modes[0])
        return None

    missing = column.isnull()
    if not missing.any():
        return None

    # shift(-1) leaves a null in the last position, so this flag also fires
    # when the last value is missing, exactly like the dedicated flag below.
    has_consecutive_gap = bool((missing & column.shift(-1).isnull()).any())
    starts_missing = bool(missing.iloc[0])
    ends_missing = bool(missing.iloc[-1])
    method = str(impute_method).lower()

    if has_consecutive_gap or starts_missing or ends_missing or method == 'interpolation':
        filled = column.interpolate(method='linear', limit_direction="both")
    elif method == 'locf':
        filled = column.ffill()
    elif method == 'nocb':
        filled = column.bfill()
    else:
        raise ValueError(
            "impute_method must be 'interpolation', 'locf' or 'nocb', got {!r}".format(impute_method)
        )

    # Assign back to the frame; fillna(inplace=True) on df[col] is a chained
    # operation that does not reach the frame under pandas copy-on-write.
    df[col_name] = filled
    return None


In [None]:
# valid_choices = ['drop_rows', 'drop_cols', 'impute']

# def treat_nulls(df, col_name, choice='drop_rows'):

#     if df and col_name and (choice in valid_choices):

#         if choice == valid_choices[0]:
#             return drop_rows(df, col_name)

#         elif choice ==  valid_choices[1]:
#             return drop_cols(df, col_name)

#         elif choice ==  valid_choices[2]:
#             return impute(df, col_name)
            
#         else:
#             return {"message": "Invalid choice."}    

#     else:
#         return {"message": "Please provide dataframe and column name to process."}

In [3]:
#This function iterates through whole dataframe and treats the missing values with appropriate method chosen by the user.
def _ask_until_valid(prompt, valid_answers, retry_message):
    """Prompt with ``input`` until the reply is one of ``valid_answers``."""
    while True:
        answer = input(prompt)
        if answer in valid_answers:
            return answer
        print(retry_message)


def null_values(df, choice='drop_rows', listOfCols=None):
    """Interactively treat the missing values of every column of ``df``.

    For each column that contains nulls the user is asked whether to drop the
    affected rows, drop the column, or impute. Numeric columns that start or
    end with a null, or contain consecutive nulls, are always linearly
    interpolated; otherwise the user picks LOCF, NOCB or interpolation.
    Text/object columns are imputed with their most frequent value.
    ``df`` is modified in place and the remaining null counts are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean in place.
    choice : str
        Kept for backward compatibility; the treatment is always read from
        the interactive prompt, so the passed value is not used.
    listOfCols : list, optional
        Kept for backward compatibility; currently unused.
    """
    def _null_percentages():
        # An empty frame (e.g. after dropping rows) has nothing left to treat;
        # avoid the 0/0 division that would mark every column as NaN.
        if len(df) == 0:
            return {name: 0 for name in df.columns}
        return (df.isnull().sum() * 100 / len(df)).to_dict()

    null_pct = _null_percentages()

    # Iterate over a snapshot of the column names: columns may be dropped and
    # percentages refreshed while the loop is running.
    for key in list(null_pct):
        if null_pct[key] == 0:
            continue

        choice = _ask_until_valid(
            "Kindly choose whether you want to opt for dropping the rows/columns or would like to impute the values? Please type 'drop_rows' for dropping the rows or 'drop_column' for dropping the columns and 'impute' for filling the missing values\n",
            ("drop_rows", "drop_column", "impute"),
            "enter a valid choice",
        )

        if choice == "drop_rows":
            df.dropna(subset=[key], axis=0, how="any", inplace=True)
            # Dropping rows can also clear nulls in the columns still to visit.
            null_pct.update(_null_percentages())

        elif choice == "drop_column":
            df.drop([key], axis=1, inplace=True)

        else:  # "impute"
            column = df[key]

            if column.dtype == object or pd.api.types.is_string_dtype(column):
                modes = column.mode()
                if not modes.empty:
                    df[key] = column.fillna(modes[0])
                continue

            missing = column.isnull()
            has_consecutive_gap = bool((missing & column.shift(-1).isnull()).any())
            starts_missing = bool(missing.iloc[0])
            ends_missing = bool(missing.iloc[-1])

            if has_consecutive_gap or starts_missing or ends_missing:
                # LOCF/NOCB cannot fill these patterns completely.
                method = "Interpolation"
            else:
                method = _ask_until_valid(
                    "Kindly Choose any one method for imputing missing values - please type 'LOCF' or 'NOCB'  or 'Interpolation'.\n",
                    ("LOCF", "NOCB", "Interpolation"),
                    "enter a valid input",
                )

            # Assign the filled column back to the frame: an in-place fillna
            # on df[key] does not reach df under pandas copy-on-write.
            if method == "LOCF":
                df[key] = column.ffill()
            elif method == "NOCB":
                df[key] = column.bfill()
            else:
                df[key] = column.interpolate(method='linear', limit_direction="both")

    print(df.isnull().sum())
    print("The null values have been successfully treated!")

In [None]:
null_values(df)

# EDA

## Date vs target_col

In [None]:
#full plot of target column
def Plot_col(df,col_name):
    """Plot the column ``col_name`` of ``df`` against the frame's index.

    The column named by the argument is plotted (not the notebook-level
    target column), with the column name as title and tight axis limits.
    """
    title = '{}'.format(col_name)
    axes = df[col_name].plot(figsize=(12,6), title=title)
    axes.autoscale(axis='both', tight=True)
    print("Interpretation:\n This graph represents visualization of dependent or target variable w.r.t Time.This depicts how the dependent variable varies with the time. X axis represents time and Y axis represents dependent variable. ")

In [None]:
Plot_col(df,target_col)

In [None]:
# this is an interactive plot of above code using plotly
def plotly_line(data,col_name):
    """Show an interactive plotly line chart of ``data[col_name]`` over the index."""
    time_axis = data.index
    values = data[col_name]
    figure = px.line(x=time_axis, y=values)
    figure.show()
    print("Interpretation:\n This graph represents visualization of dependent or target variable w.r.t Time.This depicts how the dependent variable varies with the time. X axis represents time and Y axis represents dependent variable. ")

In [None]:
plotly_line(df,target_col)

In [None]:
# Show the user a reference image of the offset aliases accepted by the resampling prompts below.
# NOTE(review): the path is an absolute location on one specific Windows machine, so this cell
# fails anywhere else — presumably the image should ship next to the notebook and be referenced
# with a relative path; confirm where the file lives before changing it.
from IPython.display import Image
Image(filename="C:\\Users\\DB4\\Downloads\\MicrosoftTeams-image.png",width=1000,height=400)

## Resampled plot

In [None]:
#resampled plot as per aliases input by user
def resample_plot(data,col_name):
    """Bar-plot the per-period maximum of ``data[col_name]``.

    The resampling period is an offset alias typed in by the user.
    """
    heading = 'Resampled {} graph'.format(col_name)

    alias = input("Please enter an offset alias: ")
    period_max = data[col_name].resample(alias).max()
    period_max.plot.bar(figsize=(16,6), title=heading)
    print("Interpretation:\n Resampling:\n Conversion of frequency of time in time series data. \nThis graph represents visualization of resampled dependent or target variable w.r.t Time.This depicts how the resampled dependent variable varies with the time. X axis represents resampled or extended time and Y axis represents dependent variable. Main use of this plot is to show how the data behaves with different frequencies. This deals with the missing dates also, so it helps to make data continuous. ")

In [None]:
resample_plot(df,target_col)

In [None]:
#plotly function for resampled plot
def Plotly_bar_resample(data,col_name):
    """Interactive plotly bar chart of the per-period maximum of ``data[col_name]``.

    The resampling period is an offset alias typed in by the user.
    """
    alias = input("Please enter an offset alias: ")
    period_max = data[col_name].resample(alias).max()
    figure = px.bar(
        period_max,
        y=col_name,
        color=col_name,
        color_continuous_scale=px.colors.sequential.Aggrnyl_r,
    )
    heading = 'Resampled {} graph'.format(col_name)
    figure.update_layout(title_text=heading)
    figure.show()

In [None]:
Plotly_bar_resample(df,target_col)

## Top n Values

In [None]:
# displaying top n values in dataframe
def top_n_values(data,col_name):
    """Ask the user for ``n`` and return the ``n`` largest values of a column.

    Returns a one-column DataFrame sorted in descending order.
    """
    requested = input("How many top values do you want to see?\n")
    n = int(requested)
    print("Below are the top {0} values in the {1} column: ".format(n, col_name))
    ranked = data[col_name].sort_values(ascending = False)
    return pd.DataFrame(ranked.head(n))

In [None]:

top_n_dataf =top_n_values(df,target_col)
top_n_dataf

In [None]:
# plotly plot for visualizing top n values
def plot_top_n(data,col_name):
    """Interactive plotly bar chart of the top values held in ``data``.

    ``data`` is expected to be the frame of top values (one row per value);
    the number shown in the title is taken from that frame itself rather
    than from a notebook-level variable.
    """
    n = len(data)
    fig = px.bar(data, x=data.index, y=col_name, labels={'x':'{}'.format(col_name)},
             color=col_name, color_continuous_scale=px.colors.sequential.Brwnyl)
    fig.update_layout(title_text='Top {} {} graph'.format(n,col_name))
    fig.show()
    print("Interpretation:\n This graph represents visualization of Top values of dependent or target variable w.r.t Time. X axis represents time and Y axis represents top values of dependent variable. ")

In [None]:
plot_top_n(top_n_dataf,target_col)

# Stationarity Check

## Seasonal Decompose during EDA

In [None]:
# seasonal decomposition plot
def decomposition(series, period=52):
    """Plot the seasonal decomposition of ``series``.

    The user is asked whether to draw the multiplicative decomposition
    (``'M'``), the additive one (``'A'``) or both (``'MA'``).

    Parameters
    ----------
    series : pandas.Series
        Time series to decompose.
    period : int, default 52
        Number of observations per seasonal cycle passed to
        ``seasonal_decompose`` (for example 12 for monthly data with a yearly
        pattern). The default keeps the previously hard-coded value.
    """
    plt.rcParams.update({'figure.figsize': (20,10)})
    y = series.to_frame()

    choice=input("Enter 'M' for Multiplicative decomposition & 'A' for Additive decomposition & 'MA' for both :\n")

    titles = {
        'multiplicative': 'Multiplicative Decompose',
        'additive': 'Additive Decompose',
    }
    models_for_choice = {
        'M': ('multiplicative',),
        'A': ('additive',),
        'MA': ('multiplicative', 'additive'),
    }

    models = models_for_choice.get(choice)
    if models is None:
        print(" This is invalid choice. Please choose Either M or A")
    else:
        for model in models:
            seasonal_decompose(y, model=model, period=period).plot().suptitle(titles[model], fontsize=22)
    print("""Interpretation:\n Here X axis represents Time and Y axis represents Normal scaled data. Time series has 4 components Trend,seasonality,cyclical variation and irregular variation. \n Trend component: This is useful in predicting future movements. Over a long period of time, the trend shows whether the data tends to increase or decrease. \n 
            Seasonal component: The seasonal component of a time series is the variation in some variable due to some predetermined patterns in its behavior. \n Cyclical component: The cyclical component in a time series is the part of the movement in the variable which can be explained by other cyclical movements in the economy. \n  irregular component: this term gives information about non-seasonal patterns.\n
            \nTime series has two types of decomposition models Additive Model and Multiplicative model. The plot shows the decomposition of your time series data in its seasonal component, its trend component and the remainder. If you add or multiply the decomposition together you would get back the actual data. First block represents original series , second represents trend , third represents seasonality presents, fourth represents error component or residual. 
            \nFor additive if we add below three blocks we get original data series. Similarly for multiplicative we have to multiply the components. """)
    


In [None]:
decomposition(df[target_col])

## Stationarity Check

## Stationarity Check Plot

In [None]:
# Plot for checking stationarity
def stationarity_check_plot(timeseries,col_name):
    """Plot ``timeseries`` with its 12-observation rolling mean and std.

    ``col_name`` is only used in the figure title.
    """
    # Rolling statistics over a 12-observation window.
    rolmean = timeseries.rolling(12).mean()
    rolstd = timeseries.rolling(12).std()

    plt.figure(figsize=(20,6))
    curves = (
        (timeseries, 'blue', 'Original'),
        (rolmean, 'red', 'Rolling Mean'),
        (rolstd, 'black', 'Rolling Std'),
    )
    for values, colour, legend_label in curves:
        plt.plot(values, color=colour, label=legend_label)
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation for {}'.format(col_name))
    plt.show(block=False)
    print("Interpretation:\n\n Stationarity:\n\n Stationarity means that the statistical properties of a process generating a time series do not change over time. That is Mean and Standard deviation is approximately constant over time.\n\nStationarity Graph represents stationarity of the series w.r.t. Time. X axis depicts time and Y axis depicts Dependent variable . Blue line represents the original Time series data , Red line represents Mean of the series data and Black line represents standard deviation of the series. ")

In [None]:
# Pass only the target series: the frame gains helper columns in later cells,
# and on a re-run they would all be drawn under a title naming one column.
stationarity_check_plot(df[target_col],target_col)

In [None]:
# Adf test for checking stationarity and display output to user
def adf_test(series):
    """Run the Augmented Dickey-Fuller test on ``series`` and print a report.

    Missing values are dropped first, so differenced series can be passed
    directly. The null hypothesis of the test is that the series has a unit
    root; it is rejected when the p-value is at most 0.05.

    Parameters
    ----------
    series : pandas.Series

    Returns
    -------
    pandas.Series
        Test statistic, p-value, number of lags used, number of observations
        and the critical values, labelled by name.
    """
    print('Augmented Dickey-Fuller Test: ')
    result = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data

    labels = ['ADF test statistic','p-value','# lags used','# observations']
    out = pd.Series(result[0:4],index=labels)

    # result[4] maps each significance level to its critical value.
    for key,val in result[4].items():
        out[f'critical value ({key})']=val

    print(out.to_string(), '\n')          # .to_string() removes the line "dtype: float64"

    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

    return out


In [None]:
adf_test(df[target_col])

In [None]:
# KPSS test for stationarity and display output
from statsmodels.tsa.stattools import kpss
def kpss_test(series):
    """Run the KPSS test on ``series`` and print the statistic, p-value,
    number of lags, critical values and a stationarity verdict.

    The series is reported as not stationary when the p-value is below 0.05.
    """
    statistic, p_value, n_lags, critical_values = kpss(series)

    report = [
        f'KPSS Statistic: {statistic}',
        f'p-value: {p_value}',
        f'num lags: {n_lags}',
        'Critial Values:',
    ]
    report.extend(f'   {key} : {value}' for key, value in critical_values.items())

    negation = "not " if p_value < 0.05 else ""
    report.append(f'Result: The series is {negation}stationary')

    for line in report:
        print(line)

In [None]:
# Test the target series only, like the ADF cell above; the whole frame stops
# being a valid 1-D input once later cells add helper columns to it.
kpss_test(df[target_col])

In [None]:
# Conversion of non stationarity to stationarity
def non_stationarity_stationarity(data,series):
    """Make ``series`` stationary according to the ADF/KPSS outcomes typed in.

    The user enters the verdict of both tests. Depending on the combination
    the series is either square-root transformed and differenced (stored in
    the columns ``data_log`` and ``data_diff`` of ``data``) or simply
    differenced once (column ``diff_1``), and the ADF test is run again on
    the result. Answers are matched ignoring case and surrounding spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the derived columns (modified in place).
    series : pandas.Series
        The series to transform, typically a column of ``data``.
    """
    def _transform_then_difference():
        # Note: the column is called 'data_log' but holds the square root.
        data['data_log']=np.sqrt(series)
        data['data_diff']=data['data_log'].diff()
        adf_test(data['data_diff'])  # adf_test drops the leading NaN itself

    def _difference_once():
        data["diff_1"] =series.diff(periods=1)
        adf_test(data['diff_1'])

    # Local names deliberately differ from the imported ``kpss`` function.
    adf_verdict=input("Enter result of adf test either stationary or non stationary :\n").strip().lower()
    kpss_verdict=input("Enter result of kpss test either stationary or non stationary :\n").strip().lower()
    verdicts = (adf_verdict, kpss_verdict)

    if verdicts == ("stationary", "stationary"):
        print("Data has no unit root and is Stationary")

    elif verdicts == ("non stationary", "non stationary"):
        print("Data has unit root and is non stationary, please make data stationary")

        choice=input("Enter T for Transformation method or D for differencing method :\n")

        if choice == 'T':
            _transform_then_difference()
        elif choice == 'D':
            _difference_once()
        else:
            print(" This is invalid choice. Please choose Either T or D")

    elif verdicts == ("non stationary", "stationary"):
        _transform_then_difference()

    elif verdicts == ("stationary", "non stationary"):
        _difference_once()

    else:
        print("Please enter valid input")

In [None]:
non_stationarity_stationarity(df,df[target_col])

# ACF PACF


In [None]:
# Plots for ACF and PACF
def ACF_PACF(series):
    """Plot the ACF and PACF of ``series`` for a user-chosen number of lags
    and print guidance on how to read both plots.
    """
    lags=int(input("Ideal Choice for lags are considered to be 10% to 30% of the length of the data.That means lags between 10 to 30 might be used, Please choose Accordingly.:\n"))
    plt.rcParams.update({'figure.figsize': (20,6)})

    # Lag 0 is omitted from both plots (zero=False).
    sm.graphics.tsa.plot_acf(series, title='auto correlation ', lags=lags, zero=False)
    sm.graphics.tsa.plot_pacf(series, title='partial auto correlation ', lags=lags, zero=False)

    print("Interpretation : \n ")
    print("""ACF represnts auto correlation between varibles w.r.t Time into consideration all components of time series.PACF represnts correlation function of the variables with residuals partially . \n""")
    print("Both ACF & PACF starts at lag 0 , which is the correlation of variables with itself and therefore results in a correlation of 1. Difference between both is inclusion and exclusion of indirect correlations. Blue area depicts 95% confidence interval.\n")
    print("CONCLUSION:\n")
    print( """ Sharp Drop Point: 
            Instant drop lag just after lag 0.

            ACF sharp drop point implies MA order & PACF sharp drop point implies AR order 

            Some basic approach for model choosing are as follows:

            1. ACF plot declines gradually and PACF drops instantly use AR model.
            2. ACF drops instantly and PACF declines gradually use MA model. 
            3. Both declines gradually use ARMA model
            4. Both drops instantly we are not able to model the time series.

            Note:

            ARIMA and SARIMA models are Intergrated ARMA models we will use the same identified orders from both the plots.


            """)

In [None]:
ACF_PACF(df[target_col])

# Train Test Split

In [None]:
# spliting dataset
def split(data):
    """Split ``data`` chronologically into a train and a test part.

    The user types the fraction of rows that goes into the training part
    (for example ``0.75``). The prompt is repeated until the answer is a
    number strictly between 0 and 1, so neither part can silently end up
    empty because of a typo such as ``75``.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` holds the first
        ``int(len(data) * fraction)`` rows and ``test`` the rest.
    """
    prompt = "Please enter the size of percentage where you want to split the data-for eg 0.75 for 75% or 0.80 for 80%"

    while True:
        try:
            size_input = float(input(prompt))
        except ValueError:
            size_input = None
        # The chained comparison also rejects NaN.
        if size_input is not None and 0 < size_input < 1:
            break
        print("enter a valid input")

    size = int(len(data) * size_input)
    train = data[:size]
    test = data[size:]
    return (train, test)
    
    
    




In [None]:
train,test= split(df)

In [None]:
train.head()#ignore this

In [None]:
train.shape#ignore this

In [None]:
test.shape#ignore this

# Forecasting 

## AutoArima

In [None]:
# Autoarima model
def gen_autoArima(df, col, m, f, periods, maxp=5, maxd=2, maxq=5, maxP=5, maxD=2, maxQ=5):
    """Fit an auto-ARIMA model on ``df[col]`` and plot/export its forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data indexed by date.
    col : str
        Column to model.
    m : int
        Seasonal period passed to ``auto_arima``.
    f : str
        Frequency alias used to build the forecast dates.
    periods : int
        Number of steps to forecast.
    maxp, maxd, maxq, maxP, maxD, maxQ : int
        Upper bounds of the order search.

    Side effects: prints the model summary and the predictions, writes
    ``Output_forecast.xlsx`` and saves the plot as ``Forecast_autoARIMA.png``.
    """
    automodel = auto_arima(df[col], seasonal=True, m=m, start_p=0, start_q=0, d=None, D=None, stepwise=True,
                           max_p=maxp, max_d=maxd, max_q=maxq,
                           max_P=maxP, max_D=maxD, max_Q=maxQ)
    print(automodel.summary())
    preds, confint = automodel.predict(n_periods=periods, return_conf_int=True)

    # Forecast dates start one step AFTER the last observation; building the
    # range from the last observation itself would overlap the training data.
    index_of_fc = pd.date_range(df.index[-1], periods=periods + 1, freq=f)[1:]

    # Use the raw values: if predict() returns an indexed Series, passing it
    # to pd.Series(..., index=...) would reindex it and may yield only NaN.
    forecast_values = np.asarray(preds)
    bounds = np.asarray(confint)
    fitted_series = pd.Series(forecast_values, index=index_of_fc)
    lower_series = pd.Series(bounds[:, 0], index=index_of_fc)
    upper_series = pd.Series(bounds[:, 1], index=index_of_fc)
    print(preds)

    plt.plot(df[col])
    plt.plot(fitted_series, color='darkgreen')
    fitted_series.to_excel('Output_forecast.xlsx')
    plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k', alpha=.15)
    plt.savefig('Forecast_autoARIMA.png')

In [None]:
gen_autoArima(train, target_col, 1, 'M', 98, maxp=5, maxd=2, maxq=5, maxP=5, maxD=2, maxQ=5)

In [None]:
def gen_autoArima_plotly(df, col, m, f, periods, maxp=5, maxd=2, maxq=5, maxP=5, maxD=2, maxQ=5):
    """Fit an auto-ARIMA model on ``df[col]`` and show the forecast interactively.

    Same parameters as the matplotlib variant: ``m`` is the seasonal period,
    ``f`` the frequency alias of the forecast dates and ``periods`` the
    number of steps to forecast; the ``max*`` arguments bound the order
    search. Prints the model summary and predictions, writes
    ``Output_forecast_plotly.xlsx`` and draws history plus forecast with
    cufflinks ``iplot``.
    """
    automodel = auto_arima(df[col], seasonal=True, m=m, start_p=0, start_q=0, d=None, D=None, stepwise=True,
                           max_p=maxp, max_d=maxd, max_q=maxq,
                           max_P=maxP, max_D=maxD, max_Q=maxQ)
    print(automodel.summary())
    preds, confint = automodel.predict(n_periods=periods, return_conf_int=True)

    # Forecast dates start one step AFTER the last observation.
    index_of_fc = pd.date_range(df.index[-1], periods=periods + 1, freq=f)[1:]

    # Raw values avoid an index-alignment reindex when preds is a Series.
    fitted_series = pd.Series(np.asarray(preds), index=index_of_fc)
    print(preds)
    fitted_series.to_excel('Output_forecast_plotly.xlsx')

    # Plot the data that was actually modelled (the ``df``/``col`` arguments).
    fitted_dataframe = fitted_series.to_frame(name='forecast')
    pd.concat([df[col], fitted_dataframe], axis=1).iplot()
In [None]:
gen_autoArima_plotly(train, target_col, 1, 'M', 49, maxp=5, maxd=2, maxq=5, maxP=5, maxD=2, maxQ=5)

## Functions for simpler models

In [None]:
#function for naive model
def naive_method(test_df):
    """Naive forecast: repeat the last training observation over the test period.

    Uses the notebook-level ``train`` frame and ``target_col``. Plots train,
    test and forecast, and returns a one-row DataFrame with the columns
    ``Method``, ``RMSE`` and ``MAPE`` (MAPE in percent), rounded to 2 decimals.
    """
    actual = test_df[target_col]
    y_hat_naive = test_df.copy()
    # Last training value, taken by position (the index holds dates).
    y_hat_naive['naive_forecast'] = train[target_col].iloc[-1]

    plt.figure(figsize=(12,4))
    plt.plot(train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_naive['naive_forecast'], label='Naive forecast')
    plt.legend(loc='best')
    plt.title('Naive Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_naive['naive_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)
    results = pd.DataFrame({'Method':['Naive method'], 'MAPE': [mape], 'RMSE': [rmse]})
    results = results[['Method', 'RMSE', 'MAPE']]
    return results

In [None]:
#average method
def average_method(test_df):
    """Average forecast: use the training mean for every test period.

    Uses the notebook-level ``train`` frame and ``target_col``. Plots train,
    test and forecast, and returns a one-row DataFrame with the columns
    ``Method``, ``RMSE`` and ``MAPE`` (MAPE in percent), rounded to 2 decimals.
    """
    actual = test_df[target_col]
    y_hat_average = test_df.copy()
    y_hat_average['average_forecast'] = train[target_col].mean()

    plt.figure(figsize=(12,4))
    plt.plot(train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_average['average_forecast'], label='Average forecast')
    plt.legend(loc='best')
    plt.title('Average Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_average['average_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)
    results = pd.DataFrame({'Method':['Average method'], 'MAPE': [mape], 'RMSE': [rmse]})
    results = results[['Method', 'RMSE', 'MAPE']]
    return results

In [None]:
#function for Simple Moving average model
def simple_moving_average(df, ma_window):
    """Simple moving-average forecast.

    The rolling mean of ``df[target_col]`` over ``ma_window`` observations is
    computed; from the end of the training period onwards the last in-sample
    rolling mean is carried forward as the forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set (training rows followed by test rows).
    ma_window : int
        Rolling window length.

    Uses the notebook-level ``train`` frame and ``target_col``. Returns a
    one-row DataFrame with ``Method``, ``RMSE`` and ``MAPE`` (percent).
    """
    train_len = len(train)

    sma = df[target_col].rolling(ma_window).mean()
    # Freeze the forecast at the last value known at the end of training.
    sma.iloc[train_len:] = sma.iloc[train_len - 1]

    y_hat_sma = df.copy()
    y_hat_sma['sma_forecast'] = sma

    actual = df[target_col].iloc[train_len:]
    forecast = y_hat_sma['sma_forecast'].iloc[train_len:]

    plt.figure(figsize=(12,4))
    plt.plot(train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_sma['sma_forecast'], label='Simple moving average forecast')
    plt.legend(loc='best')
    plt.title('Simple Moving Average Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - forecast
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)
    results = pd.DataFrame({'Method':['Simple moving average forecast'], 'RMSE': [rmse],'MAPE': [mape] })
    results = results[['Method', 'RMSE', 'MAPE']]
    return results

In [None]:
#function for simple exponential smoothing model
def simple_exponential_smoothing(test_df,forecast_duration):
    """Simple exponential smoothing forecast (smoothing level fixed at 0.2).

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test rows; the forecast is stored alongside them for evaluation.
    forecast_duration : int
        Number of steps to forecast.

    Uses the notebook-level ``train`` frame and ``target_col``. Returns a
    one-row DataFrame with ``Method``, ``RMSE`` and ``MAPE`` (percent).
    """
    # The notebook's import cell does not bring this class into scope.
    from statsmodels.tsa.holtwinters import SimpleExpSmoothing

    model = SimpleExpSmoothing(train[target_col])
    model_fit = model.fit(smoothing_level=0.2,optimized=False)

    actual = test_df[target_col]
    y_hat_ses = test_df.copy()
    y_hat_ses['ses_forecast'] = model_fit.forecast(forecast_duration)

    plt.figure(figsize=(12,4))
    plt.plot(train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_ses['ses_forecast'], label='Simple exponential smoothing forecast')
    plt.legend(loc='best')
    plt.title('Simple Exponential Smoothing Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_ses['ses_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)

    results = pd.DataFrame({'Method':['Simple exponential smoothing forecast'], 'RMSE': [rmse],'MAPE': [mape] })
    return results

In [None]:
#function for holt exponential smoothing model
def HoltExponentialSmoothing(test_df,seasonal_periods,forecast_duration):
    """Holt's linear-trend exponential smoothing forecast.

    Fits an additive-trend, non-seasonal model with fixed smoothing
    parameters (level 0.2, slope 0.01) on the training target values.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test rows; the forecast is stored alongside them for evaluation.
    seasonal_periods : int
        Forwarded to the model (the model itself is built with
        ``seasonal=None``).
    forecast_duration : int
        Number of steps to forecast; the values are assigned to the test
        rows by position.

    Uses the notebook-level ``train`` frame and ``target_col``. Returns a
    one-row DataFrame with ``Method``, ``RMSE`` and ``MAPE`` (percent).
    """
    model = ExponentialSmoothing(np.asarray(train[target_col]) ,seasonal_periods=seasonal_periods ,trend='additive', seasonal=None)
    # NOTE(review): newer statsmodels releases name this argument
    # `smoothing_trend`; switch once the minimum version is confirmed.
    model_fit = model.fit(smoothing_level=0.2, smoothing_slope=0.01, optimized=False)
    print(model_fit.params)

    actual = test_df[target_col]
    y_hat_holt = test_df.copy()
    y_hat_holt['holt_forecast'] = model_fit.forecast(forecast_duration)

    plt.figure(figsize=(12,4))
    plt.plot( train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_holt['holt_forecast'], label='Holt\'s exponential smoothing forecast')
    plt.legend(loc='best')
    plt.title('Holt\'s Exponential Smoothing Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_holt['holt_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)

    results = pd.DataFrame({'Method':['Holt\'s exponential smoothing method'], 'RMSE': [rmse],'MAPE': [mape] })
    return results

In [None]:
#function for Holtwinter exp smoothing additive model
def Holtwinter_exponentialsmoothing_additive(test_df,seasonal_periods,forecast_duration):
    """Holt-Winters forecast with additive trend and additive seasonality.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test rows; the forecast is stored alongside them for evaluation.
    seasonal_periods : int
        Number of observations per seasonal cycle.
    forecast_duration : int
        Number of steps to forecast; the values are assigned to the test
        rows by position.

    Uses the notebook-level ``train`` frame and ``target_col``. Returns a
    one-row DataFrame with ``Method``, ``RMSE`` and ``MAPE`` (percent).
    """
    actual = test_df[target_col]
    # Work on the frame that was passed in, not on the notebook-level `test`.
    y_hat_hwa = test_df.copy()
    model = ExponentialSmoothing(np.asarray(train[target_col]) ,seasonal_periods=seasonal_periods ,trend='add', seasonal='add')
    model_fit = model.fit(optimized=True)
    y_hat_hwa['hw_forecast'] = model_fit.forecast(forecast_duration)

    plt.figure(figsize=(12,4))
    plt.plot( train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_hwa['hw_forecast'], label='Holt Winters\'s additive forecast')
    plt.legend(loc='best')
    plt.title('Holt Winters\' Additive Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_hwa['hw_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)
    results = pd.DataFrame({'Method':['Holt Winters\' additive method'], 'RMSE': [rmse],'MAPE': [mape] })
    return results

In [None]:
#function for holtwinter exp smoothing multiplicative method
def Holtwinter_exponentialsmoothing_multiplicative(test_df,seasonal_periods,forecast_duration):
    """Holt-Winters forecast with additive trend and multiplicative seasonality.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test rows; the forecast is stored alongside them for evaluation.
    seasonal_periods : int
        Number of observations per seasonal cycle.
    forecast_duration : int
        Number of steps to forecast; the values are assigned to the test
        rows by position.

    Uses the notebook-level ``train`` frame and ``target_col``. Returns a
    one-row DataFrame with ``Method``, ``RMSE`` and ``MAPE`` (percent).
    """
    actual = test_df[target_col]
    # Work on the frame that was passed in, not on the notebook-level `test`.
    y_hat_hwm = test_df.copy()
    model = ExponentialSmoothing(np.asarray(train[target_col]) ,seasonal_periods=seasonal_periods ,trend='add', seasonal='mul')
    model_fit = model.fit(optimized=True)
    y_hat_hwm['hw_forecast'] = model_fit.forecast(forecast_duration)

    plt.figure(figsize=(12,4))
    plt.plot( train[target_col], label='Train')
    plt.plot(actual, label='Test')
    plt.plot(y_hat_hwm['hw_forecast'], label='Holt Winters\'s multiplicative forecast')
    plt.legend(loc='best')
    plt.title('Holt Winters\' multiplicative Method')
    plt.show()

    # Metrics computed with numpy; no scikit-learn import is needed.
    errors = actual - y_hat_hwm['hw_forecast']
    rmse = np.round(np.sqrt(np.mean(errors ** 2)), 2)
    mape = np.round(np.mean(np.abs(errors)/actual)*100,2)
    results = pd.DataFrame({'Method':['Holt Winters\' multiplicative method'], 'RMSE': [rmse],'MAPE': [mape] })
    return results