In [4]:
import pandas as pd 
import numpy as np 
from pmdarima.model_selection import train_test_split
from neuralprophet import NeuralProphet
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
import mlflow
import mlflow.pyfunc
import os
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("train_data.csv")
df

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
91295,2017-12-27,1,50,38
91296,2017-12-28,1,50,52
91297,2017-12-29,1,50,59
91298,2017-12-30,1,50,66


In [3]:
"""data for initial first model traning"""
base_data = df[df["date"] < "2017-12-01"]
base_data

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
91264,2017-11-26,1,50,92
91265,2017-11-27,1,50,54
91266,2017-11-28,1,50,80
91267,2017-11-29,1,50,54


In [4]:
"""data for foracsating next month and building new model with updated data"""
first_month_data = df[df["date"] > "2017-12-01"]
first_month_data

Unnamed: 0,date,store,item,sales
1796,2017-12-02,1,1,16
1797,2017-12-03,1,1,31
1798,2017-12-04,1,1,7
1799,2017-12-05,1,1,20
1800,2017-12-06,1,1,17
...,...,...,...,...
91295,2017-12-27,1,50,38
91296,2017-12-28,1,50,52
91297,2017-12-29,1,50,59
91298,2017-12-30,1,50,66


# Phase 1 : Preprocess

In [5]:
"""In pre process first grouping by items i.e. 50 
   then by droping unique columns geting final X and Y dataframe"""
def pre_pros(df):
    y = {}
    X = {}
    for item,data in df.groupby("item"):
        data = data.drop([col for col in data.columns if data[col].nunique()==1], axis=1)
        data.rename(columns={"sales" : f"sales_{item}"}, inplace=True)
        X.update({"date" : data["date"].values})
        y.update({f"sales_{item}" : data[f"sales_{item}"].values})
    df_X = pd.DataFrame(X)
    df_y = pd.DataFrame(y)
    df_final = pd.concat([df_X, df_y], axis=1)
    
    if not os.path.exists(r"D:\MLOPs POC\prepros_sales_data\updated_sales_data.csv"):
        os.mkdir("prepros_sales_data") 
        df_final.to_csv(r"D:\MLOPs POC\prepros_sales_data\updated_sales_data.csv", index=False )
    else:
        df_old = pd.read_csv(r"D:\MLOPs POC\prepros_sales_data\updated_sales_data.csv")
        df_new = df_final
        df_final = pd.concat([df_old,df_new], axis=0, ignore_index= True)
        df_final.to_csv(r"D:\MLOPs POC\prepros_sales_data\updated_sales_data.csv", index=False)
    return df_final

In [6]:
proc_data = pre_pros(base_data)

In [7]:
proc_data

Unnamed: 0,date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
1,2013-01-02,11,43,30,11,6,36,23,37,23,...,15,24,27,15,40,30,15,26,10,32
2,2013-01-03,14,23,14,8,8,18,34,38,25,...,5,14,19,11,42,30,5,25,17,25
3,2013-01-04,13,18,10,19,9,19,36,54,22,...,9,22,29,22,49,37,13,26,22,32
4,2013-01-05,10,34,23,12,8,31,38,51,29,...,13,18,34,19,52,28,12,28,15,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,2017-11-26,24,78,43,28,32,73,81,85,58,...,25,42,59,41,92,72,34,66,24,92
1791,2017-11-27,10,58,23,21,20,50,50,54,50,...,16,24,33,27,76,49,19,34,31,54
1792,2017-11-28,14,57,36,23,18,60,53,71,64,...,15,34,46,31,79,65,14,53,27,80
1793,2017-11-29,28,66,42,18,9,67,61,83,53,...,38,45,53,27,80,54,22,56,23,54


# Phase 2 :  Build Model

In [8]:
def train_test_data(data, split_test = 0.2):
    train, test = train_test_split(data, test_size = split_test) # splite by dates 
    return {"train" : train, "test" : test}

In [9]:
dict_data = train_test_data(proc_data, 0.1)

In [10]:
dict_data

{'train':             date  sales_1  sales_2  sales_3  sales_4  sales_5  sales_6  \
 0     2013-01-01       13       33       15       10       11       31   
 1     2013-01-02       11       43       30       11        6       36   
 2     2013-01-03       14       23       14        8        8       18   
 3     2013-01-04       13       18       10       19        9       19   
 4     2013-01-05       10       34       23       12        8       31   
 ...          ...      ...      ...      ...      ...      ...      ...   
 1610  2017-05-30       17       57       44       26       20       65   
 1611  2017-05-31       20       60       55       25       27       69   
 1612  2017-06-01       29       83       57       22       27       65   
 1613  2017-06-02       25       85       54       19       19       56   
 1614  2017-06-03       36      106       37       34       27       56   
 
       sales_7  sales_8  sales_9  ...  sales_41  sales_42  sales_43  sales_44  \
 0      

In [11]:
"""model evaluation metrics"""
item_metrics = {}
def eval_model(train_true,train_pred,test_true,test_pred):
    MAPE_train = mean_absolute_percentage_error(train_true,train_pred)
    MAPE_test = mean_absolute_percentage_error(test_true,test_pred)
    dict_eval = {"MAPE_train" : MAPE_train, "MAPE_test" : MAPE_test}
    return dict_eval

"""building nuralprofet model for each item i.e. 50 models"""
def model_build_NP(data):    
    """using only train and test data"""
    train = data["train"]
    test = data["test"]
    # date time formate
    train["date"] = pd.to_datetime(train["date"], format="%Y-%m-%d") 
    test["date"] = pd.to_datetime(test["date"], format="%Y-%m-%d")
    
    model_dict = {}
    col_names = train.columns
    for y_indx in range(1, len(train.columns)):
        """item key for model_dict"""
        item_name = col_names[y_indx]
            
        #with mlflow.start_run(run_name=f"{item_name}_run") as run:
        """train data processing as pere NP formate"""
        tsdf_train = train.iloc[:, [0, y_indx]]
        tsdf_train.columns = ["ds", "y"] 

        """test data processing as pere NP formate"""
        tsdf_test = test.iloc[:, [0, y_indx]]
        tsdf_test.columns = ["ds", "y"] 

        """model traning"""
        # n_lags= 7 perform good but issue in 30 days data creation for fourcast
        npf = Prophet()
        npf.fit(tsdf_train)
        model_dict[item_name] = npf

        """train and test pred and true values"""
        pred_train = npf.predict(tsdf_train)
        pred_test = npf.predict(tsdf_test)
        # change with lag eg. lag 7 : pred.iloc[7:,1], pred.iloc[7:,2]
        train_true = pred_train.iloc[:,1]
        train_pred = pred_train.iloc[:,2]
        test_true = pred_test.iloc[:,1]
        test_pred = pred_test.iloc[:,2]

        """model evaluation metrics"""
        dict_eval = eval_model(train_true,train_pred,test_true,test_pred)
        item_metrics[item_name] = dict_eval

        """mlflow tracking metrics and model item wise i.e. 50"""
        #mlflow.prophet.log_model(npf, f"{item_name}_model") # we need model train on whole data not just on train data 
        #mlflow.log_metric("MAPE_train",dict_eval["MAPE_train"])
        #mlflow.log_metric("MAPE_test",dict_eval["MAPE_test"])
    return model_dict



In [12]:
dict_models = model_build_NP(dict_data)

19:44:29 - cmdstanpy - INFO - Chain [1] start processing
19:44:29 - cmdstanpy - INFO - Chain [1] done processing
19:44:30 - cmdstanpy - INFO - Chain [1] start processing
19:44:30 - cmdstanpy - INFO - Chain [1] done processing
19:44:31 - cmdstanpy - INFO - Chain [1] start processing
19:44:31 - cmdstanpy - INFO - Chain [1] done processing
19:44:32 - cmdstanpy - INFO - Chain [1] start processing
19:44:32 - cmdstanpy - INFO - Chain [1] done processing
19:44:33 - cmdstanpy - INFO - Chain [1] start processing
19:44:33 - cmdstanpy - INFO - Chain [1] done processing
19:44:34 - cmdstanpy - INFO - Chain [1] start processing
19:44:34 - cmdstanpy - INFO - Chain [1] done processing
19:44:35 - cmdstanpy - INFO - Chain [1] start processing
19:44:35 - cmdstanpy - INFO - Chain [1] done processing
19:44:36 - cmdstanpy - INFO - Chain [1] start processing
19:44:36 - cmdstanpy - INFO - Chain [1] done processing
19:44:37 - cmdstanpy - INFO - Chain [1] start processing
19:44:37 - cmdstanpy - INFO - Chain [1]

In [13]:
"""all the 50 model for each item"""
dict_models

{'sales_1': <prophet.forecaster.Prophet at 0x255a8ac6b20>,
 'sales_2': <prophet.forecaster.Prophet at 0x255aed8a6d0>,
 'sales_3': <prophet.forecaster.Prophet at 0x255a1d295e0>,
 'sales_4': <prophet.forecaster.Prophet at 0x255b0e68310>,
 'sales_5': <prophet.forecaster.Prophet at 0x255aed8b280>,
 'sales_6': <prophet.forecaster.Prophet at 0x255ae8cfc10>,
 'sales_7': <prophet.forecaster.Prophet at 0x255a1d294f0>,
 'sales_8': <prophet.forecaster.Prophet at 0x255b0e50340>,
 'sales_9': <prophet.forecaster.Prophet at 0x255ae9f21c0>,
 'sales_10': <prophet.forecaster.Prophet at 0x255b1729490>,
 'sales_11': <prophet.forecaster.Prophet at 0x255af085670>,
 'sales_12': <prophet.forecaster.Prophet at 0x255b1886760>,
 'sales_13': <prophet.forecaster.Prophet at 0x255b1e25ac0>,
 'sales_14': <prophet.forecaster.Prophet at 0x255b17c7070>,
 'sales_15': <prophet.forecaster.Prophet at 0x255b10731f0>,
 'sales_16': <prophet.forecaster.Prophet at 0x255b0e7b8e0>,
 'sales_17': <prophet.forecaster.Prophet at 0x255

In [14]:
"""item wise performance metrics"""
item_metrics

{'sales_1': {'MAPE_train': 0.31293262154274404,
  'MAPE_test': 0.1816406446365615},
 'sales_2': {'MAPE_train': 0.23635606019912578,
  'MAPE_test': 0.13632838222863575},
 'sales_3': {'MAPE_train': 0.26584104061628766,
  'MAPE_test': 0.15077071225737426},
 'sales_4': {'MAPE_train': 0.31270471028770236,
  'MAPE_test': 0.19140446871260752},
 'sales_5': {'MAPE_train': 0.33070292608222035,
  'MAPE_test': 0.195850931763766},
 'sales_6': {'MAPE_train': 0.23516444437777825,
  'MAPE_test': 0.1371350899118209},
 'sales_7': {'MAPE_train': 0.23908950999327924,
  'MAPE_test': 0.1379815972358593},
 'sales_8': {'MAPE_train': 0.21826527167601414,
  'MAPE_test': 0.12600744452158355},
 'sales_9': {'MAPE_train': 0.24034952251228348,
  'MAPE_test': 0.13957642239248752},
 'sales_10': {'MAPE_train': 0.2229429130877759,
  'MAPE_test': 0.12457737278366593},
 'sales_11': {'MAPE_train': 0.22326183535983068,
  'MAPE_test': 0.1269732159338189},
 'sales_12': {'MAPE_train': 0.22825447201580062,
  'MAPE_test': 0.1279

# phase 3: final model building with all data (we can add model compare in future here)

In [15]:
def final_model_build_NP(data): 
    """considering whole data for traning final models"""
    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d") # date time formate
    final_model_dict = {}
    col_names = data.columns
    for y_indx in range(1, len(data.columns)):
        item_name = col_names[y_indx] # item key for model_dict
        with mlflow.start_run(run_name=f"{item_name}_run") as run:
            tsdf = data.iloc[:, [0, y_indx]]
            tsdf.columns = ["ds", "y"] # NP formate colums
            # model traning
            npf = Prophet() 
            npf.fit(tsdf)
            final_model_dict[item_name] = npf
              
            """mlflow tracking metrics and model item wise i.e. 50"""
            mlflow.prophet.log_model(npf, f"{item_name}_model") # we need model train on whole data not just on train data
            """eval metrics from traning and testing model"""
            dict_eval = item_metrics[item_name]
            mlflow.log_metric("MAPE_train",dict_eval["MAPE_train"])
            mlflow.log_metric("MAPE_test",dict_eval["MAPE_test"])
    return final_model_dict

In [16]:
final_models = final_model_build_NP(proc_data)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

19:45:18 - cmdstanpy - INFO - Chain [1] start processing
19:45:19 - cmdstanpy - INFO - Chain [1] done processing
19:45:19 - cmdstanpy - INFO - Chain [1] start processing
19:45:19 - cmdstanpy - INFO - Chain [1] done processing
19:45:19 - cmdstanpy - INFO - Chain [1] start processing
19:45:19 - cmdstanpy - INFO - Chain [1] done processing
19:45:20 - cmdstanpy - INFO - Chain [1] start processing
19:45:20 - cmdstanpy - INFO - Chain [1] done processing
19:45:20 - cmdstanpy - INFO - Chain [1] start processing
19:45:20 - cmdstanpy - INFO - Chain [1] done processing
19:45:20 - cmdstanpy - INFO 

In [17]:
final_models

{'sales_1': <prophet.forecaster.Prophet at 0x255aed8bb20>,
 'sales_2': <prophet.forecaster.Prophet at 0x255b25d5be0>,
 'sales_3': <prophet.forecaster.Prophet at 0x255b16c95e0>,
 'sales_4': <prophet.forecaster.Prophet at 0x255b25d9100>,
 'sales_5': <prophet.forecaster.Prophet at 0x255b26801c0>,
 'sales_6': <prophet.forecaster.Prophet at 0x255b175a2b0>,
 'sales_7': <prophet.forecaster.Prophet at 0x255b273b070>,
 'sales_8': <prophet.forecaster.Prophet at 0x255b12351c0>,
 'sales_9': <prophet.forecaster.Prophet at 0x255b2bd6b20>,
 'sales_10': <prophet.forecaster.Prophet at 0x255b2a9e820>,
 'sales_11': <prophet.forecaster.Prophet at 0x255b2c2a6d0>,
 'sales_12': <prophet.forecaster.Prophet at 0x255b27dc370>,
 'sales_13': <prophet.forecaster.Prophet at 0x255b2c19130>,
 'sales_14': <prophet.forecaster.Prophet at 0x255b2aa9040>,
 'sales_15': <prophet.forecaster.Prophet at 0x255b2e9a070>,
 'sales_16': <prophet.forecaster.Prophet at 0x255b2f04a00>,
 'sales_17': <prophet.forecaster.Prophet at 0x255

# Phase 4 : fourcast next month with final models

In [18]:
def fourcast(final_models,process_data,period=30):    
    process_data["date"] = pd.to_datetime(process_data["date"], format="%Y-%m-%d") # date time formate

    col_names = process_data.columns
    
    dict_fourcast = {}
    for y_indx in range(1, len(process_data.columns)):
        tsdf = process_data.iloc[:, [0, y_indx]]
        item_name = col_names[y_indx] # item key for model_dict
        
        tsdf.columns = ["ds", "y"] # NP formate colums
        # model pred and eval
        npfinal = final_models[item_name]
        future = npfinal.make_future_dataframe(periods= period, include_history=False) 
        forecast = npfinal.predict(future)
        # single dataframe with all items sales
        dict_fourcast[item_name] = forecast["yhat"].values
        df_fourcast = pd.DataFrame(dict_fourcast)
        final_fourcast = pd.concat([future["ds"], df_fourcast], axis=1)
        final_fourcast.rename(columns = {"ds" : "Date"}, inplace = True)
    return final_fourcast
    

In [19]:
final_fourcast1 = fourcast(final_models, proc_data, 30)

In [20]:
final_fourcast1

Unnamed: 0,Date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2017-12-01,21.461146,56.732091,36.508991,21.995378,18.669934,56.311696,56.275436,76.49726,49.883639,...,21.278587,36.800428,51.171689,29.709783,78.515818,56.686964,21.785077,50.125728,28.572159,64.689224
1,2017-12-02,22.891849,59.616974,37.803947,23.619834,19.601652,59.31959,60.503248,80.063332,53.193619,...,22.553251,38.349515,53.362997,31.204366,81.457784,58.487182,22.756346,51.774499,29.953917,68.655083
2,2017-12-03,23.242096,62.175462,40.080671,24.323526,20.485296,62.080427,62.099121,83.610619,54.815616,...,22.582603,40.349255,56.033472,31.848294,86.945157,62.166156,23.624523,55.218688,32.00696,70.274117
3,2017-12-04,14.594405,40.191519,26.477423,15.813441,13.69302,39.929309,39.960319,54.38678,36.311316,...,14.34962,27.371596,37.406067,21.287018,55.879935,39.765636,15.722134,35.272704,20.720365,46.635924
4,2017-12-05,16.85229,46.347642,29.550642,18.421093,15.694098,46.668962,46.266546,63.773379,41.314639,...,16.644199,30.813281,41.591876,24.476882,64.2079,45.686223,18.068426,41.220441,23.521023,53.584539
5,2017-12-06,17.061863,46.438333,29.513367,18.131713,15.751666,46.50239,45.73507,62.492473,41.195743,...,16.651688,31.197781,42.026645,23.7039,63.313279,45.590675,18.095166,40.033151,22.912017,53.660735
6,2017-12-07,17.36754,48.733608,30.935497,19.753474,17.041155,47.839966,49.21623,65.663805,43.757468,...,17.385876,32.525868,43.990045,25.714695,66.184952,48.793192,19.136596,42.842811,25.419245,56.099262
7,2017-12-08,18.662473,51.592854,33.199197,19.838191,17.119311,51.654967,51.292968,70.072951,45.407792,...,18.989698,34.314625,46.518604,26.650111,70.84989,51.819335,19.87376,45.823383,25.85264,59.039568
8,2017-12-09,20.235522,54.905874,34.630811,21.519769,18.052636,54.986033,55.840424,73.963868,48.86744,...,20.548412,35.949134,48.977314,28.394781,74.233877,54.032982,21.012456,47.767657,27.417119,63.281228
9,2017-12-10,20.75832,57.919659,37.065497,22.304343,18.950466,58.099048,57.778518,77.880202,50.667641,...,20.881094,38.039037,51.954168,29.308301,80.211429,58.144396,22.057124,51.5293,29.673299,65.213402


# Phase 5 : model training with new month  data

In [22]:
proc_data1 = pre_pros(first_month_data)
dict_data1 = train_test_data(proc_data1, 0.1)
dict_models1 = model_build_NP(dict_data1)

19:52:26 - cmdstanpy - INFO - Chain [1] start processing
19:52:26 - cmdstanpy - INFO - Chain [1] done processing
19:52:27 - cmdstanpy - INFO - Chain [1] start processing
19:52:28 - cmdstanpy - INFO - Chain [1] done processing
19:52:28 - cmdstanpy - INFO - Chain [1] start processing
19:52:29 - cmdstanpy - INFO - Chain [1] done processing
19:52:29 - cmdstanpy - INFO - Chain [1] start processing
19:52:30 - cmdstanpy - INFO - Chain [1] done processing
19:52:30 - cmdstanpy - INFO - Chain [1] start processing
19:52:31 - cmdstanpy - INFO - Chain [1] done processing
19:52:31 - cmdstanpy - INFO - Chain [1] start processing
19:52:31 - cmdstanpy - INFO - Chain [1] done processing
19:52:32 - cmdstanpy - INFO - Chain [1] start processing
19:52:32 - cmdstanpy - INFO - Chain [1] done processing
19:52:33 - cmdstanpy - INFO - Chain [1] start processing
19:52:33 - cmdstanpy - INFO - Chain [1] done processing
19:52:34 - cmdstanpy - INFO - Chain [1] start processing
19:52:34 - cmdstanpy - INFO - Chain [1]

In [23]:
item_metrics #new month model metrics get updated automaticaly 

{'sales_1': {'MAPE_train': 0.3120482395469088,
  'MAPE_test': 0.2409655020999336},
 'sales_2': {'MAPE_train': 0.23545938809849726,
  'MAPE_test': 0.1838315985950451},
 'sales_3': {'MAPE_train': 0.2642563843196537,
  'MAPE_test': 0.2014685337950327},
 'sales_4': {'MAPE_train': 0.3111503175218473,
  'MAPE_test': 0.24151605072571736},
 'sales_5': {'MAPE_train': 0.3280330296003663,
  'MAPE_test': 0.25026235010915865},
 'sales_6': {'MAPE_train': 0.2325894597985513,
  'MAPE_test': 0.1829220410085376},
 'sales_7': {'MAPE_train': 0.23874217195668276,
  'MAPE_test': 0.18461857256661027},
 'sales_8': {'MAPE_train': 0.2178324462445006,
  'MAPE_test': 0.16557633265586408},
 'sales_9': {'MAPE_train': 0.23934321342693216,
  'MAPE_test': 0.191918067111268},
 'sales_10': {'MAPE_train': 0.22206520653780842,
  'MAPE_test': 0.16945335036640635},
 'sales_11': {'MAPE_train': 0.22133478909756202,
  'MAPE_test': 0.17163768076120023},
 'sales_12': {'MAPE_train': 0.2275402354357561,
  'MAPE_test': 0.1728185781

In [24]:
final_models1 = final_model_build_NP(proc_data1)
final_fourcast1 = fourcast(final_models1, proc_data1, 30)

19:54:17 - cmdstanpy - INFO - Chain [1] start processing
19:54:18 - cmdstanpy - INFO - Chain [1] done processing
19:54:18 - cmdstanpy - INFO - Chain [1] start processing
19:54:18 - cmdstanpy - INFO - Chain [1] done processing
19:54:18 - cmdstanpy - INFO - Chain [1] start processing
19:54:18 - cmdstanpy - INFO - Chain [1] done processing
19:54:19 - cmdstanpy - INFO - Chain [1] start processing
19:54:19 - cmdstanpy - INFO - Chain [1] done processing
19:54:19 - cmdstanpy - INFO - Chain [1] start processing
19:54:19 - cmdstanpy - INFO - Chain [1] done processing
19:54:19 - cmdstanpy - INFO - Chain [1] start processing
19:54:19 - cmdstanpy - INFO - Chain [1] done processing
19:54:20 - cmdstanpy - INFO - Chain [1] start processing
19:54:20 - cmdstanpy - INFO - Chain [1] done processing
19:54:20 - cmdstanpy - INFO - Chain [1] start processing
19:54:20 - cmdstanpy - INFO - Chain [1] done processing
19:54:20 - cmdstanpy - INFO - Chain [1] start processing
19:54:20 - cmdstanpy - INFO - Chain [1]

In [25]:
final_fourcast1

Unnamed: 0,Date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2018-01-01,12.990871,36.710792,22.244283,13.094758,10.805022,35.707487,33.396575,47.513961,28.673324,...,14.1958,21.068768,32.405951,18.915565,48.377535,35.993397,13.65814,31.293067,18.461178,39.453513
1,2018-01-02,15.648146,43.581068,25.791425,16.076413,13.068793,43.084344,40.414126,57.707841,34.396179,...,16.779046,24.909667,37.246424,22.52977,57.903507,42.649329,16.237121,37.867506,21.525822,47.275865
2,2018-01-03,16.292762,44.429779,26.388117,16.089837,13.410397,43.459514,40.530307,57.444818,34.858678,...,16.828457,25.665363,38.231568,22.31027,58.135331,43.258603,16.475382,37.261575,21.198691,48.157965
3,2018-01-04,16.955217,47.228217,28.270257,17.98817,14.960456,45.453094,44.650899,61.480716,38.124565,...,17.691133,27.373494,40.64424,24.573493,61.92088,47.052426,17.706782,40.564703,24.010146,51.267046
4,2018-01-05,18.515751,50.577717,31.008177,18.332595,15.324086,49.77272,47.245275,66.694356,40.431314,...,19.341199,29.448673,43.578446,25.853609,67.47689,50.684164,18.643584,44.085227,24.713965,54.844827
5,2018-01-06,20.437696,54.349928,32.880344,20.197932,16.601442,53.444063,52.282227,71.076994,44.314747,...,20.856766,31.517869,46.479631,27.71223,71.501842,53.370215,19.829655,46.430199,26.528299,59.641422
6,2018-01-07,21.222345,57.678228,35.722447,21.074979,17.690775,56.889911,54.590749,75.5484,46.754244,...,21.130032,33.914148,49.752248,28.863458,78.275266,57.919404,20.992621,50.521363,28.871422,62.064692
7,2018-01-08,12.950375,36.497665,22.646216,12.912494,11.183888,35.436004,33.358449,47.536523,29.075196,...,13.084715,21.459069,31.682247,18.772567,48.425568,36.316426,13.41439,31.384832,18.083924,39.166139
8,2018-01-09,15.512914,43.292052,26.147563,15.804155,13.42059,42.770706,40.41853,57.616247,34.854756,...,15.600516,25.357711,36.398552,22.35836,57.864348,42.938363,15.994775,37.939509,21.129341,46.947252
9,2018-01-10,16.074463,44.081812,26.696488,15.73695,13.730245,43.117335,40.590686,57.249954,35.379496,...,15.611984,26.165064,37.284126,22.122621,58.025569,43.520805,16.24525,37.326736,20.797189,47.805326


# Phase 6 : mlflow run analysis and select best 50

In [72]:
def best_models():
    list_run = mlflow.search_runs() # tags.mlflow.runName base comare model and select final 
    runName_list = set(list_run["tags.mlflow.runName"])
    best_model_runs = {}
    for run_name in runName_list:
        df_item = list_run[list_run["tags.mlflow.runName"]==run_name]
        df_item = df_item.reset_index()
        df_item = df_item.iloc[: , 1:]
        df_item = df_item.sort_values("metrics.MAPE_test")
        best_model_runs[run_name] = df_item.loc[0]["run_id"]
    return best_model_runs

In [139]:
run_model_dict = best_models()

In [140]:
run_model_dict

{'sales_7_run': '304e3853d09445cba717a631dc1a16d1',
 'sales_30_run': '04f70f57253c460ebb74b1164fccbe3b',
 'sales_42_run': '81fbc66c602a466bbd82d947291d1f65',
 'sales_29_run': 'cf1f857dc9034c34abbdaadca325820f',
 'sales_36_run': 'c7b9a1fbe51244b79a50d39bdd890cee',
 'sales_25_run': '33a71cc1ad71475e9b9af820e4565073',
 'sales_20_run': 'f24205df14914a4eac60fef139dac8b7',
 'sales_17_run': 'a5b49299fb80438fa74ec6034fe39a84',
 'sales_10_run': '0c07dc91dcad447fb307bb34142d8669',
 'sales_41_run': '85ff53939ade4970a9039cdfd54569b0',
 'sales_13_run': 'e8cd11402feb43d18e1aa8aec95127db',
 'sales_21_run': '9dca5de9d5244dcc9d0d3ac0003edbb3',
 'sales_44_run': 'd917d44808b049dd94e5c05a258b6935',
 'sales_38_run': '9d719754ee134aa1ab57bb5134e2b4db',
 'sales_16_run': '240a707720b14c6195da37b4f8750d15',
 'sales_48_run': '8f7babe2fab84bcbba4e15c0ba4f2e64',
 'sales_49_run': '088ec796cef6409e96581bf2de93793a',
 'sales_1_run': 'c46dc75cc8344237bfb1c04c31966e19',
 'sales_6_run': 'f715bee4e57a49219c5eb3ce22a0b21

# Phase 7 : Forecasting next month with selected model runs

In [187]:
def forecast_with_run_model(run_model_dict, latest_proc_data, next_month_count = 1):
    """create dataframe for next month forecasting with current data used for traning model"""
    date_last = list(proc_data1["date"])[-1]
    strat = date_last + timedelta(1)
    end = date_last + relativedelta(months=+1)
    data_next_month = pd.date_range(start= strat, end= end , freq="D")
    data_next_month_df = pd.DataFrame({'ds':data_next_month})
    
    """fetch each best model from dict and forecast"""
    dict_fourcast = {}
    for name,run_id in run_model_dict.items():
        model_folder_name = f"{name[:-3]}model"
        pyfunc_uri = f"runs:/{run_id}/{model_folder_name}" # need to provide dynamic sales_7_model i.e item name folder here
        pyfunc_model = mlflow.pyfunc.load_model(pyfunc_uri) # we are using profet flavor    
        pred_df = pyfunc_model.predict(data_next_month_df)
        
        """final single datafarem with each item sales for next month"""
        name_col = name[:-4]
        dict_fourcast[name_col] = pred_df["yhat"].values
        df_fourcast = pd.DataFrame(dict_fourcast)
        final_fourcast = pd.concat([data_next_month_df["ds"], df_fourcast], axis=1)
        final_fourcast.rename(columns = {"ds" : "Date"}, inplace = True)
        
    """make dataframe in standered formate"""
    #final_fourcast = final_fourcast.reindex(sorted(df.columns), axis=1)
    col_name_order = ["Date"]
    col_name_order.extend([f"sales_{i}" for i in range(1,51)])
    final_fourcast = final_fourcast.reindex(col_name_order, axis=1)
    return final_fourcast

In [185]:
forecasted_month = forecast_with_run_model(run_model_dict, proc_data1, 1)

In [186]:
forecasted_month # forecast with model saved with mlflow 

Unnamed: 0,Date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2018-01-01,12.990871,36.710792,22.244283,13.094758,10.805022,35.707487,33.396575,47.513961,28.673324,...,14.1958,21.068768,32.405951,18.915565,48.377535,35.993397,13.65814,31.293067,18.461178,39.453513
1,2018-01-02,15.648146,43.581068,25.791425,16.076413,13.068793,43.084344,40.414126,57.707841,34.396179,...,16.779046,24.909667,37.246424,22.52977,57.903507,42.649329,16.237121,37.867506,21.525822,47.275865
2,2018-01-03,16.292762,44.429779,26.388117,16.089837,13.410397,43.459514,40.530307,57.444818,34.858678,...,16.828457,25.665363,38.231568,22.31027,58.135331,43.258603,16.475382,37.261575,21.198691,48.157965
3,2018-01-04,16.955217,47.228217,28.270257,17.98817,14.960456,45.453094,44.650899,61.480716,38.124565,...,17.691133,27.373494,40.64424,24.573493,61.92088,47.052426,17.706782,40.564703,24.010146,51.267046
4,2018-01-05,18.515751,50.577717,31.008177,18.332595,15.324086,49.77272,47.245275,66.694356,40.431314,...,19.341199,29.448673,43.578446,25.853609,67.47689,50.684164,18.643584,44.085227,24.713965,54.844827
5,2018-01-06,20.437696,54.349928,32.880344,20.197932,16.601442,53.444063,52.282227,71.076994,44.314747,...,20.856766,31.517869,46.479631,27.71223,71.501842,53.370215,19.829655,46.430199,26.528299,59.641422
6,2018-01-07,21.222345,57.678228,35.722447,21.074979,17.690775,56.889911,54.590749,75.5484,46.754244,...,21.130032,33.914148,49.752248,28.863458,78.275266,57.919404,20.992621,50.521363,28.871422,62.064692
7,2018-01-08,12.950375,36.497665,22.646216,12.912494,11.183888,35.436004,33.358449,47.536523,29.075196,...,13.084715,21.459069,31.682247,18.772567,48.425568,36.316426,13.41439,31.384832,18.083924,39.166139
8,2018-01-09,15.512914,43.292052,26.147563,15.804155,13.42059,42.770706,40.41853,57.616247,34.854756,...,15.600516,25.357711,36.398552,22.35836,57.864348,42.938363,15.994775,37.939509,21.129341,46.947252
9,2018-01-10,16.074463,44.081812,26.696488,15.73695,13.730245,43.117335,40.590686,57.249954,35.379496,...,15.611984,26.165064,37.284126,22.122621,58.025569,43.520805,16.24525,37.326736,20.797189,47.805326


In [189]:
final_fourcast1 # forecast with model directly 

Unnamed: 0,Date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2018-01-01,12.990871,36.710792,22.244283,13.094758,10.805022,35.707487,33.396575,47.513961,28.673324,...,14.1958,21.068768,32.405951,18.915565,48.377535,35.993397,13.65814,31.293067,18.461178,39.453513
1,2018-01-02,15.648146,43.581068,25.791425,16.076413,13.068793,43.084344,40.414126,57.707841,34.396179,...,16.779046,24.909667,37.246424,22.52977,57.903507,42.649329,16.237121,37.867506,21.525822,47.275865
2,2018-01-03,16.292762,44.429779,26.388117,16.089837,13.410397,43.459514,40.530307,57.444818,34.858678,...,16.828457,25.665363,38.231568,22.31027,58.135331,43.258603,16.475382,37.261575,21.198691,48.157965
3,2018-01-04,16.955217,47.228217,28.270257,17.98817,14.960456,45.453094,44.650899,61.480716,38.124565,...,17.691133,27.373494,40.64424,24.573493,61.92088,47.052426,17.706782,40.564703,24.010146,51.267046
4,2018-01-05,18.515751,50.577717,31.008177,18.332595,15.324086,49.77272,47.245275,66.694356,40.431314,...,19.341199,29.448673,43.578446,25.853609,67.47689,50.684164,18.643584,44.085227,24.713965,54.844827
5,2018-01-06,20.437696,54.349928,32.880344,20.197932,16.601442,53.444063,52.282227,71.076994,44.314747,...,20.856766,31.517869,46.479631,27.71223,71.501842,53.370215,19.829655,46.430199,26.528299,59.641422
6,2018-01-07,21.222345,57.678228,35.722447,21.074979,17.690775,56.889911,54.590749,75.5484,46.754244,...,21.130032,33.914148,49.752248,28.863458,78.275266,57.919404,20.992621,50.521363,28.871422,62.064692
7,2018-01-08,12.950375,36.497665,22.646216,12.912494,11.183888,35.436004,33.358449,47.536523,29.075196,...,13.084715,21.459069,31.682247,18.772567,48.425568,36.316426,13.41439,31.384832,18.083924,39.166139
8,2018-01-09,15.512914,43.292052,26.147563,15.804155,13.42059,42.770706,40.41853,57.616247,34.854756,...,15.600516,25.357711,36.398552,22.35836,57.864348,42.938363,15.994775,37.939509,21.129341,46.947252
9,2018-01-10,16.074463,44.081812,26.696488,15.73695,13.730245,43.117335,40.590686,57.249954,35.379496,...,15.611984,26.165064,37.284126,22.122621,58.025569,43.520805,16.24525,37.326736,20.797189,47.805326


### next version for forecasting

In [None]:
def forecast_with_run_model(run_model_dict, latest_proc_data, next_month_count = 1):
    """create dataframe for next month forecasting with current data used for traning model"""
    date_last = list(proc_data1["date"])[-1]
    strat = date_last + timedelta(1)
    end = date_last + relativedelta(months=+1)
    data_next_month = pd.date_range(start= strat, end= end , freq="D")
    data_next_month_df = pd.DataFrame({'ds':data_next_month})
    
    """fetch each best model from dict and forecast"""
    dict_fourcast = {}
    for name,run_id in run_model_dict.items():
        model_folder_name = f"{name[:-3]}model"
        pyfunc_uri = f"runs:/{run_id}/{model_folder_name}" # need to provide dynamic sales_7_model i.e item name folder here
        pyfunc_model = mlflow.pyfunc.load_model(pyfunc_uri) # we are using profet flavor    
        pred_df = pyfunc_model.predict(data_next_month_df)
        
        """final single datafarem with each item sales for next month"""
        name_col = name[:-4]
        dict_fourcast[name_col] = pred_df["yhat"].values
        df_fourcast = pd.DataFrame(dict_fourcast)
        final_fourcast = pd.concat([data_next_month_df["ds"], df_fourcast], axis=1)
        final_fourcast.rename(columns = {"ds" : "Date"}, inplace = True)
        
    """make dataframe in standered formate"""
    #final_fourcast = final_fourcast.reindex(sorted(df.columns), axis=1)
    col_name_order = ["Date"]
    col_name_order.extend([f"sales_{i}" for i in range(1,51)])
    final_fourcast = final_fourcast.reindex(col_name_order, axis=1)
    return final_fourcast

In [3]:
import pandas as pd 

In [12]:
new_data_path = r"D:\MLOPs POC\python_files\samp_data\test\next_month_data.csv"
updated_path = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"

In [13]:
new_date_last = pd.read_csv(new_data_path)
new_date_last = list(new_date_last["date"])[-1]

updated_date_last = pd.read_csv(updated_path)
updated_date_last = list(updated_date_last["date"])[-1]

In [14]:
if new_date_last == updated_date_last:
    """no need of pre processing"""
    
else:
    pass

True

In [16]:
not new_date_last == updated_date_last

False

In [18]:
"""Performance analysis of model by using evaluation metrics i.e. MAPE
by building profet model for each item i.e. 50 models"""
import pandas as pd
from pmdarima.model_selection import train_test_split
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

eval_item_metrics = {}


def eval_model(train_true, train_pred, test_true, test_pred):
    """MAPE metrics"""
    mape_train = mean_absolute_percentage_error(train_true, train_pred)
    mape_test = mean_absolute_percentage_error(test_true, test_pred)
    dict_eval = {"MAPE_train": mape_train, "MAPE_test": mape_test}
    return dict_eval


def train_test_data(data, split_test):
    """Train test split"""
    train, test = train_test_split(data, test_size=split_test)  # splite by dates
    return {"train": train, "test": test}


def eval_model_train(data, split_test=0.2, mod, item_name):
    """Using only train and test data"""
    data_dic = train_test_data(data, split_test)
    train = data_dic["train"]
    test = data_dic["test"]
    # convert to date time formate
    train["date"] = pd.to_datetime(train["date"], format="%Y-%m-%d")
    test["date"] = pd.to_datetime(test["date"], format="%Y-%m-%d")

    model_dict = {}
    col_names = train.columns
#     for y_indx in range(1, len(train.columns)):
    # item key for model_dict
    item_name = col_names[y_indx]

    # train data processing as pere NP formate
    tsdf_train = train.iloc[:, [0, y_indx]]
    tsdf_train.columns = ["ds", "y"]

    # test data processing as pere NP formate
    tsdf_test = test.iloc[:, [0, y_indx]]
    tsdf_test.columns = ["ds", "y"]

    # model traning
#         mod = Prophet()
#         mod.fit(tsdf_train)
#         model_dict[item_name] = mod

    # train and test pred and true values
    pred_train = mod.predict(tsdf_train)
    pred_test = mod.predict(tsdf_test)
    train_true = pred_train.iloc[:, 1]
    train_pred = pred_train.iloc[:, 2]
    test_true = pred_test.iloc[:, 1]
    test_pred = pred_test.iloc[:, 2]

    # model evaluation metrics
    dict_eval = eval_model(train_true, train_pred, test_true, test_pred)
    eval_item_metrics[item_name] = dict_eval

    return model_dict


In [19]:
"""Product wise Profet model is build and check its performance by using MAPE metrics.
Store all the model and metrics data with the help of mlflow"""
import sys
import pandas as pd
import mlflow
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX

# sys.path.insert(0, "./Model")
# from model_eval import (
#     eval_item_metrics,
# )  # variable where all the MAPE values availble wrt each item


def train_final_model(data, *arg):  # *arg consist of list of algos
    """considering whole data for traning final models"""
    # date time formate
    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
    final_model_dict = {}
    col_names = data.columns

    # work on algo one by one
    for algo in arg:
        if algo == Prophet:
            algo_name = "Prophet"
            for y_indx in range(1, len(data.columns)):
                item_name = col_names[y_indx]  # item key for model_dict
                with mlflow.start_run(run_name=f"{item_name}_run") as _:
                    tsdf = data.iloc[:, [0, y_indx]]
                    tsdf.columns = ["ds", "y"]  # NP formate colums
                    # model traning
                    mod = Prophet()
                    mod.fit(tsdf)
                    final_model_dict[f"{algo_name}_{item_name}"] = mod

                    # mlflow tracking metrics and model item wise i.e. 50
                    mlflow.prophet.log_model(
                        mod, f"{algo_name}_{item_name}_model"
                    )  # model train on whole data

                    # eval metrics from traning and testing model
                    dict_eval = eval_model_train(data, split_test=0.2, mod, item_name)  # need to make it more dynamic for multipplealgo models 
################################## working ##########################################
                    # using eval_model performance metrics as per item
                    mlflow.log_metric("MAPE_train", dict_eval["MAPE_train"])
                    mlflow.log_metric("MAPE_test", dict_eval["MAPE_test"])
                    
        elif algo == SARIMAX:
            algo_name = "SARIMAX"
            for y_indx in range(1, len(data.columns)):
                item_name = col_names[y_indx]
                with mlflow.start_run(run_name=f"{item_name}_run") as _:
                    data_tsf = data.set_index("date")
                    model=SARIMAX(data_tsf, order=(1, 1, 1),seasonal_order=(1,1,1,7))
                    result = model.fit()
                    final_model_dict[f"{algo_name}_{item_name}"] = mod
                    
                    # mlflow tracking metrics and model item wise i.e. 50
                    mlflow.statsmodels.log_model(mod, f"{algo_name}_{item_name}_model") # model train on whole data

                    # eval metrics from traning and testing model
                    dict_eval = eval_item_metrics[item_name] # need to make it more dynamic for multipplealgo models 
                    
                    result.predict(start=train.index[0],end=train.index[-1])

    return final_model_dict


In [93]:
updated_path = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"
data = pd.read_csv(updated_path)
data = data.sort_values("date")

In [98]:
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")

In [107]:
data = data.resample("D", on="date").sum()

In [108]:
data

Unnamed: 0_level_0,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,sales_10,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,208,528,240,160,176,496,400,528,288,592,...,96,336,352,320,592,480,272,336,288,480
2013-01-02,176,688,480,176,96,576,368,592,368,544,...,240,384,432,240,640,480,240,416,160,512
2013-01-03,224,368,224,128,128,288,544,608,400,512,...,80,224,304,176,672,480,80,400,272,400
2013-01-04,208,288,160,304,144,304,576,864,352,720,...,144,352,464,352,784,592,208,416,352,512
2013-01-05,160,544,368,192,128,496,608,816,464,560,...,208,288,544,304,832,448,192,448,240,560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-01-27,1470,4704,4116,1911,1764,6174,6027,8820,6174,7497,...,1911,4116,5880,3969,8526,6615,2352,5145,2058,7350
2017-01-28,2352,7056,4410,2499,1470,5586,6174,8085,8085,7056,...,2940,5439,5880,3087,9114,6321,3087,4998,3528,7203
2017-01-29,3528,6909,3969,2940,2058,5880,7203,7644,5292,8379,...,3822,3381,7350,2940,9408,6468,2793,6468,4116,7203
2017-01-30,1323,4116,3969,2499,588,4116,5586,4704,4704,6321,...,1176,3822,4704,1764,5586,4704,1617,4557,2352,6321


In [109]:
# train_final_model(data, [Prophet, auto_arima])

In [None]:
# tresting on auto_arima

In [117]:
tsest_arima = data[["sales_1"]]
tsest_arima

Unnamed: 0_level_0,sales_1
date,Unnamed: 1_level_1
2013-01-01,208
2013-01-02,176
2013-01-03,224
2013-01-04,208
2013-01-05,160
...,...
2017-01-27,1470
2017-01-28,2352
2017-01-29,3528
2017-01-30,1323


In [120]:
train = tsest_arima.iloc[:1000,:]
train

Unnamed: 0_level_0,sales_1
date,Unnamed: 1_level_1
2013-01-01,208
2013-01-02,176
2013-01-03,224
2013-01-04,208
2013-01-05,160
...,...
2015-09-23,352
2015-09-24,448
2015-09-25,320
2015-09-26,320


In [121]:
test = tsest_arima.iloc[1000:,:]
test

Unnamed: 0_level_0,sales_1
date,Unnamed: 1_level_1
2015-09-28,240
2015-09-29,304
2015-09-30,240
2015-10-01,400
2015-10-02,192
...,...
2017-01-27,1470
2017-01-28,2352
2017-01-29,3528
2017-01-30,1323


In [122]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [123]:
model=SARIMAX(train, order=(1, 1, 1),seasonal_order=(1,1,1,7))
result = model.fit()
result.predict(start=train.index[0],end=train.index[-1])

date
2013-01-01      0.000000
2013-01-02    207.121996
2013-01-03    176.251227
2013-01-04    223.602915
2013-01-05    208.126465
                 ...    
2015-09-23    358.170341
2015-09-24    363.917120
2015-09-25    391.617128
2015-09-26    438.351922
2015-09-27    423.113651
Freq: D, Name: predicted_mean, Length: 1000, dtype: float64

In [124]:
result.predict(start=test.index[0],end=test.index[-1])

2015-09-28    310.308052
2015-09-29    346.072298
2015-09-30    356.484723
2015-10-01    373.493617
2015-10-02    386.122605
                 ...    
2017-01-27    485.526128
2017-01-28    517.591531
2017-01-29    528.534860
2017-01-30    407.188293
2017-01-31    444.092583
Freq: D, Name: predicted_mean, Length: 492, dtype: float64

In [68]:
test

Unnamed: 0_level_0,sales_1
date,Unnamed: 1_level_1
2016-06-04,28
2016-06-04,28
2016-06-04,28
2016-06-04,28
2016-06-04,28
...,...
2017-01-31,17
2017-01-31,17
2017-01-31,17
2017-01-31,17


In [70]:
test.index[-1]

'2017-01-31'

In [69]:
result.predict(start=test.index[0],end=test.index[-1])

KeyError: 'The `end` argument could not be matched to a location related to the index of the data.'

In [None]:
model.predict()

In [44]:
mode=SARIMAX
model = mode(train,order=(1, 1, 1))#,seasonal_order=(1,1,1,7))
result = model.fit()


NameError: name 'train' is not defined

In [32]:
result.predict(start=train.index[0],end=train.index[-1])

date
2013-01-01     0.000000
2013-01-02    12.999754
2013-01-03    11.000076
2013-01-04    13.999886
2013-01-05    13.000038
                ...    
2017-01-27    16.743548
2017-01-28    16.294170
2017-01-29    19.253491
2017-01-30    11.486846
2017-01-31    13.480249
Name: predicted_mean, Length: 23407, dtype: float64

In [73]:
"""Performance analysis of model by using evaluation metrics i.e. MAPE
by building profet model for each item i.e. 50 models"""
import pandas as pd
from pmdarima.model_selection import train_test_split
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

eval_item_metrics = {}


def eval_model(train_true, train_pred, test_true, test_pred):
    """MAPE metrics"""
    mape_train = mean_absolute_percentage_error(train_true, train_pred)
    mape_test = mean_absolute_percentage_error(test_true, test_pred)
    dict_eval = {"MAPE_train": mape_train, "MAPE_test": mape_test}
    return dict_eval


def train_test_data(data, split_test):
    """Train test split"""
    train, test = train_test_split(data, test_size=split_test)  # splite by dates
    return {"train": train, "test": test}


def eval_model_train(data, list_algo, split_test=0.2):
    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
    data = data.sort_values("date")
    """Using only train and test data"""
    data_dic = train_test_data(data, split_test)
    train = data_dic["train"]
    test = data_dic["test"]
    # convert to date time formate
#     train["date"] = pd.to_datetime(train["date"], format="%Y-%m-%d")
#     test["date"] = pd.to_datetime(test["date"], format="%Y-%m-%d")
    model_dict = {}
    col_names = train.columns
    # work on algo one by one
    for algo in list_algo:
        if algo == Prophet:
            # work on profet part
            for y_indx in range(1, len(train.columns)):
                # item key for model_dict
                item_name = col_names[y_indx]

                # train data processing as pere NP formate
                tsdf_train = train.iloc[:, [0, y_indx]]
                tsdf_train.columns = ["ds", "y"]

                # test data processing as pere NP formate
                tsdf_test = test.iloc[:, [0, y_indx]]
                tsdf_test.columns = ["ds", "y"]

                # model traning
                mod = Prophet()
                mod.fit(tsdf_train)
                model_dict[item_name] = mod

                # train and test pred and true values
                pred_train = mod.predict(tsdf_train)
                pred_test = mod.predict(tsdf_test)
                train_true = pred_train.iloc[:, 1]
                train_pred = pred_train.iloc[:, 2]
                test_true = pred_test.iloc[:, 1]
                test_pred = pred_test.iloc[:, 2]

                # model evaluation metrics
                dict_eval = eval_model(train_true, train_pred, test_true, test_pred)
                eval_item_metrics[item_name] = dict_eval
            
        elif algo == SARIMAX:
            # work on auto arima part
            for y_indx in range(1, len(train.columns)):
                # item key for model_dict
                item_name = col_names[y_indx]

                tsdf_train = train.iloc[:, [0, y_indx]]
                tsdf_train.set_index("date", inplace=True)
                tsdf_test = test.iloc[:, [0, y_indx]]
                tsdf_test.set_index("date", inplace=True)

                # model traning
                mod = SARIMAX(tsdf_train, order=(1, 1, 1))#, seasonal_order=(1, 1, 1, 7))
                result = mod.fit()
                model_dict[item_name] = result

                # train and test pred and true values
                pred_train = result.predict(start=tsdf_train.index[0], end=tsdf_train.index[-1])
#                 pred_test = result.predict(start=tsdf_test.index[0], end=tsdf_test.index[-1])
#                 train_true = pred_train.iloc[:, 1]
#                 train_pred = pred_train.iloc[:, 2]
#                 test_true = pred_test.iloc[:, 1]
#                 test_pred = pred_test.iloc[:, 2]

#                 # model evaluation metrics
#                 dict_eval = eval_model(train_true, train_pred, test_true, test_pred)
#                 eval_item_metrics[item_name] = dict_eval

        

    return tsdf_train, pred_train#model_dict


In [74]:
q,w = eval_model_train(data, [SARIMAX], split_test=0.2)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [79]:
q

Unnamed: 0_level_0,sales_50
date,Unnamed: 1_level_1
2013-01-01,30
2013-01-01,30
2013-01-01,30
2013-01-01,30
2013-01-01,30
...,...
2016-10-28,71
2016-10-28,71
2016-10-28,71
2016-10-28,71


In [76]:
w

date
2013-01-01     0.0
2013-01-01    30.0
2013-01-01    30.0
2013-01-01    30.0
2013-01-01    30.0
              ... 
2016-10-27    50.0
2016-10-27    50.0
2016-10-27    50.0
2016-10-27    50.0
2016-10-28    50.0
Name: predicted_mean, Length: 22337, dtype: float64

In [86]:
len(list(q.index))

22346

In [88]:
data

Unnamed: 0,date,sales_1,sales_2,sales_3,sales_4,sales_5,sales_6,sales_7,sales_8,sales_9,...,sales_41,sales_42,sales_43,sales_44,sales_45,sales_46,sales_47,sales_48,sales_49,sales_50
0,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
11688,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
14610,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
10227,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
16071,2013-01-01,13,33,15,10,11,31,25,33,18,...,6,21,22,20,37,30,17,21,18,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24491,2017-01-31,17,37,28,10,8,36,33,53,26,...,17,24,34,23,56,43,13,34,18,54
23499,2017-01-31,17,37,28,10,8,36,33,53,26,...,17,24,34,23,56,43,13,34,18,54
26661,2017-01-31,17,37,28,10,8,36,33,53,26,...,17,24,34,23,56,43,13,34,18,54
26258,2017-01-31,17,37,28,10,8,36,33,53,26,...,17,24,34,23,56,43,13,34,18,54


In [None]:
2013-01-01 n 2014-03-07

In [None]:
2014-03-08 n 2017-01-31

In [24]:
# data.sort_values("date", axis=1)
d = data.sort_values("date")

In [25]:
train_test_split(d, test_size=0.2)

[             date  sales_1  sales_2  sales_3  sales_4  sales_5  sales_6  \
 0      2013-01-01       13       33       15       10       11       31   
 11688  2013-01-01       13       33       15       10       11       31   
 14610  2013-01-01       13       33       15       10       11       31   
 10227  2013-01-01       13       33       15       10       11       31   
 16071  2013-01-01       13       33       15       10       11       31   
 ...           ...      ...      ...      ...      ...      ...      ...   
 17467  2016-10-28       18       67       37       31       26       59   
 14545  2016-10-28       18       67       37       31       26       59   
 7240   2016-10-28       18       67       37       31       26       59   
 23311  2016-10-28       18       67       37       31       26       59   
 5779   2016-10-28       18       67       37       31       26       59   
 
        sales_7  sales_8  sales_9  ...  sales_41  sales_42  sales_43  sales_44  \
 0  

In [21]:
eval_model_train(data, [Prophet], split_test=0.2)

12:09:09 - cmdstanpy - INFO - Chain [1] start processing
12:09:15 - cmdstanpy - INFO - Chain [1] done processing
12:09:23 - cmdstanpy - INFO - Chain [1] start processing
12:09:35 - cmdstanpy - INFO - Chain [1] done processing
12:09:42 - cmdstanpy - INFO - Chain [1] start processing
12:09:50 - cmdstanpy - INFO - Chain [1] done processing
12:09:57 - cmdstanpy - INFO - Chain [1] start processing
12:10:07 - cmdstanpy - INFO - Chain [1] done processing
12:10:15 - cmdstanpy - INFO - Chain [1] start processing
12:10:21 - cmdstanpy - INFO - Chain [1] done processing
12:10:29 - cmdstanpy - INFO - Chain [1] start processing
12:10:32 - cmdstanpy - INFO - Chain [1] done processing
12:10:40 - cmdstanpy - INFO - Chain [1] start processing
12:10:55 - cmdstanpy - INFO - Chain [1] done processing
12:11:02 - cmdstanpy - INFO - Chain [1] start processing
12:11:14 - cmdstanpy - INFO - Chain [1] done processing
12:11:22 - cmdstanpy - INFO - Chain [1] start processing
12:11:32 - cmdstanpy - INFO - Chain [1]

{'sales_1': <prophet.forecaster.Prophet at 0x1b132bbe4c0>,
 'sales_2': <prophet.forecaster.Prophet at 0x1b0a5367430>,
 'sales_3': <prophet.forecaster.Prophet at 0x1b132cb3370>,
 'sales_4': <prophet.forecaster.Prophet at 0x1b13453b760>,
 'sales_5': <prophet.forecaster.Prophet at 0x1b0a35570a0>,
 'sales_6': <prophet.forecaster.Prophet at 0x1b0f1ebc220>,
 'sales_7': <prophet.forecaster.Prophet at 0x1b0f1ebc850>,
 'sales_8': <prophet.forecaster.Prophet at 0x1b0a3ce4400>,
 'sales_9': <prophet.forecaster.Prophet at 0x1b132661ee0>,
 'sales_10': <prophet.forecaster.Prophet at 0x1b12fc41760>,
 'sales_11': <prophet.forecaster.Prophet at 0x1b13453bee0>,
 'sales_12': <prophet.forecaster.Prophet at 0x1b0a8d61d60>,
 'sales_13': <prophet.forecaster.Prophet at 0x1b1328c88b0>,
 'sales_14': <prophet.forecaster.Prophet at 0x1b132bbe580>,
 'sales_15': <prophet.forecaster.Prophet at 0x1b1328d8190>,
 'sales_16': <prophet.forecaster.Prophet at 0x1b13453fbe0>,
 'sales_17': <prophet.forecaster.Prophet at 0x1b1

In [None]:
"""Performance analysis of model by using evaluation metrics i.e. MAPE
by building profet model for each item i.e. 50 models"""
import pandas as pd
from pmdarima.model_selection import train_test_split
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

eval_item_metrics = {}


def eval_model(train_true, train_pred, test_true, test_pred):
    """MAPE metrics"""
    mape_train = mean_absolute_percentage_error(train_true, train_pred)
    mape_test = mean_absolute_percentage_error(test_true, test_pred)
    dict_eval = {"MAPE_train": mape_train, "MAPE_test": mape_test}
    return dict_eval


def train_test_data(data, split_test):
    """Train test split"""
    train, test = train_test_split(data, test_size=split_test)  # splite by dates
    return {"train": train, "test": test}


def eval_model_train(data, list_algo, split_test=0.2):
    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
    data = data.sort_values("date")
    """Using only train and test data"""
    data_dic = train_test_data(data, split_test)
    train = data_dic["train"]
    test = data_dic["test"]
    # convert to date time formate
#     train["date"] = pd.to_datetime(train["date"], format="%Y-%m-%d")
#     test["date"] = pd.to_datetime(test["date"], format="%Y-%m-%d")
    model_dict = {}
    col_names = train.columns
    # work on algo one by one
    for algo in list_algo:
        if algo == Prophet:
            # work on profet part
            for y_indx in range(1, len(train.columns)):
                # item key for model_dict
                item_name = col_names[y_indx]

                # train data processing as pere NP formate
                tsdf_train = train.iloc[:, [0, y_indx]]
                tsdf_train.columns = ["ds", "y"]

                # test data processing as pere NP formate
                tsdf_test = test.iloc[:, [0, y_indx]]
                tsdf_test.columns = ["ds", "y"]

                # model traning
                mod = Prophet()
                mod.fit(tsdf_train)
                model_dict[item_name] = mod

                # train and test pred and true values
                pred_train = mod.predict(tsdf_train)
                pred_test = mod.predict(tsdf_test)
                train_true = pred_train.iloc[:, 1]
                train_pred = pred_train.iloc[:, 2]
                test_true = pred_test.iloc[:, 1]
                test_pred = pred_test.iloc[:, 2]

                # model evaluation metrics
                dict_eval = eval_model(train_true, train_pred, test_true, test_pred)
                eval_item_metrics[item_name] = dict_eval
            
        elif algo == SARIMAX:
            # work on auto arima part
            for y_indx in range(1, len(train.columns)):
                # item key for model_dict
                item_name = col_names[y_indx]

                tsdf_train = train.iloc[:, [0, y_indx]]
                tsdf_train.set_index("date", inplace=True)
                tsdf_test = test.iloc[:, [0, y_indx]]
                tsdf_test.set_index("date", inplace=True)

                # model traning
                # design network
                model = Sequential()
                model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
                model.add(Dense(1))
                model.compile(loss='mae', optimizer='adam')
                # fit network
                history = model.fit(train_X, train_y, epochs=300, batch_size=50, validation_data=(test_X, test_y), verbose=2, shuffle=False)
                a =  history.history['loss']
                b = history.history['val_loss']

                # train and test pred and true values
#                 pred_train = result.predict(start=tsdf_train.index[0], end=tsdf_train.index[-1])
#                 pred_test = result.predict(start=tsdf_test.index[0], end=tsdf_test.index[-1])
#                 train_true = pred_train.iloc[:, 1]
#                 train_pred = pred_train.iloc[:, 2]
#                 test_true = pred_test.iloc[:, 1]
#                 test_pred = pred_test.iloc[:, 2]

#                 # model evaluation metrics
#                 dict_eval = eval_model(train_true, train_pred, test_true, test_pred)
#                 eval_item_metrics[item_name] = dict_eval

        

    return tsdf_train, tsdf_test#model_dict
