# Preprocessed data

In [1]:
"""In pre process first grouping by items i.e. 50 then by
dropping single-valued columns, getting the final X and Y dataframe
for current data no need of null or nan value handling
but we can add fun to deal with that in case data change"""
import os
import pandas as pd


def pre_pros(raw_df):
    """Reshape raw sales rows into one daily column per item, persist and retrain.

    The raw frame is expected to carry ``date``, ``item`` and ``sales`` columns.
    Sales are summed per (date, item), widened to ``sales_<item>`` columns and
    resampled to a daily frequency (days without rows sum to 0).  The result is
    appended to the processed CSV (created on first use) and the models are
    retrained on the complete history via ``eval_model_train``.

    Note: ``eval_model_train``, ``SARIMAX`` and ``Prophet`` are defined in later
    cells of this notebook and must exist before this function is called.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Long-format sales data with ``date`` (``YYYY-MM-DD``), ``item``, ``sales``.

    Returns
    -------
    pandas.DataFrame
        The full processed history that was written to disk (returned on the
        first run as well as on later appends).

    Raises
    ------
    ValueError
        If ``raw_df`` has no rows.
    """
    save_path = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"

    if raw_df.empty:
        raise ValueError("raw_df has no rows to preprocess")

    # Align every item on the date itself instead of on row position, so a
    # missing day for one item cannot shift or break the other columns.
    wide = raw_df.pivot_table(
        index="date", columns="item", values="sales", aggfunc="sum"
    )
    wide.columns = [f"sales_{item}" for item in wide.columns]
    df_new = wide.reset_index()
    df_new["date"] = pd.to_datetime(df_new["date"], format="%Y-%m-%d")
    df_new = df_new.resample("D", on="date").sum().reset_index()

    # exist_ok: the folder may already exist even when the CSV does not.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    if os.path.exists(save_path):
        df_old = pd.read_csv(save_path)
        # Older files may hold 'YYYY-MM-DD' and 'YYYY-MM-DD 00:00:00' side by
        # side; keep only the date part so the column is uniformly datetime
        # and is written back in a single format.
        df_old["date"] = pd.to_datetime(
            df_old["date"].astype(str).str[:10], format="%Y-%m-%d"
        )
        df_final = pd.concat([df_old, df_new], axis=0, ignore_index=True)
    else:
        df_final = df_new

    df_final.to_csv(save_path, index=False)
    eval_model_train(df_final, [SARIMAX, Prophet], split_test=0.2)
    return df_final



In [2]:
# raw_df = r"D:\MLOPs POC\python_files\samp_data\data_2013_to_2016.csv"
# raw_df = pd.read_csv(raw_df)

In [3]:
# pre_pros(raw_df)

# Model evaluation on split data

In [4]:
"""A product-wise model (Prophet or SARIMAX) is built on the whole data and its MAPE metrics are recorded.
All the models and metrics data are stored with the help of mlflow"""
import sys
import pandas as pd
import mlflow
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from datetime import datetime

# sys.path.insert(0, "./Model")
# from model_eval import (
#     eval_item_metrics,
# )  # variable where all the MAPE values availble wrt each item  # d


def _fit_and_log_final_model(mod_name, data, prod_name):
    """Fit one product's model on the full history and log it to the active MLflow run."""
    if mod_name == "Prophet":
        # Prophet expects the columns to be called "ds" (date) and "y" (target).
        data_prod = data[["date", prod_name]].copy()
        data_prod.columns = ["ds", "y"]
        fitted = Prophet()
        fitted.fit(data_prod)
        mlflow.prophet.log_model(fitted, f"{prod_name}_model")
    else:
        # SARIMAX takes the series indexed by date.
        data_prod = data[["date", prod_name]].set_index("date")
        fitted = SARIMAX(data_prod, order=(1, 1, 1)).fit()
        mlflow.statsmodels.log_model(fitted, f"{prod_name}_model")
    return fitted


def train_final_model(data, dict_mod_met):
    """Train the selected model for every product on the whole data set.

    One MLflow run named ``<product>_run`` is created per product; the fitted
    model is logged under ``<product>_model`` together with the ``MAPE_train``
    and ``MAPE_test`` values taken from ``dict_mod_met``.

    Parameters
    ----------
    data : pandas.DataFrame
        A ``date`` column plus one column per product.  The caller's frame is
        not modified.
    dict_mod_met : dict
        ``{product: {"best_model": "Prophet" | "SARIMAX",
        "MAPE_train": float, "MAPE_test": float}}``.

    Returns
    -------
    dict
        ``{product: fitted model}`` (a fitted ``Prophet`` object or the
        SARIMAX fit result).

    Raises
    ------
    ValueError
        If ``best_model`` of a product is neither ``"Prophet"`` nor ``"SARIMAX"``.
    """
    # assign() returns a new frame, so the caller's "date" column stays untouched.
    data = data.assign(
        date=pd.to_datetime(data["date"], format="%Y-%m-%d")
    ).sort_values("date")

    final_model_dict = {}
    for prod_name in [col for col in data.columns if col != "date"]:
        dict_eval = dict_mod_met[prod_name]
        mod_name = dict_eval["best_model"]
        if mod_name not in ("Prophet", "SARIMAX"):
            raise ValueError(
                f"Unsupported best_model {mod_name!r} for product {prod_name!r}"
            )

        with mlflow.start_run(run_name=f"{prod_name}_run"):
            final_model_dict[prod_name] = _fit_and_log_final_model(
                mod_name, data, prod_name
            )
            # Metrics come from the earlier train/test evaluation of this product.
            mlflow.log_metric("MAPE_train", dict_eval["MAPE_train"])
            mlflow.log_metric("MAPE_test", dict_eval["MAPE_test"])

    return final_model_dict


In [5]:
"""Performance analysis of model by using evaluation metrics i.e. MAPE
by building Prophet and SARIMAX models for each item, i.e. 50 items"""
import pandas as pd
from pmdarima.model_selection import train_test_split
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Module-level store filled by eval_model_train:
# algorithm name -> item name -> {"MAPE_train": ..., "MAPE_test": ...}.
eval_item_metrics = {}


def eval_model(train_true, train_pred, test_true, test_pred):
    """Compute MAPE on the train and on the test split.

    Returns a dict with the keys ``MAPE_train`` and ``MAPE_test``.
    """
    return {
        "MAPE_train": mean_absolute_percentage_error(train_true, train_pred),
        "MAPE_test": mean_absolute_percentage_error(test_true, test_pred),
    }


def train_test_data(data, split_test):
    """Split ``data`` into train and test parts via ``train_test_split``.

    ``split_test`` is forwarded as ``test_size``; the two parts are returned
    in a dict under the keys ``train`` and ``test``.
    """
    first_part, second_part = train_test_split(data, test_size=split_test)
    return dict(train=first_part, test=second_part)


def _prophet_item_metrics(train, test, item_name):
    """Fit Prophet on one item's training rows and return its train/test MAPE."""
    # Prophet expects the columns to be called "ds" (date) and "y" (target).
    tsdf_train = train[["date", item_name]].copy()
    tsdf_train.columns = ["ds", "y"]
    tsdf_test = test[["date", item_name]].copy()
    tsdf_test.columns = ["ds", "y"]

    mod = Prophet()
    mod.fit(tsdf_train)

    # Compare the actual sales with the point forecast "yhat".  Positional
    # columns of the forecast frame are "trend" and "yhat_lower", not y / yhat.
    train_pred = mod.predict(tsdf_train)["yhat"].values
    test_pred = mod.predict(tsdf_test)["yhat"].values
    return eval_model(
        tsdf_train["y"].values, train_pred, tsdf_test["y"].values, test_pred
    )


def _sarimax_item_metrics(train, test, item_name):
    """Fit SARIMAX(1, 1, 1) on one item's training rows and return its train/test MAPE."""
    tsdf_train = train[["date", item_name]].set_index("date")
    tsdf_test = test[["date", item_name]].set_index("date")

    result = SARIMAX(tsdf_train, order=(1, 1, 1)).fit()

    pred_train = result.predict(start=tsdf_train.index[0], end=tsdf_train.index[-1])
    pred_test = result.predict(start=tsdf_test.index[0], end=tsdf_test.index[-1])
    return eval_model(tsdf_train, pred_train, tsdf_test, pred_test)


def eval_model_train(data_upd, list_algo, split_test=0.2):
    """Evaluate each algorithm per item, pick the best one and train final models.

    The data is sorted by date and split into train/test.  Every algorithm in
    ``list_algo`` is fitted per item on the train part and scored with MAPE on
    both parts.  For each item the algorithm with the lowest ``MAPE_test`` is
    selected (ties go to the algorithm listed first) and the final models are
    trained on the whole data through ``train_final_model``.

    Parameters
    ----------
    data_upd : pandas.DataFrame
        A ``date`` column plus one column per item.  Not modified.
    list_algo : list
        Model classes to compare; ``Prophet`` and ``SARIMAX`` are supported.
    split_test : float, default 0.2
        Test share passed to ``train_test_data``.

    Returns
    -------
    dict
        ``{item: fitted final model}`` as returned by ``train_final_model``.

    Raises
    ------
    ValueError
        If ``list_algo`` is empty or contains an unsupported algorithm.
    """
    if not list_algo:
        raise ValueError("list_algo must contain at least one algorithm")

    data = data_upd.copy()
    data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
    data = data.sort_values("date")

    data_dic = train_test_data(data, split_test)
    train = data_dic["train"]
    test = data_dic["test"]
    item_names = [col for col in train.columns if col != "date"]

    # Collect the metrics of THIS call only, so algorithms evaluated in an
    # earlier call cannot leak into the best-model selection below.
    metrics_by_algo = {}
    for algo in list_algo:
        if algo is Prophet:
            mod_name, item_metrics = "Prophet", _prophet_item_metrics
        elif algo is SARIMAX:
            mod_name, item_metrics = "SARIMAX", _sarimax_item_metrics
        else:
            raise ValueError(f"Unsupported algorithm: {algo!r}")
        metrics_by_algo[mod_name] = {
            item_name: item_metrics(train, test, item_name)
            for item_name in item_names
        }

    # Keep the module-level store in sync for code that inspects it afterwards.
    eval_item_metrics.clear()
    eval_item_metrics.update(metrics_by_algo)

    # Best model per item = lowest test MAPE.
    dict_best_mod = {}
    for prod in item_names:
        best_name = min(
            metrics_by_algo,
            key=lambda name: metrics_by_algo[name][prod]["MAPE_test"],
        )
        dict_best_mod[prod] = {
            "best_model": best_name,
            **metrics_by_algo[best_name][prod],
        }

    # Final model development and storing with mlflow; the local copy is
    # handed over so the caller's frame is never touched.
    return train_final_model(data, dict_best_mod)


In [6]:
# data = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"
# data = pd.read_csv(data)

In [7]:
# eval_model_train(data, [SARIMAX, Prophet], split_test=0.2)

# Model selection: compare with the last best model

In [8]:
"""Fetch the best model wrt each product by using metrics i.e. MAPE from mlflow track data"""
import mlflow


def best_model():
    """Return, for every MLflow run name, the id of the run with the lowest test MAPE.

    Runs are read with ``mlflow.search_runs()``.  Runs without a run name or
    without a ``MAPE_test`` metric are ignored.  When several runs of the same
    name share the lowest MAPE, the one listed first by ``search_runs`` wins.

    Returns
    -------
    dict
        ``{run_name: run_id}``; empty when no usable run exists.
    """
    name_col = "tags.mlflow.runName"
    metric_col = "metrics.MAPE_test"

    list_run = mlflow.search_runs()
    if (
        list_run.empty
        or name_col not in list_run.columns
        or metric_col not in list_run.columns
    ):
        return {}

    # Sort ascending by MAPE and keep the first row per run name.  Selecting
    # by label 0 after sorting would return the pre-sort first row instead.
    best_rows = (
        list_run.dropna(subset=[name_col, metric_col])
        .sort_values(metric_col, kind="mergesort")  # stable: keeps listing order on ties
        .drop_duplicates(subset=name_col, keep="first")
    )
    #  delete rest of run id in future

    return dict(zip(best_rows[name_col], best_rows["run_id"]))


In [9]:
# mlflow.search_runs()

In [10]:
# run_model_dict= best_model()

# forecasting with selected model

In [11]:
"""First forecasting dataframe is created as per input argument provided then
by using the mlflow flavor (Prophet or statsmodels) the model is fetched and the forecasted data is returned in
standard dataframe format"""
from datetime import timedelta, datetime
import calendar
import pandas as pd
import mlflow
import mlflow.pyfunc


def _next_months_frame(last_date, next_month_count):
    """Build a ``ds`` frame with every day of the ``next_month_count`` months after ``last_date``."""
    first_of_month = pd.Timestamp(year=last_date.year, month=last_date.month, day=1)
    start_ = first_of_month + pd.DateOffset(months=1)
    end_ = start_ + pd.DateOffset(months=next_month_count) - pd.Timedelta(days=1)
    return pd.DataFrame({"ds": pd.date_range(start=start_, end=end_, freq="D")})


def _predict_future(pyfunc_model, data_next_month):
    """Forecast ``data_next_month`` with a loaded pyfunc model and return a 1-D array."""
    loader = pyfunc_model.metadata.flavors["python_function"]["loader_module"]
    if loader == "mlflow.prophet":
        return pyfunc_model.predict(data_next_month)["yhat"].values
    if loader == "mlflow.statsmodels":
        # The statsmodels flavor takes a one-row frame with "start" and "end".
        window = pd.DataFrame(
            {"start": data_next_month.iloc[0, :], "end": data_next_month.iloc[-1, :]}
        )
        return pyfunc_model.predict(window).values
    raise ValueError(f"Unsupported model flavor: {loader!r}")


def forecast_with_run_model(
    run_model_dict, latest_proc_data, next_month_count, n_items=50
):
    """Forecast daily sales for the months following the latest processed date.

    Parameters
    ----------
    run_model_dict : dict
        ``{run_name: run_id}`` where ``run_name`` is ``<item>_run`` and the
        model was logged under ``<item>_model`` in that run.
    latest_proc_data : pandas.DataFrame
        Processed data with a ``date`` column; its last row defines the month
        after which forecasting starts.  The frame is not modified.
    next_month_count : int
        Number of whole calendar months to forecast (>= 1).
    n_items : int, default 50
        Number of ``sales_<i>`` columns in the result (``i`` = 1..n_items).
        Items without a model are returned as NaN columns.

    Returns
    -------
    pandas.DataFrame
        ``date`` plus ``sales_<i>_forecast`` columns with rounded forecasts.

    Raises
    ------
    ValueError
        If ``next_month_count`` < 1, ``latest_proc_data`` is empty, or a model
        has an unsupported flavor.
    """
    if next_month_count < 1:
        raise ValueError("next_month_count must be at least 1")
    if len(latest_proc_data) == 0:
        raise ValueError("latest_proc_data has no rows")

    # Parse into a new Series so the caller's frame keeps its original column.
    last_date = pd.to_datetime(latest_proc_data["date"]).iloc[-1]
    data_next_month = _next_months_frame(last_date, next_month_count)

    # fetch each best model from dict and forecast
    run_suffix = "_run"
    dict_forecast = {}
    for name, run_id in run_model_dict.items():
        item_name = name[: -len(run_suffix)] if name.endswith(run_suffix) else name
        pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/{item_name}_model")
        dict_forecast[item_name] = _predict_future(pyfunc_model, data_next_month)

    # Single frame with each item's sales for the forecast period, built once.
    final_forecast = pd.concat(
        [
            data_next_month["ds"].rename("date"),
            pd.DataFrame(dict_forecast, index=data_next_month.index),
        ],
        axis=1,
    )

    # Standard layout: fixed column order, rounded values, "_forecast" suffix.
    item_ids = range(1, n_items + 1)
    final_forecast = final_forecast.reindex(
        ["date"] + [f"sales_{i}" for i in item_ids], axis=1
    ).round()
    final_forecast.columns = ["date"] + [f"sales_{i}_forecast" for i in item_ids]
    return final_forecast


In [12]:
# forecast_with_run_model(run_model_dict, data, 1) # profet working 

In [13]:
# forecast_with_run_model(run_model_dict, data, 1) # sarimax working 

# final pipeline

In [14]:
"""It provides input from user and follow pipe line flow for loading input files,
data processing, storing the processed and updated data, model building, best model selection
and forecasting with an API link"""
import os
import sys
import matplotlib

# matplotlib.use("Agg")
# import matplotlib.pyplot as plt
# import pandas as pd
# from flask import Flask, render_template, request

# sys.path.insert(0, "./Model_selection")
# from model_selection import best_model

# sys.path.insert(0, "./Forecasting")
# from forecasting import forecast_with_run_model

# sys.path.insert(0, "./Preprocess")
# from preprocess import pre_pros

# app = Flask(
#     __name__, static_folder=""
# )  # , static_folder=r"D:\MLOPs POC\python_files\static"
# pick_folder = r"D:\MLOPs POC\python_files\static\forecast.png"
# # app.config["UPLODE_FOLDER"] = pick_folder


# @app.route("/", methods=["GET", "POST"])
def forecast_data(month_count=1):
    """Run the whole pipeline and return the forecast for the coming month(s).

    Steps:
    1. If no processed CSV exists yet, build it from the raw sample data
       with ``pre_pros`` (which also trains the models).
    2. If a new-data file exists and its latest date is later than the latest
       processed date, append it with ``pre_pros`` (which retrains the models).
    3. Look up the best MLflow run per product and forecast ``month_count``
       months after the latest processed date.

    The Flask request handling and the plotting of the forecast are currently
    disabled; a product-wise result still needs to be added.

    Parameters
    ----------
    month_count : int, default 1
        Number of calendar months to forecast.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``forecast_with_run_model``.
    """
    path_ = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"
    raw_df_path = r"D:\MLOPs POC\python_files\samp_data\data_2013_to_2016.csv"
    # need to mention location of new data
    new_path = r"D:\MLOPs POC\python_files\samp_data\test\next_month_data.csv"

    def _latest_rows():
        # Only the tail is needed to find the last processed date; copy() so
        # downstream code never works on a view of the loaded frame.
        return pd.read_csv(path_).tail(3).copy()

    def _last_date(frame):
        # The files hold 'YYYY-MM-DD' or 'YYYY-MM-DD 00:00:00'; compare on the
        # date part only so equal days are recognised as equal.
        return pd.to_datetime(
            frame["date"].astype(str).str[:10], format="%Y-%m-%d"
        ).max()

    if not os.path.exists(path_):
        pre_pros(pd.read_csv(raw_df_path))
    data_df = _latest_rows()

    if os.path.exists(new_path):
        new_path_df = pd.read_csv(new_path)
        # Append only data that is newer than what is already processed;
        # otherwise the same month would be appended again on every call.
        if _last_date(new_path_df) > _last_date(data_df):
            pre_pros(new_path_df)
            data_df = _latest_rows()

    run_model_dict = best_model()
    forecasted_month = forecast_with_run_model(run_model_dict, data_df, month_count)

    # TODO: add code for getting a product-wise result
    return forecasted_month

#     return render_template(
#         "index.html",
#         forecast=forecasted_month,
#         # plot_graph=pick_store,
#     )


# if __name__ == "__main__":
#     app.run(debug=True)


In [15]:
forecast_data() # no updated or new data, i.e. the first run of the pipeline

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
21:17:20 - cmdstanpy - INFO - Chain [1] start processing
21:17:20 - cmdstanpy - INFO - Chain [1] done processing
21:17:20 - cmdstanpy - INFO - Chain [1] start processing
21:17:21 - cmdstanpy - INFO - Chain [1] done processing
21:17:21 - cmdstanpy - INFO - Chain [1] start processing
21:17:21 - cmdstanpy - INFO - Chain [1] done processing
21:17:22 - cmdstanpy 

21:17:49 - cmdstanpy - INFO - Chain [1] start processing
21:17:49 - cmdstanpy - INFO - Chain [1] done processing
21:17:50 - cmdstanpy - INFO - Chain [1] start processing
21:17:50 - cmdstanpy - INFO - Chain [1] done processing
21:17:51 - cmdstanpy - INFO - Chain [1] start processing
21:17:51 - cmdstanpy - INFO - Chain [1] done processing
21:17:52 - cmdstanpy - INFO - Chain [1] start processing
21:17:52 - cmdstanpy - INFO - Chain [1] done processing
21:17:52 - cmdstanpy - INFO - Chain [1] start processing
21:17:53 - cmdstanpy - INFO - Chain [1] done processing
21:17:53 - cmdstanpy - INFO - Chain [1] start processing
21:17:53 - cmdstanpy - INFO - Chain [1] done processing
21:17:54 - cmdstanpy - INFO - Chain [1] start processing
21:17:54 - cmdstanpy - INFO - Chain [1] done processing
21:17:55 - cmdstanpy - INFO - Chain [1] start processing
21:17:55 - cmdstanpy - INFO - Chain [1] done processing
21:17:56 - cmdstanpy - INFO - Chain [1] start processing
21:17:56 - cmdstanpy - INFO - Chain [1]

21:18:19 - cmdstanpy - INFO - Chain [1] start processing
21:18:19 - cmdstanpy - INFO - Chain [1] done processing
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_loc

  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))


  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))
  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))


Unnamed: 0,date,sales_1_forecast,sales_2_forecast,sales_3_forecast,sales_4_forecast,sales_5_forecast,sales_6_forecast,sales_7_forecast,sales_8_forecast,sales_9_forecast,...,sales_41_forecast,sales_42_forecast,sales_43_forecast,sales_44_forecast,sales_45_forecast,sales_46_forecast,sales_47_forecast,sales_48_forecast,sales_49_forecast,sales_50_forecast
0,2017-01-01,21.0,57.0,34.0,21.0,17.0,57.0,53.0,74.0,46.0,...,21.0,33.0,48.0,26.0,76.0,56.0,20.0,48.0,27.0,61.0
1,2017-01-02,13.0,36.0,21.0,13.0,11.0,36.0,32.0,46.0,29.0,...,14.0,21.0,31.0,16.0,47.0,35.0,13.0,30.0,17.0,38.0
2,2017-01-03,16.0,43.0,25.0,16.0,13.0,44.0,39.0,56.0,35.0,...,16.0,25.0,36.0,20.0,57.0,41.0,16.0,36.0,20.0,46.0
3,2017-01-04,16.0,43.0,25.0,16.0,13.0,44.0,39.0,56.0,35.0,...,16.0,25.0,37.0,19.0,57.0,42.0,16.0,36.0,20.0,47.0
4,2017-01-05,17.0,46.0,28.0,18.0,15.0,46.0,43.0,60.0,39.0,...,17.0,27.0,39.0,22.0,61.0,46.0,17.0,39.0,23.0,50.0
5,2017-01-06,19.0,50.0,30.0,18.0,15.0,50.0,46.0,64.0,41.0,...,19.0,29.0,42.0,23.0,66.0,49.0,18.0,42.0,23.0,53.0
6,2017-01-07,20.0,53.0,32.0,20.0,16.0,54.0,51.0,69.0,44.0,...,20.0,31.0,45.0,25.0,69.0,51.0,19.0,45.0,25.0,58.0
7,2017-01-08,21.0,57.0,35.0,21.0,17.0,57.0,53.0,74.0,47.0,...,20.0,34.0,48.0,26.0,77.0,56.0,20.0,48.0,27.0,61.0
8,2017-01-09,13.0,36.0,22.0,13.0,11.0,36.0,32.0,46.0,29.0,...,12.0,22.0,31.0,16.0,47.0,35.0,13.0,30.0,17.0,39.0
9,2017-01-10,16.0,43.0,26.0,16.0,13.0,43.0,39.0,56.0,35.0,...,15.0,26.0,35.0,20.0,57.0,42.0,15.0,36.0,20.0,46.0


In [16]:
# Path of the raw next-month sales CSV that is loaded and compared in the cells below.
new_path = r"D:\MLOPs POC\python_files\samp_data\test\next_month_data.csv"

In [17]:
# Path of the processed sales CSV (the file pre_pros writes to).
pa = r"D:\MLOPs POC\python_files\prepros_sales_data\updated_sales_data.csv"

In [18]:
import pandas as pd 

In [20]:
# Load the raw new-data file (a1) and the processed file (a2) for a manual comparison.
a1 = pd.read_csv(new_path)
a2 = pd.read_csv(pa)

In [32]:
# Does the last raw date equal the first 10 characters (YYYY-MM-DD) of the last processed date?
a1.iloc[-1, 0] == a2.iloc[-1, 0][:10]

True

In [30]:
# Inspect the first 11 characters of the last processed date to see what follows the date part.
a2.iloc[-1, 0][:11]

'2017-01-31 '

In [35]:
# Show the raw rows ordered by date.
a1.sort_values("date")

Unnamed: 0,date,store,item,sales
0,2017-01-01,1,1,19
899,2017-01-01,1,30,27
1519,2017-01-01,1,50,49
961,2017-01-01,1,32,32
155,2017-01-01,1,6,52
...,...,...,...,...
867,2017-01-31,1,28,74
836,2017-01-31,1,27,18
805,2017-01-31,1,26,24
1487,2017-01-31,1,48,34


In [37]:
# Keep only the raw rows that belong to item 1.
aa = a1.groupby("item").get_group(1)

In [38]:
# Display the item-1 rows selected in the previous cell.
aa

Unnamed: 0,date,store,item,sales
0,2017-01-01,1,1,19
1,2017-01-02,1,1,15
2,2017-01-03,1,1,10
3,2017-01-04,1,1,16
4,2017-01-05,1,1,14
5,2017-01-06,1,1,24
6,2017-01-07,1,1,14
7,2017-01-08,1,1,20
8,2017-01-09,1,1,18
9,2017-01-10,1,1,11
