In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import warnings
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import optuna
from collections import Counter
from statsmodels.tsa.stattools import adfuller, kpss
from tqdm import tqdm
from functools import partial, reduce
from datetime import timedelta
warnings.filterwarnings("ignore")

In [2]:
def read_dataset(file):
    """Load a tabular dataset from a CSV or Excel file.

    Args:
        file: Path to the file. The reader is chosen from the text after the
            last dot, case-insensitively: ``csv`` -> ``pd.read_csv``,
            ``xlsx`` -> ``pd.read_excel``.

    Returns:
        pandas.DataFrame with the file contents.

    Raises:
        ValueError: If the extension is neither ``csv`` nor ``xlsx``.
    """
    # Take the text after the LAST dot; splitting on the first dot picks the
    # wrong piece for paths such as "../data/x.csv" or "sales.v2.csv".
    extension = str(file).rsplit('.', 1)[-1].lower()
    if extension == 'csv':
        return pd.read_csv(file)
    if extension == 'xlsx':
        return pd.read_excel(file)
    raise ValueError(
        f"Unsupported file type '{extension}' for {file!r}; expected .csv or .xlsx"
    )

In [3]:
def make_train_test_splits(X, y, test_split,unique_len):
    """Split X and y chronologically into train and test parts.

    The cut position starts at ``int(len(X) * (1 - test_split))`` and is moved
    down to the nearest multiple of ``unique_len`` that is strictly below it
    (at least one step is always taken).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    cut = int(len(X) * (1 - test_split)) - 1
    # Keep stepping back until the cut falls on a multiple of unique_len.
    while cut % unique_len:
        cut -= 1
    return X[:cut], X[cut:], y[:cut], y[cut:]

In [4]:
def time_control_type(data,col):
    """Ensure ``data[col]`` holds datetimes, converting it in place if needed.

    Args:
        data: DataFrame to update (modified in place).
        col: Name of the timestamp column.

    Returns:
        The same DataFrame object, with ``col`` as a datetime column.
    """
    # The previous check (`str(dtype) not in '[ns]'`) tested whether the dtype
    # name was a substring of '[ns]', which is never the case, so the column
    # was re-parsed on every call. Check the dtype properly instead.
    if not pd.api.types.is_datetime64_any_dtype(data[col]):
        data[col] = pd.to_datetime(data[col])
    return data

In [5]:
def time_len_control(data,col):
    """Tell whether ``data[col]`` contains repeated values.

    Returns:
        True when the number of distinct values differs from the number of
        rows, False when every value occurs exactly once.
    """
    column = data[col]
    has_repeats = len(column.unique()) != len(column)
    return has_repeats

In [6]:
def auto_detect(df, selected_datetime):
    """Find columns that can serve as the series identifier of a panel.

    A column qualifies when (a) its number of distinct values equals
    ``rows // distinct timestamps``, (b) it has no nulls, and (c) every
    (column value, timestamp) pair occurs exactly once.

    Args:
        df: Input DataFrame (not modified).
        selected_datetime: Name of the timestamp column.

    Returns:
        List of qualifying column names, in column order. Empty when the
        frame is empty or the timestamp column contains nulls.
    """
    timestamps = df[selected_datetime]
    total_rows = len(timestamps)
    distinct_times = len(timestamps.unique())
    # Null timestamps disqualify every column; an empty frame would divide by zero.
    if distinct_times == 0 or timestamps.isnull().sum() != 0:
        return []
    expected_series = total_rows // distinct_times

    unique_series_id = []
    for col in df.columns:
        if len(df[col].unique()) != expected_series:
            continue
        if df[col].isnull().sum() != 0:
            continue
        # Compare the pairs themselves rather than a separator-less string
        # concatenation, which can collide ("1"+"12" == "11"+"2") and needed
        # a scratch column on the frame.
        pairs = pd.MultiIndex.from_arrays([df[col], timestamps])
        if pairs.is_unique:
            unique_series_id.append(col)
    return unique_series_id

In [7]:
def date_sort(data,col,col2):
    """Return ``data`` sorted ascending by ``col`` then ``col2`` with a fresh 0..n-1 index."""
    ordered = data.sort_values(by=[col, col2], ascending=[True, True])
    return ordered.reset_index(drop=True)

In [8]:
def date_column_info_pxexpress(data,num_cols,timestamp_column,unique_col):
    """Show one Plotly line chart per numeric column, stacked vertically.

    Args:
        data: DataFrame holding the rows of a single series.
        num_cols: List of column names to plot, one subplot each.
        timestamp_column: Column used for the x axis.
        unique_col: Series identifier, printed as a header before the figure.
    """
    n_panels = len(num_cols)
    fig = make_subplots(rows=n_panels, cols=1, subplot_titles=num_cols)
    print("Store : ",unique_col)
    for i, col in enumerate(num_cols):
        line_chart = px.line(data, x=timestamp_column, y=col)
        # Reuse the single trace built by Plotly Express inside the subplot grid.
        fig.add_trace(line_chart.data[0], row=i + 1, col=1)
    # `row` must be an integer subplot row; the list of column names was
    # passed here before, so the bottom x axis never received its title.
    fig.update_xaxes(title_text='Date', row=n_panels, col=1)
    # NOTE(review): the height is scaled by the number of DataFrame columns
    # (data.shape[1]), not by the number of subplots; kept as-is to preserve
    # the existing figure size. Confirm whether n_panels was intended.
    fig.update_layout(showlegend=False, height=150 * data.shape[1], width=1400)
    fig.show()

In [9]:
def date_column_info_pyplot(data, num_cols, timestamp_column, unique_col):
    """Show one matplotlib line chart per numeric column, stacked vertically.

    Args:
        data: DataFrame holding the rows of a single series.
        num_cols: List of column names to plot, one subplot each.
        timestamp_column: Column used for the x axis.
        unique_col: Series identifier, shown in the figure title.
    """
    if not num_cols:
        # Nothing to draw; a 0-row subplot grid cannot be created.
        return
    # squeeze=False always yields a 2-D axes array, so a single numeric column
    # no longer fails on `axes[i]` (plt.subplots returns a bare Axes otherwise).
    fig, axes = plt.subplots(len(num_cols), 1, figsize=(14, 4*len(num_cols)), squeeze=False)
    plt.suptitle(f"Store: {unique_col}")

    for ax, col in zip(axes[:, 0], num_cols):
        ax.plot(data[timestamp_column], data[col])
        ax.set_title(col)
        ax.set_xlabel("Date")
        ax.set_ylabel(col)
    plt.subplots_adjust(hspace=0.4)
    plt.show()

In [10]:
def date_engineering(data,col):
    """Add calendar features derived from the datetime column ``col``.

    Adds the string-typed columns ``Day``, ``Month``, ``Year``, ``DayOfWeek``,
    ``DayOfYear``, ``WeekOfYear`` and ``Quarter`` to ``data`` in place.

    Args:
        data: DataFrame to extend (modified in place).
        col: Name of a datetime column.

    Returns:
        The same DataFrame object with the new columns.
    """
    stamps = data[col].dt
    data['Day'] = stamps.day.astype(str)
    data['Month'] = stamps.month.astype(str)
    data['Year'] = stamps.year.astype(str)
    data['DayOfWeek'] = stamps.dayofweek.astype(str)
    data['DayOfYear'] = stamps.dayofyear.astype(str)
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # documented replacement (ISO 8601 week number).
    data['WeekOfYear'] = stamps.isocalendar().week.astype(str)
    data['Quarter'] = stamps.quarter.astype(str)
    return data

In [11]:
def frequency_detect(data, selected_datetime):
    """Detect the dominant sampling interval of a datetime column.

    The gaps between consecutive distinct timestamps (in order of first
    appearance, so the column should already be sorted) are measured in whole
    seconds and the most common gap is classified into a time unit.

    Args:
        data: DataFrame containing the timestamps.
        selected_datetime: Name of the datetime column.

    Returns:
        Tuple ``(time_type, frequency)``: ``time_type`` is one of 'years',
        'quarters', 'months', 'weeks', 'days', 'hours', 'minutes', 'seconds'
        or '' when the gap is below one second; ``frequency`` is the most
        common gap in seconds. ``('', 0)`` is returned when there are fewer
        than two distinct timestamps.
    """
    dates = pd.Series(data[selected_datetime].unique())
    if len(dates) < 2:
        # No gap can be measured; previously this raised IndexError.
        return '', 0
    # total_seconds() does not depend on the column's datetime resolution,
    # unlike converting the raw timedelta integer and dividing by 1e9.
    gaps = dates.diff().dropna().dt.total_seconds()
    frequencies = [int(gap) for gap in gaps]
    frequency = Counter(frequencies).most_common(1)[0][0]

    # (unit, seconds per unit), largest first. A gap that reaches a tier but
    # is not an exact multiple of it is labelled with the next smaller unit.
    tiers = (('years', 31536000), ('quarters', 7948800), ('months', 2592000),
             ('weeks', 604800), ('days', 86400), ('hours', 3600),
             ('minutes', 60), ('seconds', 1))
    time_type = ''
    for position, (unit, seconds) in enumerate(tiers):
        if frequency >= seconds:
            if frequency % seconds == 0:
                time_type = unit
            else:
                time_type = tiers[position + 1][0]
            break
    return time_type,frequency

In [12]:
def ADF_Test(data, target, selected_datetime_feature, SignificanceLevel=.05):
    """Run the Augmented Dickey-Fuller test on one column and report stationarity.

    Args:
        data: DataFrame containing the target and the datetime column.
        target: Name of the column to test.
        selected_datetime_feature: Name of the datetime column, used as index.
        SignificanceLevel: p-value threshold below which the series is
            reported as stationary.

    Returns:
        True when the test p-value is below ``SignificanceLevel``, else False.
        The test summary and the verdict are also printed.
    """
    data = data.set_index(
        pd.DatetimeIndex(data[selected_datetime_feature]))
    # adfuller expects a 1-D series, so pass the column rather than a
    # one-column DataFrame.
    series = data[target]
    adfTest = adfuller(series, autolag='AIC')

    pValue = adfTest[1]
    isStationary_adf = bool(pValue < SignificanceLevel)

    # The first label used to read 'Adata Test Statistic' (a mangled "ADF").
    dataResults = pd.Series(adfTest[0:4],
                           index=['ADF Test Statistic', 'P-Value', '# Lags Used', '# Observations Used'])

    # Add Critical Values
    for key, value in adfTest[4].items():
        dataResults['Critical Value (%s)' % key] = value

    print('Augmented Dickey-Fuller Test Results:')
    print(dataResults)
    print(isStationary_adf)
    return isStationary_adf
    
def KPSS_Test(data, target, selected_datetime_feature, trend,SignificanceLevel=.05, regression='ct'):
    """Run the KPSS test on one column and report stationarity.

    Args:
        data: DataFrame containing the target and the datetime column.
        target: Name of the column to test.
        selected_datetime_feature: Name of the datetime column, used as index.
        trend: Despite its name, this value is forwarded as ``nlags`` to
            ``kpss`` (the number of lags the test uses).
        SignificanceLevel: p-value threshold; a p-value below it is reported
            as non-stationary.
        regression: Forwarded to ``kpss``. Per the statsmodels documentation,
            'c' tests stationarity around a constant and 'ct' around a
            constant plus linear trend. Defaults to 'ct', the value that was
            previously hard-coded.

    Returns:
        False when the test p-value is below ``SignificanceLevel``, else True.
        The test summary and the verdict are also printed.
    """
    data = data.set_index(
        pd.DatetimeIndex(data[selected_datetime_feature]))
    # kpss expects a 1-D series, so pass the column rather than a one-column frame.
    series = data[target]
    kpsstest = kpss(series, regression=regression, nlags=trend)
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic', 'p-value', '#Lags Used'])
    for key, value in kpsstest[3].items():
        kpss_output['Critical Value (%s)' % key] = value
    print(kpss_output)
    pValue = kpsstest[1]
    # Opposite reading to ADF_Test: a small p-value here means NOT stationary.
    isStationary_kpss = not bool(pValue < SignificanceLevel)
    print(isStationary_kpss)
    return isStationary_kpss

In [13]:
def editing_index(data, col, col2):
    """Return ``data`` indexed by the pair (``col``, ``col2``), sorted by that index."""
    indexed = data.set_index([col, col2])
    return indexed.sort_index()

In [14]:
def derived_lag_features(data, lag_features, lag_bound=5):
    """Build lagged copies of the given columns.

    For every column in ``lag_features`` and every lag from 1 to
    ``lag_bound``, a column ``<col>_lag_<lag>`` holding ``data[col].shift(lag)``
    is created.

    Args:
        data: Source DataFrame. It is NOT modified; the lag columns are built
            on a copy (previously they were added to the caller's frame, which
            was often a slice of another frame).
        lag_features: Iterable of column names to lag.
        lag_bound: Largest lag to create (converted with ``int``).

    Returns:
        DataFrame with every column whose name contains ``_lag_`` (the new
        ones plus any such column already present in ``data``).
    """
    frame = data.copy()
    max_lag = int(lag_bound)
    for col in lag_features:
        for lag in range(1, max_lag + 1):
            frame[f'{col}_lag_{lag}'] = frame[col].shift(lag)
    return frame.loc[:, frame.columns.str.contains('_lag_')]

In [15]:
def app_lag_data(data, WINDOW, derived_lag_features_cols,series_id,datetime_feature):
    """Compute lag features per series with a sliding window.

    For each distinct value of ``series_id`` the rows of that series are
    re-indexed by (``datetime_feature``, ``series_id``) and a window is slid
    over them; ``derived_lag_features`` is applied to each window and only
    rows without missing values are kept.

    * When ``WINDOW + 1 < 7`` the window holds the rows ``[v - WINDOW, v]``
      and lags 1..WINDOW are created.
    * Otherwise the last 6 rows of ``[v - WINDOW, v)`` are used with the
      default ``lag_bound`` of ``derived_lag_features``.

    Args:
        data: DataFrame indexed so that ``reset_index`` exposes ``series_id``
            and ``datetime_feature`` as columns.
        WINDOW: Window length (converted with ``int``).
        derived_lag_features_cols: Columns to lag.
        series_id: Name of the series identifier column.
        datetime_feature: Name of the timestamp column.

    Returns:
        DataFrame of lag columns for all series, sorted by index.
    """
    window = int(WINDOW)
    short_window = window + 1 < 7
    frame = data.copy().reset_index()
    per_series = []
    for seri in frame[series_id].unique():
        series_frame = editing_index(frame[frame[series_id] == seri], datetime_feature, series_id)
        rows = []
        for step in tqdm(range(len(series_frame) - window)):
            v = window + 1 + step
            if short_window:
                lagged = derived_lag_features(series_frame[v - window:v + 1], derived_lag_features_cols,
                                              lag_bound=WINDOW)
            else:
                lagged = derived_lag_features(series_frame[v - window:v][-6:],
                                              derived_lag_features_cols)
            rows.append(lagged.dropna())
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # frame on every step; collect the pieces and concatenate once.
        per_series.append(pd.concat(rows) if rows else pd.DataFrame())
    lagged_data = pd.concat(per_series)
    return lagged_data.sort_index()

In [16]:
def time_type_detect(time_type):
    """Return the number of seconds this notebook assigns to a time unit.

    Args:
        time_type: One of 'years', 'quarters', 'months', 'weeks', 'days',
            'hours', 'minutes', 'seconds'.

    Returns:
        int: Seconds per unit (for example 604800 for 'weeks').

    Raises:
        ValueError: If ``time_type`` is not one of the supported units
            (previously this surfaced as an UnboundLocalError).
    """
    seconds_per_unit = {
        'years': 31536000,
        'quarters': 7948800,
        'months': 2592000,
        'weeks': 604800,
        'days': 86400,
        'hours': 3600,
        'minutes': 60,
        'seconds': 1,
    }
    try:
        return seconds_per_unit[time_type]
    except KeyError:
        raise ValueError(
            f"Unknown time_type {time_type!r}; expected one of {list(seconds_per_unit)}"
        ) from None

In [17]:
def derive_features(data, derivation_lagged_cols, win, window_list,time_type, frequency):
    """Add rolling min/max/mean/std/median columns for several window sizes.

    For every window size in ``window_list``, every statistic and every column
    in ``derivation_lagged_cols``, a column named
    ``<col>_stat_<stat>_<label>_<time_type>`` is created, where ``label`` is
    ``int(window * frequency / seconds_per_unit(time_type))``. Rolling windows
    use ``min_periods=1``.

    Args:
        data: Source DataFrame. It is NOT modified; columns are added to a copy.
        derivation_lagged_cols: Columns to compute statistics for.
        win: Unused. Kept for backward compatibility: the window sizes actually
            applied come from ``window_list`` (the original loop variable
            shadowed this parameter before any statistic was computed).
        window_list: Iterable of rolling window sizes.
        time_type: Unit name understood by ``time_type_detect``.
        frequency: Sampling interval in seconds.

    Returns:
        Copy of ``data`` with the statistic columns appended.
    """
    stat_names = ('min', 'max', 'mean', 'std', 'median')
    time_num = time_type_detect(time_type)
    data = data.copy()
    for span in window_list:
        label = int(span * frequency / time_num)
        for stat in stat_names:
            for j in derivation_lagged_cols:
                roller = data[j].rolling(window=span, min_periods=1)
                data[f'{j}_stat_{stat}_{label}_{time_type}'] = getattr(roller, stat)()
    return data

In [18]:
def app_derived_data(data, derived_lag_features_cols, WINDOW, window_list,time_type, frequency,series_id,datetime_feature):
    """Compute rolling-statistic features per series with a sliding window.

    For each distinct value of ``series_id`` the rows of that series are
    re-indexed by (``datetime_feature``, ``series_id``) and a window is slid
    over them; ``derive_features`` is applied to each window and the last row
    of the result is kept.

    Args:
        data: DataFrame indexed so that ``reset_index`` exposes ``series_id``
            and ``datetime_feature`` as columns.
        derived_lag_features_cols: Columns to compute statistics for.
        WINDOW: Sliding window length (converted with ``int``).
        window_list: Rolling window sizes forwarded to ``derive_features``.
        time_type: Unit name forwarded to ``derive_features``.
        frequency: Sampling interval in seconds, forwarded to ``derive_features``.
        series_id: Name of the series identifier column.
        datetime_feature: Name of the timestamp column.

    Returns:
        float32 DataFrame holding only the columns whose name contains
        ``stat_``, sorted by index.
    """
    window = int(WINDOW)
    short_window = window + 1 < 7
    frame = data.copy().reset_index()
    derives = []
    for seri in frame[series_id].unique():
        series_frame = editing_index(frame[frame[series_id] == seri], datetime_feature, series_id)
        rows = []
        for step in tqdm(range(len(series_frame) - window)):
            v = window + 1 + step
            # NOTE(review): in the short-window branch the final step slices
            # past the end of the series, so its last row is the same row the
            # previous step produced. Confirm whether that duplicate is wanted.
            chunk = series_frame[v - window:v + 1] if short_window else series_frame[v - window:v]
            enriched = derive_features(chunk, derived_lag_features_cols, WINDOW, window_list,
                                       time_type, frequency)
            rows.append(enriched.iloc[-1].to_frame().T)
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # frame on every step; collect the pieces and concatenate once.
        derives.append(pd.concat(rows) if rows else pd.DataFrame())
    derived_data = pd.concat(derives)
    derived_data = derived_data.loc[:, derived_data.columns.str.contains('stat_')]
    derived_data = derived_data.astype('float32')
    return derived_data.sort_index()

In [19]:
def app_diff_data(df, window, lagged_data, derived_data, target, time_type):
    """Build difference features between the target, its lags and its rolling stats.

    The reference column is ``derived_data['<target>_stat_mean_<window>_<time_type>']``.
    For every lag column and every other derived column whose name contains
    ``target`` two columns are created: target minus that column, and that
    column minus the reference column.

    Returns:
        DataFrame (built on a copy of ``df``) restricted to the columns whose
        name contains ``diff``.
    """
    data = df.copy()
    lag_columns = [name for name in lagged_data.columns if target in name]
    baseline = derived_data[f'{target}_stat_mean_{window}_{time_type}']
    stat_columns = [name for name in derived_data.columns
                    if target in name and baseline.name not in name]
    prefix = target + "_"

    def add_diffs(source, columns):
        # Adds the two difference columns for each source column to `data`.
        for name in columns:
            short = name.replace(prefix, "", 1)
            data[f'{target}_diff_{short}'] = data[target] - source[name]
            data[f'{target}_{short}_diff_{baseline.name}'] = source[name] - baseline

    add_diffs(lagged_data, lag_columns)
    add_diffs(derived_data, stat_columns)
    return data.loc[:, data.columns.str.contains('diff')]

In [20]:
def merge_data(data, lagged_data, derived_data, diff_data=None):
    """Join the base data with its feature frames on their indexes.

    Args:
        data: Base DataFrame.
        lagged_data: Lag features, indexed like ``data``.
        derived_data: Rolling-statistic features, indexed like ``data``.
        diff_data: Optional difference features, indexed like ``data``.

    Returns:
        DataFrame produced by successive index-on-index ``pd.merge`` calls
        (pandas' default join type), in the order the frames are listed above.
    """
    frames = [data, lagged_data, derived_data]
    # `if diff_data:` raised "truth value of a DataFrame is ambiguous" whenever
    # a frame was actually supplied; test for None explicitly.
    if diff_data is not None:
        frames.append(diff_data)
    merge = partial(pd.merge, left_index=True, right_index=True)
    return reduce(merge, frames)

In [21]:
def trend_removal_log(data,target_list):
    """Apply a sign-preserving ``log1p`` to the given columns and rename them.

    Each value ``x`` becomes ``sign(x) * log1p(abs(x))`` and each transformed
    column is renamed to ``<name>_log``.

    Args:
        data: DataFrame to transform (modified in place).
        target_list: List of column names to transform.

    Returns:
        The same DataFrame object.
    """
    columns = list(target_list)
    if not columns:
        return data
    values = data[columns]
    # Vectorised replacement for the old "remember negative positions, take
    # abs, log, flip the sign back" sequence. That version looked rows up with
    # integer labels (`data[column][x]`) and wrote through chained indexing,
    # so it depended on the index type and could silently fail to restore the
    # sign.
    data[columns] = np.sign(values) * np.log1p(np.abs(values))
    data.rename(columns={name: f"{name}_log" for name in columns}, inplace=True)
    return data

In [22]:
def split(df,target,horizon,len_unique):
    """Separate features from the multi-step target matrix.

    X is ``df`` without the ``target`` column. y is built by applying
    ``derived_lag_features`` to the target column with ``horizon`` lags and
    dropping rows with missing values. The first ``horizon * len_unique`` rows
    of X and the first ``horizon * len_unique - horizon`` rows of y are then
    discarded.

    Returns:
        Tuple ``(X, y)``.
    """
    steps = int(horizon)
    offset = steps * len_unique
    features = df.drop([target], axis=1)
    targets = derived_lag_features(df[[target]], [target], steps).dropna()
    return features.iloc[offset:], targets.iloc[offset - steps:]

In [23]:
def get_fold(X,fold_number,unique_len):
    """Build expanding-window cross-validation partitions as row positions.

    The fold size is ``int(len(X) / (fold_number + 1))`` lowered to a multiple
    of ``unique_len``. Fold ``i`` trains on the first ``i`` blocks and
    validates on the next block; the last fold validates on all remaining rows.

    Returns:
        List of dicts with keys ``'train'`` and ``'validation'``, each a list
        of integer row positions.
    """
    positions = X.reset_index().index
    fold_size = int(len(positions) / (fold_number + 1))
    while fold_size % unique_len != 0:
        fold_size -= 1

    cv_partitions = []
    for fold in range(1, fold_number + 1):
        cut = fold_size * fold
        # The final fold validates on everything after the cut.
        stop = None if fold == fold_number else cut + fold_size
        cv_partitions.append({'train': positions[:cut].tolist(),
                              'validation': positions[cut:stop].tolist()})
    return cv_partitions

In [24]:
def pipeline_build(alg,num_cols,cat_cols):
    """Assemble the preprocessing + multi-output regression pipeline.

    Numeric columns are median-imputed; categorical columns are imputed with a
    constant and one-hot encoded (unknown categories ignored); remaining
    columns pass through. The estimator is wrapped in ``MultiOutputRegressor``.

    Returns:
        An unfitted sklearn ``Pipeline`` with steps ``'preprocessor'`` and
        ``'algorithm'``.
    """
    # NOTE(review): fill_value='missing' is passed together with
    # strategy='median'; presumably it has no effect there - confirm and drop.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median', fill_value='missing')),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
    column_stage = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), num_cols),
            ('cat', Pipeline(steps=categorical_steps), cat_cols),
        ],
        remainder='passthrough',
    )
    model_stage = MultiOutputRegressor(alg)
    return Pipeline(steps=[('preprocessor', column_stage),
                           ('algorithm', model_stage)])

In [25]:
def objective(trial,X,y,fold_list,alg,num_cols,cat_cols):
    """Optuna objective: mean validation RMSE over the given folds.

    Samples ``learning_rate``, ``n_estimators``, ``max_bin``, ``subsample`` and
    ``max_depth`` from the trial, sets them on ``alg``, and for every fold fits
    a pipeline from ``pipeline_build`` on the train positions and scores it on
    the validation positions.

    Returns:
        Mean RMSE across folds (also printed).
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate',0.05, 0.5,step=0.01),
        'n_estimators': trial.suggest_int('n_estimators',10, 3000,step=10),
        'max_bin': trial.suggest_int('max_bin',16, 2048,step=16),
        'subsample': trial.suggest_float('subsample', 0.1, 1,step=0.1),
        'max_depth': trial.suggest_int('max_depth', 6, 10,step=1),
    }
    fold_scores = []
    for fold in fold_list:
        train_rows, val_rows = fold['train'], fold['validation']
        X_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]
        X_hold, y_hold = X.iloc[val_rows], y.iloc[val_rows]
        alg.set_params(**params)
        pipe = pipeline_build(alg,num_cols,cat_cols)
        pipe.fit(X_fit, y_fit)
        predictions = pipe.predict(X_hold)
        fold_scores.append(np.sqrt(mean_squared_error(y_hold, predictions)))
    print(f'RMSE : {np.mean(fold_scores)}')
    return np.mean(fold_scores)

In [26]:
def metrics_calculate(y_val,y_pred,X_train):
    """Compute regression metrics for one validation fold.

    Args:
        y_val: True target values.
        y_pred: Predicted target values.
        X_train: Training feature frame; its row and column counts feed the
            adjusted R-squared.

    Returns:
        Dict with scalar values under the keys 'RMSE', 'MAE', 'RMSLE',
        'R-Squared', 'Adj R-Squared' and 'MAPE'. 'RMSLE' is NaN when
        ``mean_squared_log_error`` rejects the inputs with a ValueError.
    """
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    # Two fixes: stray trailing commas made `rmsle` and `r2` one-element
    # tuples, and the value reported as RMSLE lacked the square root.
    try:
        rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
    except ValueError:
        # Per the sklearn docs this is raised for negative values; keep the
        # other metrics usable instead of aborting the fold.
        rmsle = float('nan')
    r2 = r2_score(y_val, y_pred)
    n_samples = X_train.shape[0]
    n_features = len(X_train.columns.tolist())
    adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
    mape = mean_absolute_percentage_error(y_val, y_pred)
    scores = {'RMSE' : rmse,'MAE' : mae,'RMSLE' : rmsle,'R-Squared' : r2,
                  'Adj R-Squared' : adj_r2,'MAPE' : mape}
    return scores

In [27]:
def pred_visualize_pxexpress(y_val_,y_pred_,target,unique_col,m,i):
    """Show one Plotly figure per horizon column comparing actual and predicted values.

    Column ``k`` of ``y_val_`` and of ``y_pred_`` are drawn as two lines
    against the index of ``y_val_``. ``m`` is the series value and ``i`` the
    zero-based fold number used in the labels.
    """
    timeline = y_val_.index
    n_steps = len(y_val_.columns.tolist())
    for k in range(n_steps):
        real_table_name = f'{target} and {unique_col} : {m} Week : {k+1}'
        actual_trace = go.Scatter(x=timeline, y=y_val_.iloc[:,k], mode='lines',
                                  name=f'Real Day {real_table_name}', line_color='#247AFD')
        predicted_trace = go.Scatter(x=timeline, y=y_pred_.iloc[:,k], mode='lines',
                                     name=f'Pred Day {real_table_name}', line_color='#ff0000')
        fig = go.Figure()
        fig.add_trace(actual_trace)
        fig.add_trace(predicted_trace)
        fig.update_layout(title=f'Test Değerleri ve Tahminler Fold {i + 1}',
                  xaxis_title='Time',
                  yaxis_title='Horizon Time Steps')
        fig.show()

In [28]:
def pred_visualize_plotly(y_val_, y_pred_, target, unique_col, m, i):
    """Show one matplotlib figure per horizon column comparing actual and predicted values.

    Despite the name, this function draws with matplotlib (``plt``). Column
    ``k`` of ``y_val_`` and of ``y_pred_`` are drawn as two lines against the
    index of ``y_val_``. ``m`` is the series value and ``i`` the zero-based
    fold number used in the labels.
    """
    timeline = y_val_.index
    n_steps = len(y_val_.columns.tolist())
    for k in range(n_steps):
        real_table_name = f'{target} and {unique_col} : {m} Week : {k + 1}'
        actual = y_val_.iloc[:, k]
        predicted = y_pred_.iloc[:, k]

        plt.figure(figsize=(10, 6))
        plt.plot(timeline, actual, label=f'Real Day {real_table_name}', color='#247AFD')
        plt.plot(timeline, predicted, label=f'Pred Day {real_table_name}', color='#ff0000')

        plt.title(f'Test Values And Predictions Fold {i + 1}')
        plt.xlabel('Time')
        plt.ylabel('Horizon Time Steps')
        plt.legend()
        plt.subplots_adjust(hspace=0.4)
        plt.show()

In [29]:
def train_and_visualization(X,y,fold_list,horizon,num_cols,cat_cols,alg,timestamp_column,unique_col,target):
    """Fit and evaluate the model on every fold, then plot predictions per series.

    For each entry of ``fold_list`` a pipeline from ``pipeline_build`` is
    fitted on the train positions, the validation positions are predicted,
    the metrics from ``metrics_calculate`` are printed, and actual vs
    predicted values are plotted for each distinct value of ``unique_col``.

    Args:
        X, y: Feature and target frames, addressed by position via ``iloc``.
        fold_list: List of dicts with 'train' and 'validation' position lists.
        horizon: Number of prediction columns (used to name them).
        num_cols, cat_cols: Column lists forwarded to ``pipeline_build``.
        alg: Estimator forwarded to ``pipeline_build``.
        timestamp_column, unique_col: Names used after ``reset_index``;
            assumes they are index level names of X and y - TODO confirm.
        target: Target name, used only in plot labels.

    Returns:
        None. Results are printed and plotted.
    """
    for i in range(len(fold_list)):
        train_indices = fold_list[i]['train']
        val_indices = fold_list[i]['validation']
        X_train = X.iloc[train_indices]
        y_train = y.iloc[train_indices]
        X_val = X.iloc[val_indices]                    
        y_val = y.iloc[val_indices]  
        pipe = pipeline_build(alg,num_cols,cat_cols)
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_val)
        scores = metrics_calculate(y_val,y_pred,X_train)
        print(f"Fold {i + 1} Scores : {scores}")
        # One label per horizon step: '+1_Horizon_time_step', '+2_...', ...
        # The comprehension's `i` is local to it and does not disturb the fold counter.
        model_preds_columns_list = [[f'+{i + 1}_Horizon_time_step'][0] for i in range(horizon)]
        # NOTE(review): the label list is wrapped in another list here;
        # confirm the resulting column structure is the intended one.
        y_pred = pd.DataFrame(y_pred, index=X_val.index,
                                  columns=[model_preds_columns_list])
        # Order predictions by series, then time, so that each series occupies
        # one contiguous run of rows for the positional slicing below.
        y_pred = y_pred.sort_values(by=[unique_col,timestamp_column],ascending=[True,True])
        y_pred = y_pred.reset_index()
        y_val = y_val.reset_index()
        print(f"Train Start-End: {X_train.index[0]} - {X_train.index[-1]}")
        print(f"Validation Start-End: {X_val.index[0]} - {X_val.index[-1]}")
        # Predictions are cut into equal-sized consecutive chunks, one per
        # series; this assumes every series has the same number of validation
        # rows - TODO confirm. `indice_` and `indice_end` start from the same value.
        indice = 0
        indice_ = len(y_pred[timestamp_column])/len(y_val[unique_col].unique())
        indice_end = len(y_pred[timestamp_column])/len(y_val[unique_col].unique())
        for m in y_val[unique_col].unique():
            y_val_ = y_val[y_val[unique_col] == m]
            y_pred_ = y_pred.iloc[int(indice):int(indice_)]
            y_val_=y_val_.set_index(timestamp_column)
            # Keep only the horizon columns for plotting.
            y_val_.drop(unique_col,axis=1,inplace=True)
            y_pred_.drop(unique_col,axis=1,inplace=True)
            y_pred_.drop(timestamp_column,axis=1,inplace=True)
            # To plot with Plotly (graph_objects) instead of matplotlib, enable the call below.
            #pred_visualize_pxexpress(y_val_,y_pred_,target,unique_col,m,i)
            pred_visualize_plotly(y_val_,y_pred_,target,unique_col,m,i)
            # Advance both chunk boundaries to the next series.
            indice += indice_end
            indice_ += indice_end

In [30]:
# NOTE(review): absolute, machine-specific path - the notebook cannot be
# re-run elsewhere without editing it; consider a configurable data directory.
train = read_dataset("C:/Users/MahmutYAVUZ/Desktop/Software/Python/kaggle/advanced_multiple_time_series/data/raw/Walmart.csv")

In [None]:
train

In [None]:
train.info()

In [None]:
# Column holding the timestamps and the column to forecast.
timestamp_column = 'Date'
target = 'Weekly_Sales' 

In [None]:
# Make sure the timestamp column is a datetime column.
train=time_control_type(train,timestamp_column)

In [None]:
# True when timestamps repeat (see time_len_control).
control = time_len_control(train,timestamp_column)

In [None]:
# NOTE(review): unique_list is only defined when `control` is True, but the
# following cells use unique_list[0] unconditionally.
if control:
    unique_list = auto_detect(train,timestamp_column)
    print(unique_list)

In [None]:
# Sort by timestamp, then by the detected series identifier.
train = date_sort(train,timestamp_column,unique_list[0])

In [None]:
train

In [None]:
# Plot every numeric column (except the series identifier) for each series.
for i in train[unique_list[0]].unique():
    data = train[train[unique_list[0]] == i]
    num_cols = data.select_dtypes(include=['float','int']).columns.tolist()
    num_cols.remove(unique_list[0])
    # To plot with Plotly Express instead, use the date_column_info_pxexpress function.
    #date_column_info_pxexpress(data,num_cols,timestamp_column,i)
    date_column_info_pyplot(data,num_cols,timestamp_column,i)

In [None]:
# Add calendar features (Day, Month, Year, ...) as string columns.
train = date_engineering(train,timestamp_column)

In [None]:
# Dominant sampling interval: unit name and gap in seconds.
time_type,frequency = frequency_detect(train,timestamp_column)

In [None]:
train

In [None]:
isStationary_adf = ADF_Test(train,target,timestamp_column)

In [None]:
# `trend` is forwarded by KPSS_Test as the `nlags` argument of kpss.
isStationary_kpss = KPSS_Test(train,target,timestamp_column,trend=315)

In [None]:
train

In [None]:
# Sliding-window length used by the lag / rolling-statistic feature builders.
window = 50

In [None]:
# Index the data by (timestamp, series identifier).
train = editing_index(train,timestamp_column,unique_list[0])

In [None]:
train

In [None]:
num_cols = train.select_dtypes(include=['float','int']).columns.tolist() 
cat_cols = train.select_dtypes(exclude=['float','int']).columns.tolist()

In [None]:
# Treat the holiday indicator as a categorical feature.
num_cols.remove('Holiday_Flag')
cat_cols.append('Holiday_Flag')

In [None]:
num_cols,cat_cols

In [None]:
# With window = 50, app_lag_data takes its long-window branch, which calls
# derived_lag_features with its default lag_bound.
lagged_data = app_lag_data(train,window,num_cols,unique_list[0],timestamp_column)

In [None]:
lagged_data

In [None]:
# Rolling window sizes for the min/max/mean/std/median features.
window_list = [50,25,10]

In [None]:
derived_data = app_derived_data(train,num_cols,window,window_list,time_type,frequency,unique_list[0],timestamp_column)

In [None]:
# Presumably the index returned by app_derived_data has unnamed levels, so
# reset_index yields 'level_0' / 'level_1' - TODO confirm.
derived_data = derived_data.reset_index()
derived_data.rename(columns = {'level_0':timestamp_column,'level_1':unique_list[0]},inplace=True)

In [None]:
derived_data = editing_index(derived_data,timestamp_column,unique_list[0])

In [None]:
def split_data(data,window,unique_len):
    """Drop the leading rows that have no complete feature window.

    The definition had been pasted twice and fused into a syntax error
    (``return datadef split_data(...)``); this is the single intended copy.

    Args:
        data: DataFrame ordered so that each timestamp contributes
            ``unique_len`` consecutive rows.
        window: Sliding-window length used to build the features.
        unique_len: Number of distinct series.

    Returns:
        ``data`` without its first ``window * unique_len`` rows (one extra
        block of ``unique_len`` rows is dropped when ``window < 6``), sorted
        by index.
    """
    window = int(window)
    unique_len = int(unique_len)
    skipped_rows = window * unique_len
    if window < 6:
        # Short windows lose one additional time step per series.
        skipped_rows += unique_len
    return data.iloc[skipped_rows:].sort_index()

In [None]:
# Drop the leading rows for which no full feature window exists.
train = split_data(train,window,len(train.reset_index()[unique_list[0]].unique()))

In [None]:
# NOTE(review): diff_data is only created when the KPSS test reports
# non-stationarity, and it is not passed to merge_data in the next cell.
if not isStationary_kpss:
    diff_data = app_diff_data(train,window,lagged_data,derived_data,target,time_type)

In [None]:
# Join base data, lag features and rolling statistics on the shared index.
final_data = merge_data(train,lagged_data,derived_data)

In [None]:
final_data

In [None]:
# When the ADF test reports non-stationarity, log-transform every column
# derived from the target (but not the target itself).
if not isStationary_adf:
    target_list = [x for x in final_data.columns.tolist() if x.startswith(target) and x != target]
    final_data = trend_removal_log(final_data,target_list)

In [None]:
final_data

In [None]:
# Number of target columns built by split().
horizon = 4
X,y = split(final_data,target,horizon,len(train.reset_index()[unique_list[0]].unique()))

In [None]:
X

In [None]:
y

In [None]:
# Hold out roughly the last 20% of rows as the test set.
X_train,X_test,y_train,y_test = make_train_test_splits(X,y,0.20,len(train.reset_index()[unique_list[0]].unique()))

In [None]:
# Three expanding-window folds over the training part.
fold_list = get_fold(X_train,3,len(train.reset_index()[unique_list[0]].unique()))

In [None]:
num_cols = X.select_dtypes(include=['float','int']).columns.tolist() 
cat_cols = X.select_dtypes(exclude=['float','int']).columns.tolist()

In [None]:
# Treat the holiday indicator as a categorical feature.
num_cols.remove('Holiday_Flag')
cat_cols.append('Holiday_Flag')

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
y_test

In [None]:
# Baseline comparison: mean cross-validated RMSE of each candidate regressor
# with its default hyper-parameters.
alg_list = [XGBRegressor(random_state=42),LGBMRegressor(random_state=42,verbose=-1),CatBoostRegressor(random_state=42,verbose=0)]
for alg in alg_list:
    # Start a fresh list per algorithm. It used to be created once before the
    # loop, so the mean printed for the second and third model also included
    # the fold scores of the models before it.
    rmse_list = []
    for i in range(len(fold_list)):
        train_indices = fold_list[i]['train']
        val_indices = fold_list[i]['validation']
        X_train_2 = X_train.iloc[train_indices]
        y_train_2 = y_train.iloc[train_indices]
        X_val = X_train.iloc[val_indices]
        y_val = y_train.iloc[val_indices]
        pipe = pipeline_build(alg,num_cols,cat_cols)
        pipe.fit(X_train_2,y_train_2)
        # Predict once and reuse the result for the score.
        y_pred = pipe.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    print(f'RMSE And {type(alg).__name__} : {np.mean(rmse_list)}')

In [None]:
# Hyper-parameter search for LGBMRegressor, minimising the mean fold RMSE.
# The objective receives the full X / y while fold_list was built from
# X_train; the positions still line up because X_train is the leading slice
# of X (see make_train_test_splits).
study = optuna.create_study(direction = 'minimize',study_name = 'advanced_multiple_time_series')
study.optimize(lambda trial: objective(trial,X,y,fold_list,LGBMRegressor(random_state=42,verbose=-1),num_cols,cat_cols), n_trials = 100)

In [None]:
print('Best Value:', study.best_value)
print('Best Params:', study.best_params)
# NOTE(review): best_params is not used by any later cell shown here.
best_params = study.best_params

In [None]:
# NOTE(review): forecast_distance is not used by any later cell shown here.
forecast_distance = time_type_detect(time_type)

In [None]:
# NOTE(review): the final run uses CatBoostRegressor with its default
# parameters, not the LGBM parameters tuned above - confirm this is intended.
alg = CatBoostRegressor(random_state=42,verbose=0)
train_and_visualization(X,y,fold_list,horizon,num_cols,cat_cols,alg,timestamp_column,unique_list[0],target)