In [4]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler

current_folder = os.path.dirname(os.path.abspath(''))
dataSet_location = os.path.join(current_folder,'Datasets')
avgRunFile = os.path.join(dataSet_location,'avg_run_dir1.csv')

# dataframe of average running times for each segement for each time slot in history
avg_run_df = pd.read_csv(avgRunFile)

# additional columns
avg_run_df['date'] = pd.to_datetime(avg_run_df['date'])
avg_run_df['weekday'] = avg_run_df['date'].dt.dayofweek
avg_run_df['time'] = avg_run_df['date'].dt.time

def handlingZeros(df):
    # Loop over each segment column
    for col in df.columns[1:-2]:
        # Group by weekday and time slot
        grouped = df.groupby(['weekday', 'time'])

        # Define a function to apply to each group
        def replace_zeros(group):
            nonzero_mean = group[group != 0].mean()
            if pd.notnull(nonzero_mean):
                return group.replace(0, nonzero_mean)
            else:
                return group

        # Apply the function to each group in the specific segment column
        df[col] = grouped[col].transform(replace_zeros)

    return df

def handlingOutliers(df, stdev_num):
    # Assuming 'df' is your DataFrame
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number):  # check if column is numeric
            mean = df[col].mean()
            std = df[col].std()
            median = df[col].median()
            outliers = (df[col] - mean).abs() > stdev_num*std
            df.loc[outliers, col] = median
    return df

processed_df = handlingZeros(avg_run_df)
processed_df = handlingOutliers(processed_df,3)

processed_df

Unnamed: 0,date,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,weekday,time
0,2021-10-01 06:00:00,100.800000,241.333333,444.000000,226.266667,135.571429,229.406250,0.000000,82.692308,0.000000,173.000000,0.000000,174.545455,149.652778,106.259259,320.692308,4,06:00:00
1,2021-10-01 06:15:00,98.666667,238.529412,426.600000,203.600000,135.571429,229.406250,0.000000,82.692308,0.000000,173.000000,0.000000,174.545455,149.652778,106.259259,320.692308,4,06:15:00
2,2021-10-01 06:30:00,69.000000,271.944444,407.411765,188.538462,121.125000,237.666667,40.000000,75.600000,27.800000,143.500000,0.000000,174.545455,149.652778,106.259259,320.692308,4,06:30:00
3,2021-10-01 06:45:00,87.812500,210.000000,496.000000,195.000000,97.000000,285.444444,45.382353,91.718750,35.214286,162.000000,202.000000,165.333333,160.000000,98.500000,320.692308,4,06:45:00
4,2021-10-01 07:00:00,104.789474,282.131579,545.210526,224.428571,126.750000,210.000000,45.000000,99.000000,34.000000,149.000000,219.125000,175.117647,154.333333,103.307692,312.500000,4,07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12475,2022-09-30 17:45:00,105.846154,260.642857,387.750000,236.000000,123.000000,208.000000,52.000000,73.000000,30.000000,165.000000,135.200000,180.000000,157.000000,150.000000,278.000000,4,17:45:00
12476,2022-09-30 18:00:00,94.100000,237.115385,412.428571,251.107143,143.200000,303.125000,63.857143,84.111111,38.125000,155.269231,154.000000,166.000000,158.000000,115.000000,342.178571,4,18:00:00
12477,2022-09-30 18:15:00,101.200000,237.550000,409.333333,235.100000,136.450000,234.541667,48.571429,77.357143,29.966667,171.071429,138.681818,153.000000,162.000000,99.833333,382.000000,4,18:15:00
12478,2022-09-30 18:30:00,109.666667,264.500000,415.500000,250.812500,144.909091,230.227273,44.500000,82.200000,29.150000,153.200000,138.409091,154.416667,145.230769,98.714286,316.291667,4,18:30:00


In [5]:
# Get the list of integer column names (segments)
segment_columns = processed_df.columns[1:-2]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Loop through each segment column and normalize it
for segment in segment_columns:
    processed_df[segment] = scaler.fit_transform(processed_df[segment].values.reshape(-1, 1))

processed_df

Unnamed: 0,date,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,weekday,time
0,2021-10-01 06:00:00,0.407042,0.329710,0.364326,0.440404,0.436411,0.404076,0.000000,0.414488,0.000000,0.435345,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:00:00
1,2021-10-01 06:15:00,0.392019,0.322091,0.331309,0.354545,0.436411,0.404076,0.000000,0.414488,0.000000,0.435345,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:15:00
2,2021-10-01 06:30:00,0.183099,0.412893,0.294899,0.297494,0.348323,0.428019,0.380952,0.345631,0.408824,0.308190,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:30:00
3,2021-10-01 06:45:00,0.315581,0.244565,0.462998,0.321970,0.201220,0.566506,0.432213,0.502124,0.517857,0.387931,0.508816,0.324713,0.461538,0.282313,0.365767,4,06:45:00
4,2021-10-01 07:00:00,0.435137,0.440575,0.556377,0.433442,0.382622,0.347826,0.428571,0.572816,0.500000,0.331897,0.551952,0.366886,0.432479,0.315018,0.347682,4,07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12475,2022-09-30 17:45:00,0.442579,0.382182,0.257590,0.477273,0.359756,0.342029,0.495238,0.320388,0.441176,0.400862,0.340554,0.387931,0.446154,0.632653,0.271523,4,17:45:00
12476,2022-09-30 18:00:00,0.359859,0.318248,0.304419,0.534497,0.482927,0.617754,0.608163,0.428263,0.560662,0.358919,0.387909,0.327586,0.451282,0.394558,0.413198,4,18:00:00
12477,2022-09-30 18:15:00,0.409859,0.319429,0.298545,0.473864,0.441768,0.418961,0.462585,0.362691,0.440686,0.427032,0.349324,0.271552,0.471795,0.291383,0.501104,4,18:15:00
12478,2022-09-30 18:30:00,0.469484,0.392663,0.310247,0.533381,0.493348,0.406456,0.423810,0.409709,0.428676,0.350000,0.348638,0.277658,0.385799,0.283771,0.356052,4,18:30:00


## Generating Time Series Sequences 

In [6]:
processed_df = processed_df.loc[processed_df.date <= '2022-02-28']
processed_df

Unnamed: 0,date,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,weekday,time
0,2021-10-01 06:00:00,0.407042,0.329710,0.364326,0.440404,0.436411,0.404076,0.000000,0.414488,0.000000,0.435345,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:00:00
1,2021-10-01 06:15:00,0.392019,0.322091,0.331309,0.354545,0.436411,0.404076,0.000000,0.414488,0.000000,0.435345,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:15:00
2,2021-10-01 06:30:00,0.183099,0.412893,0.294899,0.297494,0.348323,0.428019,0.380952,0.345631,0.408824,0.308190,0.000000,0.364420,0.408476,0.335097,0.365767,4,06:30:00
3,2021-10-01 06:45:00,0.315581,0.244565,0.462998,0.321970,0.201220,0.566506,0.432213,0.502124,0.517857,0.387931,0.508816,0.324713,0.461538,0.282313,0.365767,4,06:45:00
4,2021-10-01 07:00:00,0.435137,0.440575,0.556377,0.433442,0.382622,0.347826,0.428571,0.572816,0.500000,0.331897,0.551952,0.366886,0.432479,0.315018,0.347682,4,07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7795,2022-02-27 17:45:00,0.539437,0.255629,0.290409,0.350379,0.515244,0.478261,0.628571,0.485437,0.308824,0.512931,0.755668,0.245690,0.471795,0.309524,0.456954,6,17:45:00
7796,2022-02-27 18:00:00,0.436620,0.122283,0.332258,0.815341,0.456174,0.294565,0.468783,0.366990,0.333333,0.316379,0.700252,0.564655,0.661538,0.476190,0.395143,6,18:00:00
7797,2022-02-27 18:15:00,0.584507,0.220109,0.347249,0.299242,0.439024,0.327536,0.714286,0.398058,0.405637,0.345366,0.340050,0.222291,0.291209,0.252079,0.359823,6,18:15:00
7798,2022-02-27 18:30:00,0.437324,0.122283,0.442125,0.587121,0.448171,0.339855,0.815476,0.332039,0.441176,0.353448,0.299748,0.159483,0.338462,0.204082,0.345107,6,18:30:00


### Using Darts Python Library to Handle Time Series

In [31]:
from darts import TimeSeries
from darts.models import ExponentialSmoothing
from darts.metrics import mae
from darts.models import ARIMA, AutoARIMA

In [28]:
list_columns = processed_df.columns[1:-2]
list_columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15'],
      dtype='object')

In [36]:
list_models = []
for col in list_columns:
    series = TimeSeries.from_dataframe(processed_df, "date", col, fill_missing_dates=True, freq=None)
    train, val = series[:-36], series[-36:]
    model = ARIMA()
    list_models.append(model)
    model.fit(train)
    pred = model.predict(len(val))
    print(col, scaler.inverse_transform(mae(actual_series=val, pred_series=pred).reshape(1, -1))[0][0])

1 212.37063980303782
2 212.17732129514476
3 188.69612359794579
4 200.054973828653
5 188.0712483447814
6 191.03020303579567
7 206.28292137597484
8 199.22630437526016
9 208.62378764617122
10 195.31127497230065
11 206.1772282513502
12 199.70698156387272
13 191.48139916614304
14 190.47471711520654
15 183.99716880266308


## Errors

Using a simple ARIMA to predict give MAE scores for each segment 
    
    1 212.37063980303782
    
    2 212.17732129514476
    
    3 188.69612359794579
    
    4 200.054973828653
    
    5 188.0712483447814
    
    6 191.03020303579567
    
    7 206.28292137597484
    
    8 199.22630437526016
    
    9 208.62378764617122
    
    10 195.31127497230065
    
    11 206.1772282513502
    
    12 199.70698156387272
    
    13 191.48139916614304
    
    14 190.47471711520654
    
    15 183.99716880266308

Which is roughly abou

In [29]:
model

ARIMA(p=12, d=1, q=0, seasonal_order=(0, 0, 0, 0), trend=None, random_state=None, add_encoders=None)