# CASH-4464: Expand evaluation window of ARIMA market trend predictions

### Goal: 

For the evaluation of the predicted rate direction (up, stable, down), expand the evaluation window to better assess how the algorithm performs at different points in time and market conditions.

In [1]:
# Load required packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

sns.set()


# import necessary libraries
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm
from datetime import timedelta

## display df side by side
from IPython.display import HTML

# Supressing warnings
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Format settings: render floats with four decimal places in pandas output.
pd.set_option("display.float_format", lambda x: "%.4f" % x)

### Get TRAC rates from selected lanes V0 

TRAC lanes were originally defined in CASH-4305.

You can extract them either from an S3 file or from a Trino table.

In [3]:
# Copy the selected-lanes TRAC rates CSV from S3 into the current working directory.
! aws s3 cp  s3://loadsmart-data-science/pricing-analysis/CASH-4305/df_trac_selected_lanes_rates.csv .

### Table trino: s3_silver.trac.lanes

Completed 256.0 KiB/22.6 MiB (186.3 KiB/s) with 1 file(s) remaining
Completed 512.0 KiB/22.6 MiB (358.3 KiB/s) with 1 file(s) remaining
Completed 768.0 KiB/22.6 MiB (512.3 KiB/s) with 1 file(s) remaining
Completed 1.0 MiB/22.6 MiB (657.0 KiB/s) with 1 file(s) remaining  
Completed 1.2 MiB/22.6 MiB (777.2 KiB/s) with 1 file(s) remaining  
Completed 1.5 MiB/22.6 MiB (885.1 KiB/s) with 1 file(s) remaining  
Completed 1.8 MiB/22.6 MiB (1.0 MiB/s) with 1 file(s) remaining    
Completed 2.0 MiB/22.6 MiB (1.1 MiB/s) with 1 file(s) remaining    
Completed 2.2 MiB/22.6 MiB (1.2 MiB/s) with 1 file(s) remaining    
Completed 2.5 MiB/22.6 MiB (1.3 MiB/s) with 1 file(s) remaining    
Completed 2.8 MiB/22.6 MiB (1.3 MiB/s) with 1 file(s) remaining    
Completed 3.0 MiB/22.6 MiB (1.4 MiB/s) with 1 file(s) remaining    
Completed 3.2 MiB/22.6 MiB (1.5 MiB/s) with 1 file(s) remaining    
Completed 3.5 MiB/22.6 MiB (1.6 MiB/s) with 1 file(s) remaining    
Completed 3.8 MiB/22.6 MiB (1.7 MiB/s) with 1 fi

In [4]:
## Load the selected Dry Van lanes previously defined, indexed by timestamp
df_trac = pd.read_csv("df_trac_selected_lanes_rates.csv").set_index("data_timestamp")

## Fill in missing data: carry the last known value forward, then back-fill
## whatever is still missing at the start of a column.
## `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` spelling
## and produce the same result.
df_trac = df_trac.ffill().bfill()

## Select a few lanes to run ARIMA:

#### The chosen lanes are among the SG top lanes previously identified (CASH-4373)

In [5]:
# Lanes (columns of df_trac) that the ARIMA models are fitted on
selected_lanes = [
    "442-286",
    "410-601",
    "917-841",
    "770-300",
    "432-296",
    "301-945",
    "231-194",
    "372-282",
    "780-770",
    "303-752",
]
df_arima = df_trac[selected_lanes]
df_arima.index = pd.to_datetime(df_arima.index)

# Keep only the rows whose timestamp falls on a Friday
is_friday = df_arima.index.day_name() == "Friday"
df_arima_f = df_arima[is_friday]

## ARIMA Modelling:

### Let's loop ARIMA for all selected lanes

In [6]:
def get_forecast_lane(data, n_periods):
    """Fit one auto-ARIMA model per lane and collect forecasts and diagnostics.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per lane, indexed by observation date. The forecast index
        is generated with ``freq="W-FRI"``, so the rows are expected to be
        weekly Friday observations.
    n_periods : int
        Number of steps to forecast after the last row of ``data``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_fc, df_lower, df_upper, df_aic, df_fitted, df_order)``, each with
        one column per lane:

        * ``df_fc`` / ``df_lower`` / ``df_upper`` -- point forecast and the
          lower / upper confidence bounds, indexed by forecast date;
        * ``df_aic`` -- a single row labelled ``"aic"`` with each model's AIC;
        * ``df_fitted`` -- in-sample fitted values indexed like ``data`` with
          the first row removed;
        * ``df_order`` -- rows ``"p"``, ``"d"``, ``"q"`` with the selected order.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is no last date to forecast from).
    """
    if len(data.index) == 0:
        raise ValueError("data must contain at least one observation")

    # Forecast dates are derived from the frame that was passed in, so the
    # function does not depend on any notebook-level variable.
    index_of_fc = pd.date_range(
        pd.to_datetime(data.index[-1]) + timedelta(days=6),
        periods=n_periods,
        freq="W-FRI",
    )

    # Result frames are created once, up front, so they exist even when
    # `data` has no columns. `df_aic` and `df_order` get an explicit index:
    # assigning a scalar to an index-less frame would yield an empty column.
    df_fc = pd.DataFrame(index=index_of_fc)
    df_lower = pd.DataFrame(index=index_of_fc)
    df_upper = pd.DataFrame(index=index_of_fc)
    df_aic = pd.DataFrame(index=["aic"])
    df_order = pd.DataFrame(index=["p", "d", "q"])  # ARIMA hyperparameters
    df_fitted = pd.DataFrame(index=data.index)

    for lane in data.columns:
        # Stepwise auto-ARIMA search with p, q <= 3; `d` is chosen by the ADF test.
        model = pm.auto_arima(
            data[lane],
            max_p=3,
            max_q=3,  # maximum p and q
            test="adf",  # use adftest to find optimal 'd'
            d=None,  # let model determine 'd'
            trace=False,  # Whether to print status on the fits
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True,
        )

        # Forecast and confidence intervals for n_periods into the future
        fc, confint = model.predict(n_periods=n_periods, return_conf_int=True)
        confint = np.asarray(confint)

        # Values are stored by position (np.asarray) rather than by label:
        # if the model hands back a Series with its own index, label
        # alignment against the date index could silently produce NaN.
        df_fc[lane] = np.asarray(fc)
        df_lower[lane] = confint[:, 0]
        df_upper[lane] = confint[:, 1]
        df_aic[lane] = model.aic()
        df_order[lane] = list(model.order)
        # Fitted values are one-step-ahead in-sample predictions
        # (NOTE(review): assumes one fitted value per row of `data` -- confirm).
        df_fitted[lane] = np.asarray(model.fittedvalues())

    # Drop the first fitted row: it has no preceding observation, so the
    # recursive ARIMA process yields no usable value there.
    df_fitted = df_fitted.iloc[1:]

    return df_fc, df_lower, df_upper, df_aic, df_fitted, df_order

In [7]:
# Fit one model per lane on the Friday series and forecast a single period ahead.
df_fc, df_lower, df_upper, df_aic, df_fitted, df_order = get_forecast_lane(
    data=df_arima_f, n_periods=1
)

### Best model selection based on AIC Criteria

In [8]:
# Show the (p, d, q) order selected for each lane.
df_order

Unnamed: 0,442-286,410-601,917-841,770-300,432-296,301-945,231-194,372-282,780-770,303-752
p,1,0,0,2,1,2,2,2,3,3
d,1,1,1,1,1,1,1,1,1,1
q,2,2,2,0,0,3,2,1,0,3


## Evaluate market trends

In [9]:
### Weekly change
def weekly_change(df):
    """Return the row-over-row difference of ``df`` without its first row.

    The first row of a difference has no predecessor and is therefore all
    NaN, so it is left out of the result.
    """
    return df.diff().iloc[1:]


### Set market directional trends rule
def market_trend(x, threshold=0.015):
    """Classify a rate change as ``"up"``, ``"down"`` or ``"stable"``.

    Parameters
    ----------
    x : float
        Change in rate between two consecutive observations.
    threshold : float, optional
        Half-width of the "stable" band. Changes of at least ``threshold``
        are ``"up"``, changes of at most ``-threshold`` are ``"down"``.
        Defaults to 0.015, the value this rule was originally written with.

    Returns
    -------
    str
        ``"up"``, ``"down"`` or ``"stable"``.

    Notes
    -----
    NaN compares false against both bounds and is therefore labelled
    ``"stable"``; filter missing values beforehand if that is not wanted.
    """
    if x >= threshold:
        return "up"
    if x <= -threshold:
        return "down"
    return "stable"

In [10]:
# Get weekly changes: difference between consecutive rows of the Friday series
df_delta = weekly_change(df_arima_f)

# Apply market trend rules element-wise to label every change
df_delta_trend = df_delta.applymap(market_trend)

### 2. In-sample TRAC-fitted values cross check 

Note that the actual weekly trend (TRAC) is compared with the trend implied by the forecasted/fitted values (t+1).

That is, to verify whether the predicted values generated the correct market trend, I take the difference between the forecasted value (t+1) and the actual value at t to obtain the predicted market direction.

In [11]:
### Trend implied by the model: fitted value (t+1) minus actual TRAC value (t)
previous_actual = df_arima_f.shift(1)
# The first row has no previous actual value, so it is left out
df_trac_fitted = (df_fitted - previous_actual).iloc[1:]

In [12]:
# Apply market trend rules element-wise to the model-implied changes
df_trac_fitted_trend = df_trac_fitted.applymap(market_trend)
# Show the five most recent rows
df_trac_fitted_trend.tail(5)

Unnamed: 0_level_0,442-286,410-601,917-841,770-300,432-296,301-945,231-194,372-282,780-770,303-752
data_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-04-21,stable,stable,stable,stable,stable,stable,down,down,stable,down
2023-04-28,stable,down,stable,down,stable,stable,stable,stable,stable,stable
2023-05-05,stable,stable,down,stable,down,stable,up,down,stable,stable
2023-05-12,stable,up,stable,up,down,stable,stable,up,up,up
2023-05-19,down,down,stable,down,down,stable,stable,up,stable,stable


#### Concatenate both DataFrames for further analysis

In [13]:
# Place the actual (TRAC) and model (ARIMA) trend labels side by side,
# each group under its own top-level column key.
df_all = pd.concat(
    [df_delta_trend, df_trac_fitted_trend],
    axis="columns",
    keys=["TRAC", "ARIMA"],
)

In [14]:
# Swap the two column levels so the lane becomes the outer level and the
# TRAC/ARIMA key the inner one.
df_final = df_all.swaplevel(axis="columns")
# Sort the columns so each lane's ARIMA and TRAC columns sit next to each other.
df_final = df_final[(sorted(df_final.columns))].reset_index()
df_final.set_index("data_timestamp", inplace=True)
df_final

Unnamed: 0_level_0,231-194,231-194,301-945,301-945,303-752,303-752,372-282,372-282,410-601,410-601,432-296,432-296,442-286,442-286,770-300,770-300,780-770,780-770,917-841,917-841
Unnamed: 0_level_1,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC
data_timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2019-01-11,stable,down,stable,down,stable,up,stable,down,stable,stable,stable,down,stable,down,stable,stable,stable,up,stable,down
2019-01-18,stable,down,stable,up,stable,down,down,down,stable,down,down,down,down,down,stable,down,stable,down,stable,down
2019-01-25,down,down,stable,down,down,down,down,down,down,stable,down,down,down,down,stable,down,stable,down,down,down
2019-02-01,down,down,stable,down,down,stable,down,down,stable,down,down,down,down,down,down,down,down,down,down,down
2019-02-08,stable,down,stable,stable,stable,down,down,up,stable,down,down,down,down,down,down,down,up,down,down,stable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-21,down,stable,stable,down,down,stable,down,down,stable,down,stable,down,stable,down,stable,down,stable,down,stable,stable
2023-04-28,stable,up,stable,stable,stable,down,stable,down,down,up,stable,down,stable,stable,down,down,stable,stable,stable,down
2023-05-05,up,up,stable,stable,stable,up,down,up,stable,up,down,down,stable,down,stable,up,stable,up,down,down
2023-05-12,stable,down,stable,stable,up,stable,up,down,up,down,down,down,stable,down,up,down,up,up,stable,up


## Confusion Matrix analysis

#### All data sample

In [15]:
### Add a "_f" suffix to the fitted-trend columns to better visualize results
### in the confusion matrix. Columns that already carry the suffix are left
### alone, so re-running this cell does not produce "..._f_f" names.
df_trac_fitted_trend.columns = [
    str(col) if str(col).endswith("_f") else f"{col}_f"
    for col in df_trac_fitted_trend.columns
]

To better assess the market prediction results, a confusion matrix was built. It shows how many predictions are correct and incorrect per class (market trend).

It helps identify which classes the model confuses with one another.

In [16]:
def conf_matrix(df1, lane1, df2, lane2, lane_name):
    """Summarise how two trend-label columns agree, as shares of all rows.

    ``df1[lane1]`` supplies the rows of the cross-tabulation and
    ``df2[lane2]`` the columns; every cell is normalised by the total count.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames holding the labels ``"down"``, ``"stable"`` and ``"up"``.
    lane1, lane2 : str
        Column to read from ``df1`` and ``df2`` respectively.
    lane_name : str
        Name of the single column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One column named ``lane_name`` with four rows, each rounded to two
        decimals:

        * ``"Trace"`` -- both labels identical (diagonal);
        * ``"Stable_Trend"`` -- row label stable, column label up or down;
        * ``"Trend_Stable"`` -- row label up or down, column label stable;
        * ``"False Positives"`` -- opposite directions (up vs. down).
    """
    labels = ["down", "stable", "up"]
    # Force a full 3x3 table in a fixed order. A class that never occurs in
    # the selected window would otherwise be missing from the crosstab and
    # shift (or break) every positional lookup below.
    df_ct = pd.crosstab(df1[lane1], df2[lane2], normalize="all").reindex(
        index=labels, columns=labels, fill_value=0.0
    )
    ct = df_ct.to_numpy(dtype=float)
    down, stable, up = 0, 1, 2

    # Calculate scenarios based on matrix element position
    trace = np.round(np.trace(ct), 2)
    stable_trend = np.round(ct[stable, down] + ct[stable, up], 2)
    trend_stable = np.round(ct[down, stable] + ct[up, stable], 2)
    false_positives = np.round(ct[up, down] + ct[down, up], 2)

    return pd.DataFrame(
        {lane_name: [trace, stable_trend, trend_stable, false_positives]},
        index=["Trace", "Stable_Trend", "Trend_Stable", "False Positives"],
    )

In [17]:
# One agreement summary per lane over the full sample, in the same lane order
# as the original df_cf_1 ... df_cf_10 assignments.
(
    df_cf_1,
    df_cf_2,
    df_cf_3,
    df_cf_4,
    df_cf_5,
    df_cf_6,
    df_cf_7,
    df_cf_8,
    df_cf_9,
    df_cf_10,
) = (
    conf_matrix(df_delta_trend, lane, df_trac_fitted_trend, f"{lane}_f", lane)
    for lane in (
        "442-286",
        "410-601",
        "917-841",
        "770-300",
        "432-296",
        "301-945",
        "231-194",
        "372-282",
        "780-770",
        "303-752",
    )
)

In [18]:
### Display results side by side
def side_by_side(*dfs):
    """Show the given DataFrames next to each other in a single flex row.

    Each frame is rendered with ``to_html()`` inside its own ``<div>`` and
    the combined markup is passed to ``display`` (expected to be provided by
    the notebook environment).
    """
    cells = "".join(
        '<div style="margin-right: 2em">' + df.to_html() + "</div>" for df in dfs
    )
    display(HTML('<div style="display:flex">' + cells + "</div>"))


# Full-sample results, five lanes per row
print("All datapoints")
side_by_side(df_cf_1, df_cf_2, df_cf_3, df_cf_4, df_cf_5)

side_by_side(df_cf_6, df_cf_7, df_cf_8, df_cf_9, df_cf_10)

All datapoints


Unnamed: 0,442-286
Trace,0.49
Stable_Trend,0.07
Trend_Stable,0.29
False Positives,0.15

Unnamed: 0,410-601
Trace,0.48
Stable_Trend,0.09
Trend_Stable,0.28
False Positives,0.15

Unnamed: 0,917-841
Trace,0.47
Stable_Trend,0.08
Trend_Stable,0.25
False Positives,0.19

Unnamed: 0,770-300
Trace,0.39
Stable_Trend,0.07
Trend_Stable,0.48
False Positives,0.07

Unnamed: 0,432-296
Trace,0.61
Stable_Trend,0.09
Trend_Stable,0.16
False Positives,0.14


Unnamed: 0,301-945
Trace,0.39
Stable_Trend,0.08
Trend_Stable,0.46
False Positives,0.07

Unnamed: 0,231-194
Trace,0.4
Stable_Trend,0.07
Trend_Stable,0.32
False Positives,0.2

Unnamed: 0,372-282
Trace,0.42
Stable_Trend,0.07
Trend_Stable,0.32
False Positives,0.18

Unnamed: 0,780-770
Trace,0.31
Stable_Trend,0.07
Trend_Stable,0.43
False Positives,0.18

Unnamed: 0,303-752
Trace,0.47
Stable_Trend,0.11
Trend_Stable,0.34
False Positives,0.08


Preliminary findings:

* In roughly 50% of cases, the strategy predicts the market direction correctly.
  
* Regardless of the lane, antagonistic (opposite-direction) predictions occurred in at most 20% of all cases, and in 15% or less for six of the ten lanes $\rightarrow$ Consequently, the probability of committing a serious error is relatively low.
  
* Given the complexity of the market and the source of primary data, the strategy proved to be credible.

## Select different data range to evaluate trends in different economic/freight contexts

### Range 1: 2019-11-22 - 2020-10-02

In [19]:
# Range 1 window; both bounds are inclusive
trend_dates = df_delta_trend.index
mask = (trend_dates >= "2019-11-22") & (trend_dates <= "2020-10-02")
# The same boolean mask selects the matching rows of the actual and fitted trends
df_delta_trend_range_1 = df_delta_trend.loc[mask]
df_trac_fitted_trend_range_1 = df_trac_fitted_trend.loc[mask]

In [20]:
### Calculate confusion matrix

# One agreement summary per lane, restricted to the Range 1 window
(
    df_cf_1,
    df_cf_2,
    df_cf_3,
    df_cf_4,
    df_cf_5,
    df_cf_6,
    df_cf_7,
    df_cf_8,
    df_cf_9,
    df_cf_10,
) = (
    conf_matrix(
        df_delta_trend_range_1,
        lane,
        df_trac_fitted_trend_range_1,
        f"{lane}_f",
        lane,
    )
    for lane in (
        "442-286",
        "410-601",
        "917-841",
        "770-300",
        "432-296",
        "301-945",
        "231-194",
        "372-282",
        "780-770",
        "303-752",
    )
)

### Range 2: 2021-12-01 - 2022-12-15

In [21]:
# Range 2 window: 2021-12-01 to 2022-12-15, both bounds inclusive
trend_dates_2 = df_delta_trend.index
mask_2 = (trend_dates_2 >= "2021-12-01") & (trend_dates_2 <= "2022-12-15")
# The same boolean mask selects the matching rows of the actual and fitted trends
df_delta_trend_range_2 = df_delta_trend.loc[mask_2]
df_trac_fitted_trend_range_2 = df_trac_fitted_trend.loc[mask_2]

In [22]:
### Calculate confusion matrix

# One agreement summary per lane, restricted to the Range 2 window
(
    df_cf_1_2,
    df_cf_2_2,
    df_cf_3_2,
    df_cf_4_2,
    df_cf_5_2,
    df_cf_6_2,
    df_cf_7_2,
    df_cf_8_2,
    df_cf_9_2,
    df_cf_10_2,
) = (
    conf_matrix(
        df_delta_trend_range_2,
        lane,
        df_trac_fitted_trend_range_2,
        f"{lane}_f",
        lane,
    )
    for lane in (
        "442-286",
        "410-601",
        "917-841",
        "770-300",
        "432-296",
        "301-945",
        "231-194",
        "372-282",
        "780-770",
        "303-752",
    )
)

### Range 3: 2022-12-23 - 2023-03-31

In [23]:
# Range 3 window: 2022-12-23 to 2023-03-31, both bounds inclusive
trend_dates_3 = df_delta_trend.index
mask_3 = (trend_dates_3 >= "2022-12-23") & (trend_dates_3 <= "2023-03-31")
# The same boolean mask selects the matching rows of the actual and fitted trends
df_delta_trend_range_3 = df_delta_trend.loc[mask_3]
df_trac_fitted_trend_range_3 = df_trac_fitted_trend.loc[mask_3]

In [24]:
### Calculate confusion matrix

# One agreement summary per lane, restricted to the Range 3 window
(
    df_cf_1_3,
    df_cf_2_3,
    df_cf_3_3,
    df_cf_4_3,
    df_cf_5_3,
    df_cf_6_3,
    df_cf_7_3,
    df_cf_8_3,
    df_cf_9_3,
    df_cf_10_3,
) = (
    conf_matrix(
        df_delta_trend_range_3,
        lane,
        df_trac_fitted_trend_range_3,
        f"{lane}_f",
        lane,
    )
    for lane in (
        "442-286",
        "410-601",
        "917-841",
        "770-300",
        "432-296",
        "301-945",
        "231-194",
        "372-282",
        "780-770",
        "303-752",
    )
)

In [25]:
# Results per evaluation window, five lanes per row.
# The printed labels state the date bounds actually used by the masks that
# built each range (mask, mask_2, mask_3).
print("\nRANGE 1: 2019-11-22 - 2020-10-02")
side_by_side(df_cf_1, df_cf_2, df_cf_3, df_cf_4, df_cf_5)
side_by_side(df_cf_6, df_cf_7, df_cf_8, df_cf_9, df_cf_10)

print("\nRANGE 2: 2021-12-01 - 2022-12-15")
side_by_side(df_cf_1_2, df_cf_2_2, df_cf_3_2, df_cf_4_2, df_cf_5_2)
side_by_side(df_cf_6_2, df_cf_7_2, df_cf_8_2, df_cf_9_2, df_cf_10_2)

print("\nRANGE 3: 2022-12-23 - 2023-03-31")
side_by_side(df_cf_1_3, df_cf_2_3, df_cf_3_3, df_cf_4_3, df_cf_5_3)
side_by_side(df_cf_6_3, df_cf_7_3, df_cf_8_3, df_cf_9_3, df_cf_10_3)


RANGE 1: 2019-11-22 - 2020-10-02


Unnamed: 0,442-286
Trace,0.52
Stable_Trend,0.04
Trend_Stable,0.26
False Positives,0.17

Unnamed: 0,410-601
Trace,0.65
Stable_Trend,0.02
Trend_Stable,0.2
False Positives,0.13

Unnamed: 0,917-841
Trace,0.59
Stable_Trend,0.04
Trend_Stable,0.15
False Positives,0.22

Unnamed: 0,770-300
Trace,0.54
Stable_Trend,0.04
Trend_Stable,0.37
False Positives,0.04

Unnamed: 0,432-296
Trace,0.8
Stable_Trend,0.04
Trend_Stable,0.07
False Positives,0.09


Unnamed: 0,301-945
Trace,0.35
Stable_Trend,0.13
Trend_Stable,0.43
False Positives,0.09

Unnamed: 0,231-194
Trace,0.39
Stable_Trend,0.07
Trend_Stable,0.33
False Positives,0.22

Unnamed: 0,372-282
Trace,0.63
Stable_Trend,0.07
Trend_Stable,0.26
False Positives,0.04

Unnamed: 0,780-770
Trace,0.39
Stable_Trend,0.11
Trend_Stable,0.26
False Positives,0.24

Unnamed: 0,303-752
Trace,0.65
Stable_Trend,0.04
Trend_Stable,0.28
False Positives,0.02



RANGE 2: 2021-12-31 - 2022-08-01


Unnamed: 0,442-286
Trace,0.61
Stable_Trend,0.02
Trend_Stable,0.19
False Positives,0.19

Unnamed: 0,410-601
Trace,0.54
Stable_Trend,0.09
Trend_Stable,0.22
False Positives,0.15

Unnamed: 0,917-841
Trace,0.48
Stable_Trend,0.06
Trend_Stable,0.3
False Positives,0.17

Unnamed: 0,770-300
Trace,0.44
Stable_Trend,0.06
Trend_Stable,0.44
False Positives,0.06

Unnamed: 0,432-296
Trace,0.48
Stable_Trend,0.11
Trend_Stable,0.19
False Positives,0.22


Unnamed: 0,301-945
Trace,0.39
Stable_Trend,0.09
Trend_Stable,0.43
False Positives,0.09

Unnamed: 0,231-194
Trace,0.43
Stable_Trend,0.07
Trend_Stable,0.26
False Positives,0.24

Unnamed: 0,372-282
Trace,0.43
Stable_Trend,0.02
Trend_Stable,0.31
False Positives,0.24

Unnamed: 0,780-770
Trace,0.24
Stable_Trend,0.09
Trend_Stable,0.52
False Positives,0.15

Unnamed: 0,303-752
Trace,0.48
Stable_Trend,0.15
Trend_Stable,0.26
False Positives,0.11



RANGE 3: 2022-12-23 - 2023-02-31


Unnamed: 0,442-286
Trace,0.6
Stable_Trend,0.13
Trend_Stable,0.13
False Positives,0.13

Unnamed: 0,410-601
Trace,0.67
Stable_Trend,0.0
Trend_Stable,0.2
False Positives,0.13

Unnamed: 0,917-841
Trace,0.4
Stable_Trend,0.07
Trend_Stable,0.2
False Positives,0.33

Unnamed: 0,770-300
Trace,0.4
Stable_Trend,0.13
Trend_Stable,0.47
False Positives,0.0

Unnamed: 0,432-296
Trace,0.73
Stable_Trend,0.07
Trend_Stable,0.13
False Positives,0.07


Unnamed: 0,301-945
Trace,0.4
Stable_Trend,0.0
Trend_Stable,0.53
False Positives,0.07

Unnamed: 0,231-194
Trace,0.27
Stable_Trend,0.13
Trend_Stable,0.53
False Positives,0.07

Unnamed: 0,372-282
Trace,0.47
Stable_Trend,0.07
Trend_Stable,0.33
False Positives,0.13

Unnamed: 0,780-770
Trace,0.33
Stable_Trend,0.07
Trend_Stable,0.47
False Positives,0.13

Unnamed: 0,303-752
Trace,0.6
Stable_Trend,0.07
Trend_Stable,0.27
False Positives,0.07


Preliminary findings:

* It is evident that the economic environment affects the distribution of market classes.
  
* In different scenarios, the probability of the model incurring major errors remained low.
  
* The degree of uncertainty/risk in the forecasted projections is influenced by the context of each lane. Lanes with a low trace value may be affected by features intrinsic to the price series (number of providers, dispersion).

## Weekly analysis

But remember, we are going to evaluate on a weekly basis, so let's assess how good the forecasted values are on a weekly basis.

Randomly, we have chosen 6 datapoints to evaluate:

* 2019-09-13
* 2020-07-10
* 2021-08-27
* 2022-04-22
* 2023-03-17
* 2023-05-19


In [26]:
# Dates of the weeks picked for the week-level comparison
selected_weeks = [
    "2019-09-13",
    "2020-07-10",
    "2021-08-27",
    "2022-04-22",
    "2023-03-17",
    "2023-05-19",
]
df_week = df_final.loc[pd.DatetimeIndex(selected_weeks)]

### Evaluate when ARIMA and TRAC market trends diverge on selected weeks

In [27]:
def highlight_diff(data, color="red"):
    """Build a CSS frame that marks cells differing from their lane's ARIMA value.

    ``data`` has two column levels with ``"ARIMA"`` among the labels of the
    last level. Every cell is compared with the ARIMA column that shares its
    first-level label; cells that differ get ``background-color: <color>``
    and all other cells get an empty string. The result has the same index
    and columns as ``data``.
    """
    css_on = f"background-color: {color}"
    arima_values = data.xs("ARIMA", axis="columns", level=-1)
    # Broadcast the ARIMA columns across the first column level and compare
    differs = data.ne(arima_values, level=0)
    css = np.where(differs, css_on, "")
    return pd.DataFrame(css, index=data.index, columns=data.columns)


# Style the whole table at once (axis=None) so highlight_diff receives the full frame
teste = df_week.style.apply(highlight_diff, axis=None)
teste

Unnamed: 0_level_0,231-194,231-194,301-945,301-945,303-752,303-752,372-282,372-282,410-601,410-601,432-296,432-296,442-286,442-286,770-300,770-300,780-770,780-770,917-841,917-841
Unnamed: 0_level_1,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC,ARIMA,TRAC
2019-09-13 00:00:00,up,down,stable,stable,stable,stable,up,up,up,up,up,up,up,up,down,stable,stable,down,stable,down
2020-07-10 00:00:00,up,up,up,up,up,up,up,up,up,up,up,up,up,up,up,up,down,down,up,up
2021-08-27 00:00:00,stable,down,stable,down,stable,stable,stable,up,stable,up,up,stable,stable,up,stable,stable,stable,stable,down,down
2022-04-22 00:00:00,stable,down,down,down,down,down,down,down,down,down,down,down,down,down,down,down,down,up,stable,down
2023-03-17 00:00:00,down,stable,up,down,stable,stable,down,down,down,down,down,down,down,down,stable,up,stable,down,stable,up
2023-05-19 00:00:00,stable,stable,stable,stable,stable,up,up,up,down,stable,down,up,down,stable,down,stable,stable,down,stable,up


### Final conclusion:

* ARIMA strategy captures the patterns and trends of the data using a combination of past values (AR), differences (I), and errors (MA);
  
* The probability of ARIMA displaying a serious error is relatively low.

* Market conditions are correlated with the performance of predicted trends:
  * In a downtrend scenario (Range 2), volatility increases rapidly, which may explain the increase in prediction errors observed there.