## Setup

In [118]:
import pandas as pd
import matplotlib.pyplot as plt

In [119]:
# Show every column when a frame is rendered (no column truncation), since the
# frames used here are wide and are meant to be scrolled horizontally
pd.options.display.max_columns = None

In [120]:
# Cleaned transaction-level data (read with the default integer index)
df = pd.read_csv('data/cleaned_data.csv')

# Train / test splits and the three forecast outputs (cluster optimizer,
# brute force, reused optimized model); each file stores its index in the
# first column, so all five are read the same way
df_train, df_test, df_forecast, df_brute_force, df_reused = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/forecast_cluster_optimizer.csv",
        "data/forecast_brute_force.csv",
        "data/forecast_reused_optimized_model.csv",
    )
)

In [121]:
# Sanity check: print (rows, columns) of the loaded frames, one per line
print(*(frame.shape for frame in (df, df_test, df_forecast, df_train)), sep="\n")

(818375, 22)
(5600, 3)
(5504, 3)
(93800, 3)


In [122]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched
data = df.copy(deep=True)

In [123]:
# Confirm the working copy has the same dimensions as the loaded frame
data.shape

(818375, 22)

In [124]:
# Convert the posting date column from text (YYYY-MM-DD) to datetime values
data = data.assign(
    DimPostingDateId=lambda frame: pd.to_datetime(frame["DimPostingDateId"], format="%Y-%m-%d")
)

## Create product info table

In [125]:
# Keep only the product-related features, one row per distinct combination,
# renumbered from 0
product_info = (
    data[['ItemNo2', 'Color', 'Gender', 'Category', 'SUB Category', 'SUB Category2', 'Quality', 'Origin Country']]
    .drop_duplicates()
    .reset_index(drop=True)
)
product_info

Unnamed: 0,ItemNo2,Color,Gender,Category,SUB Category,SUB Category2,Quality,Origin Country
0,F15404308_CLR000021,Black,WOMENSWEAR,Trousers,Trousers,Default,Woven,China
1,M00012003_CLR000508,White mel,MENSWEAR,Tops,Crew neck,Short sleeve,Jersey,Turkey
2,F19123672_CLR000021,Black,WOMENSWEAR,Shirts,Shirts,Short sleeve,Woven,China
3,F19123672_CLR000289,Clear Cream,WOMENSWEAR,Shirts,Shirts,Short sleeve,Woven,China
4,M00012307_CLR000650,Blue Iris Mel,MENSWEAR,Tops,V-neck,Short sleeve,Jersey,Turkey
...,...,...,...,...,...,...,...,...
695,M24400088_190414TCX,FOREST NIGHT,MENSWEAR,Shirts,Shirts,Default,Woven,China
696,F24200090_CLR000021,Black,WOMENSWEAR,Skirts,Maxi,Default,Woven,China
697,M24300006_CLR000589,Black Blizzard,MENSWEAR,Jeans,Jeans,Default,Denim,Turkey
698,F24400186_CLR000021,Black,WOMENSWEAR,Knitwear,Cardigan,Long sleeve,Knit,China


## Sample by week

In [126]:
# Aggregate to one row per product and week: total Quantity per weekly bin,
# with the (ItemNo2, DimPostingDateId) index turned back into columns
data_id_time_target = (
    data
    .set_index("DimPostingDateId")
    .groupby('ItemNo2')
    .resample('W')['Quantity']
    .sum()
    .reset_index()
)
data_id_time_target

Unnamed: 0,ItemNo2,DimPostingDateId,Quantity
0,F00001015_CLR000021,2022-03-20,66.0
1,F00001015_CLR000021,2022-03-27,92.0
2,F00001015_CLR000021,2022-04-03,90.0
3,F00001015_CLR000021,2022-04-10,70.0
4,F00001015_CLR000021,2022-04-17,78.0
...,...,...,...
56821,S00029_CLR001336,2024-08-11,0.0
56822,S00029_CLR001336,2024-08-18,4.0
56823,S00029_CLR001336,2024-08-25,2.0
56824,S00029_CLR001336,2024-09-01,1.0


## Calc MAE for naive forecast

In [127]:
def calc_mae_by_product_id(product_id, df, horizon=8):
    """Return the MAE of a naive "repeat the previous window" forecast for one product.

    The last ``horizon`` rows of the product are treated as the actual values
    and the ``horizon`` rows immediately before them as the naive forecast; the
    two windows are compared position by position.

    NOTE: a later cell in this notebook defines another function with this same
    name but a different (three-argument) signature. This definition must be
    the one bound to the name when the naive-MAE cell is executed.

    Parameters
    ----------
    product_id
        Value matched against ``df["ItemNo2"]``.
    df : pandas.DataFrame
        Frame with "ItemNo2" and "Quantity" columns. Rows are used in the order
        they are stored (nothing here sorts them), so they must already be in
        chronological order within each product.
    horizon : int, default 8
        Length of each comparison window. The default reproduces the original
        fixed 8-row windows (16 rows required).

    Returns
    -------
    float or None
        Mean absolute difference between the two windows, or ``None`` when the
        product has fewer than ``2 * horizon`` rows.

    Raises
    ------
    ValueError
        If ``horizon`` is not a positive number.
    """
    if horizon < 1:
        # A zero horizon would make ``iloc[-0:]`` select every row
        raise ValueError("horizon must be a positive integer")

    # Only the target column is needed for the comparison
    quantities = df.loc[df["ItemNo2"] == product_id, "Quantity"]

    # Not enough history for two full windows
    if len(quantities) < 2 * horizon:
        return None

    # Fresh 0..horizon-1 indexes so the subtraction aligns by position
    target = quantities.iloc[-horizon:].reset_index(drop=True)
    forecast = quantities.iloc[-2 * horizon:-horizon].reset_index(drop=True)

    return (target - forecast).abs().mean()


In [128]:
# Compute the naive-forecast MAE for every product and keep it as a new column.
# This needs the two-argument calc_mae_by_product_id; a later cell rebinds that
# name to a three-argument version.
product_info["naive_mae"] = product_info["ItemNo2"].apply(
    lambda item_no: calc_mae_by_product_id(item_no, data_id_time_target)
)

## Calc MAE for forecasts

In [129]:
# Debug helper: quickly review the rows of a single product in each data set
def quick_check_by_product_id(unique_id):
    """Print the rows matching ``unique_id`` from each notebook-level frame.

    Reads the global frames ``df_train``, ``df_test``, ``df_forecast`` and
    ``df_brute_force``, filters each on its "unique_id" column, and prints the
    matches (re-indexed from 0) under a label naming the frame.
    """
    labelled_frames = (
        ("df_train:\n", df_train),
        ("\ndf_test:\n", df_test),
        ("\ndf_forecast:\n", df_forecast),
        ("\ndf_brute_force:\n", df_brute_force),
    )
    for label, frame in labelled_frames:
        matches = frame.loc[frame["unique_id"] == unique_id]
        print(label, matches.reset_index(drop=True))

In [130]:
def calc_mae_by_product_id(unique_id, df_forecast, df_test):
    """Return the mean absolute error of one product's forecast against its test rows.

    Rows are matched on the "unique_id" column in both frames and compared by
    position (both selections are re-indexed from 0), not by any date column.
    Positions present in only one of the two selections do not contribute to
    the mean.

    Parameters
    ----------
    unique_id
        Value matched against the "unique_id" column of both frames.
    df_forecast : pandas.DataFrame
        Frame with "unique_id" and "forecast" columns.
    df_test : pandas.DataFrame
        Frame with "unique_id" and "y" columns.

    Returns
    -------
    float or None
        The MAE, or ``None`` when ``df_forecast`` has no rows for the product.
    """
    predicted = df_forecast[df_forecast["unique_id"] == unique_id].reset_index(drop=True)
    # Nothing was forecast for this product
    if predicted.empty:
        return None

    actual = df_test[df_test["unique_id"] == unique_id].reset_index(drop=True)
    return (actual["y"] - predicted["forecast"]).abs().mean()

In [131]:
# Score every forecast variant against the same test set and store each
# per-product MAE in its own column
for mae_column, forecast_frame in (
    ("forecast_mae", df_forecast),
    ("brute_force_mae", df_brute_force),
    ("reused_model_mae", df_reused),
):
    product_info[mae_column] = product_info["ItemNo2"].apply(
        calc_mae_by_product_id, args=(forecast_frame, df_test)
    )

In [132]:
# Preview the first rows, now including the per-method MAE columns
product_info.head(20)

Unnamed: 0,ItemNo2,Color,Gender,Category,SUB Category,SUB Category2,Quality,Origin Country,naive_mae,forecast_mae,brute_force_mae,reused_model_mae
0,F15404308_CLR000021,Black,WOMENSWEAR,Trousers,Trousers,Default,Woven,China,38.625,36.0,31.0,38.66756
1,M00012003_CLR000508,White mel,MENSWEAR,Tops,Crew neck,Short sleeve,Jersey,Turkey,26.75,12.5,,16.19189
2,F19123672_CLR000021,Black,WOMENSWEAR,Shirts,Shirts,Short sleeve,Woven,China,65.75,49.25,20.5,64.44114
3,F19123672_CLR000289,Clear Cream,WOMENSWEAR,Shirts,Shirts,Short sleeve,Woven,China,66.25,57.5,24.0,73.94366
4,M00012307_CLR000650,Blue Iris Mel,MENSWEAR,Tops,V-neck,Short sleeve,Jersey,Turkey,1.375,0.0,,0.2940724
5,F15301618_CLR000289,Clear Cream,WOMENSWEAR,Tops,Tank,Sleeveless,Jersey,China,11.875,10.25,10.25,10.31328
6,F00013006_CLR999254,3173 BLUE STRIPE,WOMENSWEAR,Tops,Tank,Long sleeve,Jersey,Turkey,1.5,5.875,0.0,5.840119
7,M20300010_161103TCX,Pure Cashmere,MENSWEAR,Tops,Crew neck,Short sleeve,Jersey,Turkey,28.375,23.5,,23.83168
8,F00012030_CLR000021,Black,WOMENSWEAR,Tops,V-neck,Short sleeve,Woven,China,4.625,0.0,0.0,0.0
9,M20300010_CLR000337,Grey mel,MENSWEAR,Tops,Crew neck,Short sleeve,Jersey,Turkey,59.875,0.0,,3.550922e-20


## Result interpretation


In [133]:
# Load the three result tables used to compare model choices
clustered_df, clustered_products, brute_force_stats = map(
    pd.read_csv,
    (
        "./data/clusters_model.csv",
        "./data/clustered_products.csv",
        "./data/brute_force_forecast_stats.csv",
    ),
)

In [134]:
# Join products to their brute-force stats (on unique_id), then to the cluster
# table (on cluster). The two colliding "model" columns come out as
# model_x / model_y and are renamed to say which side they came from.
model_analysis = (
    clustered_products
    .merge(brute_force_stats, on="unique_id")
    .merge(clustered_df, on="cluster")
    .loc[:, ["unique_id", "model_x", "model_y"]]
    .rename(columns={"model_x": "brute_force_model", "model_y": "cluster_model"})
)
model_analysis.head()

Unnamed: 0,unique_id,brute_force_model,cluster_model
0,F00001015_CLR000021,RandomForestRegressor,RandomForestRegressor
1,F00001015_CLR000023,RandomForestRegressor,RandomForestRegressor
2,F00001111_151304TCX,RandomForestRegressor,RandomForestRegressor
3,F00001111_193924TPX,XGBoost,RandomForestRegressor
4,F00001111_CLR000021,SARIMA,RandomForestRegressor


In [135]:
# Products whose brute-force model differs from their cluster model
model_mismatch = model_analysis["brute_force_model"].ne(model_analysis["cluster_model"])
different_model_counts = model_analysis.loc[model_mismatch]
print(f"{len(different_model_counts)} products out of {len(model_analysis)} have different models between brute force and cluster models")

differences = model_analysis.loc[model_mismatch]

# Number of products for each (cluster model, brute-force model) pair
change_counts = (
    differences
    .groupby(['cluster_model', 'brute_force_model'])
    .size()
    .reset_index(name='count')
)

# Plain tuples come back in column order: cluster_model, brute_force_model, count
for cluster_model, brute_force_model, times in change_counts.itertuples(index=False, name=None):
    print(f"{cluster_model} was changed to {brute_force_model} {times} times")

260 products out of 362 have different models between brute force and cluster models
ARIMA was changed to RandomForestRegressor 24 times
ARIMA was changed to SARIMA 28 times
ARIMA was changed to XGBoost 9 times
RandomForestRegressor was changed to ARIMA 3 times
RandomForestRegressor was changed to SARIMA 62 times
RandomForestRegressor was changed to XGBoost 54 times
SARIMA was changed to ARIMA 3 times
SARIMA was changed to RandomForestRegressor 58 times
SARIMA was changed to XGBoost 19 times


In [136]:
# Report how many products have no brute-force MAE, then summarise each MAE
# column by its mean and standard deviation across products
print("Missing brute_force_mae:", product_info["brute_force_mae"].isna().sum())
product_info.loc[:, ['naive_mae', 'forecast_mae', 'brute_force_mae', 'reused_model_mae']].agg(['mean', 'std'])

Missing brute_force_mae: 338


Unnamed: 0,naive_mae,forecast_mae,brute_force_mae,reused_model_mae
mean,25.271976,13.490552,11.13605,15.266212
std,33.518968,24.020261,18.69984,25.981034


### Explanation of Results

The comparison of the mean and standard deviation of `naive_mae` and `forecast_mae` (each computed across products) shows that our forecast model is more accurate than the naive forecast, and that its accuracy varies less from product to product:

1. **Mean (Average Error):**  
   The mean MAE of the forecast model (13.49) is noticeably lower than that of the naive forecast (25.27), indicating that the forecast model produces smaller errors on average and is therefore more accurate.

2. **Standard Deviation (Error Consistency):**  
   The standard deviation of the per-product MAE is lower for the forecast model (24.02) than for the naive forecast (33.52), suggesting that the forecast model's accuracy is more consistent across products and less prone to large deviations.

### Conclusion

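These results indicate that the forecast model is effectively learning and leveraging patterns in the data to provide better forecasts than the naive approach, both in terms of accuracy and reliability. This highlights the model's ability to outperform a baseline method and underscores its value in improving forecast quality. Note that each column's mean and standard deviation are computed only over the products for which that MAE is available (for example, `brute_force_mae` is missing for 338 products), so the columns are not necessarily compared on the same set of products.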