In [1]:
import os

# Switch to the thesis project folder so that relative paths used further
# down (e.g. "data/..." and "final_version/output/...") resolve against it.
project_root = "/Users/megan/Thesis"
os.chdir(project_root)
print("Current working directory:", os.getcwd())

Current working directory: /Users/megan/Thesis


In [4]:
import pandas as pd
# Location of the product-group CSV, relative to the current working directory
csv_path = "data/top_10_product_groups.csv"

# Read the file into the DataFrame that the modelling cells iterate over
data = pd.read_csv(filepath_or_buffer=csv_path)

# Last expression of the cell: display the available column names
data.columns

Index(['date', 'product_group', 'transaction_count', 'avg_price',
       'sales_channel', 'unique_customers', 'unique_articles_sold',
       'median_age', 'fashion_news_subscribers', 'first_purchase_days_ago',
       'recent_purchase_days_ago', 'age_bin_10-19', 'age_bin_20-29',
       'age_bin_30-39', 'age_bin_40-49', 'age_bin_50-59', 'age_bin_60+'],
      dtype='object')

In [5]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name

# Output directory
output_dir = "final_version/output/1_day/sarimax"
os.makedirs(output_dir, exist_ok=True)

# A product group is modelled only if its daily series spans at least this many days.
MIN_OBSERVATIONS = 50
# Leading share of each series used for fitting; the remainder is the hold-out set.
TRAIN_FRACTION = 0.8

# Fitted SARIMAX results and evaluation metrics, keyed by product group
sarimax_results = {}
metrics = {}


def _forecast_metrics(actual, predicted):
    """Return MAE, RMSE, MAPE and R2 of ``predicted`` against ``actual`` as a dict.

    The MAPE denominator is floored at 1 so that days with zero actual
    transactions do not cause a division by zero.
    """
    return {
        "MAE": mean_absolute_error(actual, predicted),
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "MAPE": np.mean(np.abs((actual - predicted) / np.maximum(actual, 1))) * 100,
        "R2": r2_score(actual, predicted),
    }


def _save_line_plot(path, title, lines, zero_line=False):
    """Draw ``lines`` on one 12x6 figure, save it to ``path`` and always close it.

    ``lines`` is a sequence of ``(x, y, plot_kwargs)`` tuples. When
    ``zero_line`` is true a dashed red horizontal line is drawn at y=0.
    The figure is closed in a ``finally`` block so that a failing save does
    not leave open figures accumulating across loop iterations.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for x_values, y_values, plot_kwargs in lines:
            ax.plot(x_values, y_values, **plot_kwargs)
        if zero_line:
            ax.axhline(0, linestyle='--', color='r', alpha=0.7)
        ax.legend()
        ax.set_title(title)
        fig.savefig(path)
    finally:
        plt.close(fig)


# Iterate over product groups with a progress bar
for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    # Build the daily series for this group: index by date and insert a row
    # for every calendar day between the first and the last observation.
    product_data = data[data['product_group'] == product_group].set_index('date')
    product_data.index = pd.to_datetime(product_data.index)
    product_data = product_data.asfreq('D')

    # Skip short series before anything is written, so that skipped groups
    # do not leave empty output directories behind.
    if len(product_data) < MIN_OBSERVATIONS:
        continue

    # Fill missing values with zero (adjustable if needed)
    product_data = product_data.fillna(0)

    # Endogenous target and exogenous regressors (every remaining column)
    endog = product_data['transaction_count']
    exog = product_data.drop(columns=['transaction_count', 'product_group'], errors='ignore')

    # Chronological train-test split: no shuffling, the newest days are held out
    split_idx = int(len(endog) * TRAIN_FRACTION)
    train_endog, test_endog = endog.iloc[:split_idx], endog.iloc[split_idx:]
    train_exog, test_exog = exog.iloc[:split_idx], exog.iloc[split_idx:]

    # Create output directory for this product group
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)

    try:
        # Train SARIMAX model; the seasonal period of 7 on daily data
        # corresponds to a weekly cycle.
        model = SARIMAX(
            train_endog,
            exog=train_exog,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 7)
        )
        results = model.fit(disp=False, maxiter=500)

        # NOTE(review): the output path says "1_day", but this call forecasts
        # the whole hold-out span in one go from the end of the training
        # sample - confirm that this is the intended horizon.
        pred = results.get_prediction(start=test_endog.index[0], end=test_endog.index[-1], exog=test_exog)
        pred_mean = np.maximum(pred.predicted_mean, 0)  # Clip predictions at zero

        # Compute, store and save the evaluation metrics for this group
        metrics[product_group] = _forecast_metrics(test_endog, pred_mean)
        metrics_df = pd.DataFrame([metrics[product_group]])
        metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)

        # Keep the fitted results object
        sarimax_results[product_group] = results

        # Plot Predicted vs Actual
        _save_line_plot(
            os.path.join(group_output_dir, f"{sanitized_group}.png"),
            f"Predicted vs Actual for {product_group}",
            [
                (test_endog.index, test_endog, {"label": "Actual", "alpha": 0.7}),
                (test_endog.index, pred_mean, {"label": "Predicted", "alpha": 0.7, "linestyle": '--'}),
            ],
        )

        # Plot Residuals
        residuals = test_endog - pred_mean
        _save_line_plot(
            os.path.join(group_output_dir, f"{sanitized_group}_residuals.png"),
            f"Residuals for {product_group}",
            [(test_endog.index, residuals, {"label": "Residuals", "alpha": 0.7})],
            zero_line=True,
        )

    except Exception as e:
        # Best effort: report the failing group and carry on with the next one
        print(f"Error processing {product_group}: {e}")
        continue

# Save final metrics summary (one row per successfully modelled group)
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

# Compute and save average metrics across all products
avg_metrics = {
    metric_name: np.mean([group_scores[metric_name] for group_scores in metrics.values()])
    for metric_name in ("MAE", "RMSE", "MAPE", "R2")
}

avg_metrics_df = pd.DataFrame([avg_metrics])
avg_metrics_df.to_csv(os.path.join(output_dir, "final_test_avg_metrics.csv"), index=False)

print("\nProcessing completed. Metrics and plots have been saved.")

Processing product groups: 100%|██████████| 10/10 [05:37<00:00, 33.75s/it]


Processing completed. Metrics and plots have been saved.





In [8]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name

# Output directory
output_dir = "final_version/output/2_weeks/sarimax"
os.makedirs(output_dir, exist_ok=True)

# Fitted SARIMAX results and evaluation metrics, keyed by product group
sarimax_results = {}
metrics = {}

# Forecast horizon: number of days forecast and evaluated after the training
# sample. Every slice, date range and plot title below is derived from it.
forecast_horizon = 14  # 14 days ahead

for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    # Build the daily series for this group: index by date and insert a row
    # for every calendar day between the first and the last observation.
    product_data = data[data['product_group'] == product_group].set_index('date')
    product_data.index = pd.to_datetime(product_data.index)
    product_data = product_data.asfreq('D')

    # Ensure sufficient data; checked before anything is written so skipped
    # groups do not leave empty output directories behind.
    if len(product_data) < 50 + forecast_horizon:  # Ensure enough data for lookahead
        continue

    # Fill missing values with zero (adjustable if needed)
    product_data = product_data.fillna(0)

    # Endogenous target and exogenous regressors (every remaining column)
    endog = product_data['transaction_count']
    exog = product_data.drop(columns=['transaction_count', 'product_group'], errors='ignore')

    # Chronological train-test split (80% train, 20% test)
    split_idx = int(len(endog) * 0.8)
    train_endog, test_endog = endog.iloc[:split_idx], endog.iloc[split_idx:]
    train_exog, test_exog = exog.iloc[:split_idx], exog.iloc[split_idx:]

    # The length check above does not guarantee a full horizon in the 20%
    # hold-out (e.g. 64 days -> 13 test days). Skip explicitly instead of
    # failing later with an exog-shape error inside get_prediction.
    if len(test_endog) < forecast_horizon:
        print(
            f"Skipping {product_group}: only {len(test_endog)} hold-out days "
            f"for a {forecast_horizon}-day forecast"
        )
        continue

    # Evaluation window: the first `forecast_horizon` hold-out days, selected
    # by position with .iloc rather than implicit integer slicing.
    actual = test_endog.iloc[:forecast_horizon]
    exog_forecast = test_exog.iloc[:forecast_horizon] if not test_exog.empty else None

    # Create output directory for this product group
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)

    try:
        # Train SARIMAX model; the seasonal period of 7 on daily data
        # corresponds to a weekly cycle.
        model = SARIMAX(
            train_endog,
            exog=train_exog,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 7)
        )
        results = model.fit(disp=False, maxiter=500)

        # Direct multi-step forecast over the evaluation window (no rolling
        # re-estimation), clipped at zero because counts cannot be negative.
        pred = results.get_prediction(start=actual.index[0], end=actual.index[-1], exog=exog_forecast)
        pred_mean = np.maximum(pred.predicted_mean, 0)  # Clip negative values

        # Compute evaluation metrics; the MAPE denominator is floored at 1 so
        # days with zero actual transactions do not divide by zero.
        mae = mean_absolute_error(actual, pred_mean)
        rmse = np.sqrt(mean_squared_error(actual, pred_mean))
        mape = np.mean(np.abs((actual - pred_mean) / np.maximum(actual, 1))) * 100
        r2 = r2_score(actual, pred_mean)

        # Store metrics
        metrics[product_group] = {
            "MAE": mae,
            "RMSE": rmse,
            "MAPE": mape,
            "R2": r2
        }

        # Save individual product metrics
        metrics_df = pd.DataFrame([metrics[product_group]])
        metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)

        # Keep the fitted results object
        sarimax_results[product_group] = results

        # Plot Predicted vs Actual; the figure is closed even if saving fails
        # so open figures do not pile up across loop iterations.
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(actual.index, actual, label="Actual", alpha=0.7)
            ax.plot(actual.index, pred_mean, label="Predicted", alpha=0.7, linestyle='--')
            ax.legend()
            ax.set_title(f"Predicted vs Actual for {product_group} ({forecast_horizon}-day ahead)")
            fig.savefig(os.path.join(group_output_dir, f"{sanitized_group}.png"))
        finally:
            plt.close(fig)

        # Plot Residuals
        residuals = actual - pred_mean
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(actual.index, residuals, label="Residuals", alpha=0.7)
            ax.axhline(0, linestyle='--', color='r', alpha=0.7)
            ax.legend()
            ax.set_title(f"Residuals for {product_group} ({forecast_horizon}-day ahead)")
            fig.savefig(os.path.join(group_output_dir, f"{sanitized_group}_residuals.png"))
        finally:
            plt.close(fig)

    except Exception as e:
        # Best effort: report the failing group and carry on with the next one
        print(f"Error processing {product_group}: {e}")
        continue

# Save final metrics summary (one row per successfully modelled group)
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

print("\n✅ Processing completed. Metrics and plots have been saved.")


Processing product groups: 100%|██████████| 10/10 [04:44<00:00, 28.44s/it]


✅ Processing completed. Metrics and plots have been saved.





In [9]:
# Average every metric over the product groups currently held in `metrics`
# (populated by whichever modelling cell ran last).
metric_names = ("MAE", "RMSE", "MAPE", "R2")
avg_metrics = {
    name: np.mean([group_scores[name] for group_scores in metrics.values()])
    for name in metric_names
}

# Write the averages as a one-row CSV into the current `output_dir`
avg_metrics_df = pd.DataFrame([avg_metrics])
avg_metrics_path = os.path.join(output_dir, "final_test_avg_metrics.csv")
avg_metrics_df.to_csv(avg_metrics_path, index=False)

print("\nProcessing completed. Metrics and plots have been saved.")


Processing completed. Metrics and plots have been saved.
