In [6]:
import os

# Change working directory to the parent directory
os.chdir("/Users/megan/Thesis")
print("Current working directory:", os.getcwd())

Current working directory: /Users/megan/Thesis


In [7]:
import pandas as pd
# Path to your CSV file
csv_path = "data/top_10_product_groups.csv"

# Load the CSV into a DataFrame
data = pd.read_csv(csv_path)

In [None]:
import numpy as np 

def process_data(data):
    # Ensure we're working on a copy of the DataFrame to avoid SettingWithCopyWarning
    data = data.copy()  
    
    # Convert 'date' column to datetime if it's not already
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

    # Drop rows where date conversion failed
    data = data.dropna(subset=['date'])

    # Feature engineering
    data['month'] = data['date'].dt.month
    data['day_of_week'] = data['date'].dt.dayofweek
    data['quarter'] = data['date'].dt.quarter
    data['week_of_year'] = data['date'].dt.isocalendar().week
    data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)  # 5 and 6 are Saturday and Sunday

    # Encode cyclical day of the week
    data['day_of_week_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data['day_of_week_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)

    # Lag features
    data['lag_7'] = data['transaction_count'].shift(7)
    data['lag_14'] = data['transaction_count'].shift(14)

    # Create seasonal feature based on the month
    def get_season(month):
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Fall'

    data['season'] = data['month'].apply(get_season)

    # Convert categorical columns to category dtype
    data['season'] = data['season'].astype('category')
    data['is_weekend'] = data['is_weekend'].astype('category')

    # One-hot encode categorical columns
    data = pd.get_dummies(data, columns=['season'], drop_first=True, dtype=np.int8)

    # Rolling statistics
    data['rolling_mean_7'] = data['transaction_count'].rolling(window=7, min_periods=1).mean()
    data['rolling_std_7'] = data['transaction_count'].rolling(window=7, min_periods=1).std()

    # Fill missing values generated by rolling functions or lags
    data = data.fillna(0)

    columns_to_drop = [
    'std_price', 
    'club_member_ratio', 
    'product_type_name', 
    'colour_group_name', 
    'graphical_appearance_name'
    ]

    # Drop them from your DataFrame `data`
    data = data.drop(columns=columns_to_drop, errors='ignore')
    
    return data


data = process_data(data)


In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name  

# Store results
xgboost_results = {}
metrics = {}
best_params_dict = {}
output_dir = 'final_version/output/1_day/xgboost/finetuned'

def objective(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
    }
    
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    
    y_pred = model.predict(X_val)
    return mean_absolute_error(y_val, y_pred)

for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)
    
    product_data = data[data['product_group'] == product_group].copy()
    product_data = product_data.set_index('date').asfreq('D').fillna(0)
    
    if len(product_data) < 50:  # Ensure enough data for splitting
        continue
    
    target_column = 'transaction_count'
    lags = 36
    
    X, y = create_lagged_features(product_data, target_column, lags)
    
    # Ensure categorical columns are properly encoded
    X = X.copy()
    if 'product_group' in X.columns:
        X.drop(columns=['product_group'], inplace=True)
    if 'is_weekend' in X.columns:
        X['is_weekend'] = X['is_weekend'].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)  # 20% of 80%

    # Run Optuna optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X_train, X_val, y_train, y_val), n_trials=30)
    
    best_params = study.best_params
    best_params_dict[product_group] = best_params

    # Train final model using best hyperparameters
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False)
    
    y_pred = np.expm1(model.predict(X_test))
    y_test_actual = np.expm1(y_test)
    y_pred = np.maximum(y_pred, 0)
    
    mae = mean_absolute_error(y_test_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    mape = np.mean(np.abs((y_test_actual - y_pred)/np.maximum(y_test_actual, 1e-8))) * 100
    r2 = r2_score(y_test_actual, y_pred)
    
    metrics[product_group] = {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
    xgboost_results[product_group] = model
    
    # Save hyperparameters
    pd.DataFrame([best_params]).to_csv(os.path.join(group_output_dir, f"{sanitized_group}_best_params.csv"), index=False)
    
    # Plot Predictions vs Actuals
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, y_test_actual, label="Actual", alpha=0.7)
    plt.plot(X_test.index, y_pred, label="Predicted", alpha=0.7, linestyle='--')
    plt.legend()
    plt.title(f"Predicted vs Actual for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}.png"))
    plt.close()
    
    # Plot residuals
    residuals = y_test_actual - y_pred
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, residuals, label="Residuals", alpha=0.7)
    plt.axhline(0, linestyle='--', color='r', alpha=0.7)
    plt.legend()
    plt.title(f"Residuals for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}_residuals.png"))
    plt.close()
    
    # Save per-group metrics
    metrics_df = pd.DataFrame([metrics[product_group]])
    metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)
    
# Save all metrics
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

# Save best hyperparameters
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient="index")
best_params_df.to_csv(os.path.join(output_dir, "final_best_hyperparameters.csv"))

# Overall average across product groups
avg_metrics_all = {
    'MAE': np.mean([metrics[p]['MAE'] for p in metrics]),
    'RMSE': np.mean([metrics[p]['RMSE'] for p in metrics]),
    'MAPE': np.mean([metrics[p]['MAPE'] for p in metrics]),
    'R2': np.mean([metrics[p]['R2'] for p in metrics])
}
pd.DataFrame([avg_metrics_all]).to_csv(os.path.join(output_dir, "final_test_avg_metrics.csv"), index=False)

print("\nProcessing completed. Hyperparameter tuning, model training, and evaluation completed.")

  from .autonotebook import tqdm as notebook_tqdm
Processing product groups:   0%|          | 0/10 [00:00<?, ?it/s][I 2025-03-04 09:14:33,168] A new study created in memory with name: no-name-aa0d1cee-3981-42fc-95c3-202c2d1f8358
[I 2025-03-04 09:14:36,861] Trial 0 finished with value: 43.48582458496094 and parameters: {'n_estimators': 800, 'max_depth': 11, 'learning_rate': 0.09230928701639736, 'colsample_bytree': 0.8463170822823719, 'subsample': 0.8147095764800725, 'reg_alpha': 0.0043402260445835365, 'reg_lambda': 0.005627570459964369}. Best is trial 0 with value: 43.48582458496094.
[I 2025-03-04 09:14:48,883] Trial 1 finished with value: 52.984657287597656 and parameters: {'n_estimators': 1000, 'max_depth': 12, 'learning_rate': 0.012187062638701252, 'colsample_bytree': 0.6455802619993312, 'subsample': 0.6290494865246752, 'reg_alpha': 5.844801751949958, 'reg_lambda': 0.4553642753922501}. Best is trial 0 with value: 43.48582458496094.
[I 2025-03-04 09:14:50,187] Trial 2 finished with va

ValueError: Input contains infinity or a value too large for dtype('float64').

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name  

# 1 day
xgboost_results = {}
metrics = {}
output_dir = 'final_version/output/xgboost'

def create_lagged_features(df, target_col, lags):
    df = df.copy()
    df[f"log_{target_col}"] = np.log1p(df[target_col])
    df["is_active_day"] = (df[target_col] > 0).astype(int)
    df["rolling_mean_7"] = df[target_col].rolling(window=7, min_periods=1).mean()
    df["rolling_mean_14"] = df[target_col].rolling(window=14, min_periods=1).mean()
    
    for lag in range(1, lags + 1):
        df[f"lag_{lag}"] = df[target_col].shift(lag)
    
    df = df.dropna()
    X = df.drop(columns=[target_col, f"log_{target_col}"])
    y = df[f"log_{target_col}"]
    
    return X, y

for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)
    
    product_data = data[data['product_group'] == product_group].copy()
    product_data = product_data.set_index('date').asfreq('D').fillna(0)
    
    if len(product_data) < 50:  # Ensure enough data for splitting
        continue
    
    target_column = 'transaction_count'
    lags = 36
    
    X, y = create_lagged_features(product_data, target_column, lags)
    
    # Ensure categorical columns are properly encoded
    X = X.copy()
    if 'product_group' in X.columns:
        X.drop(columns=['product_group'], inplace=True)
    if 'is_weekend' in X.columns:
        X['is_weekend'] = X['is_weekend'].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)  # 20% of 80%
    
    model = XGBRegressor(
        n_estimators=600,
        max_depth=6,
        learning_rate=0.1,
        colsample_bytree=0.8,
        subsample=0.7,
        objective='reg:squarederror',
        reg_alpha=0.1,
        reg_lambda=1.5,
        random_state=42,
        enable_categorical=True
    )
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False)
    
    y_pred = np.expm1(model.predict(X_test))
    y_test_actual = np.expm1(y_test)
    y_pred = np.maximum(y_pred, 0)
    
    mae = mean_absolute_error(y_test_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    mape = np.mean(np.abs((y_test_actual - y_pred)/np.maximum(y_test_actual, 1e-8))) * 100
    r2 = r2_score(y_test_actual, y_pred)
    
    metrics[product_group] = {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
    xgboost_results[product_group] = model
    
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, y_test_actual, label="Actual", alpha=0.7)
    plt.plot(X_test.index, y_pred, label="Predicted", alpha=0.7, linestyle='--')
    plt.legend()
    plt.title(f"Predicted vs Actual for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}.png"))
    plt.close()
    
    # Plot residuals
    residuals = y_test_actual - y_pred
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, residuals, label="Residuals", alpha=0.7)
    plt.axhline(0, linestyle='--', color='r', alpha=0.7)
    plt.legend()
    plt.title(f"Residuals for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}_residuals.png"))
    plt.close()
    
    # Save per-group metrics
    metrics_df = pd.DataFrame([metrics[product_group]])
    metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)
    
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

avg_metrics_all = {
    'MAE': np.mean([metrics[p]['MAE'] for p in metrics]),
    'RMSE': np.mean([metrics[p]['RMSE'] for p in metrics]),
    'MAPE': np.mean([metrics[p]['MAPE'] for p in metrics]),
    'R2': np.mean([metrics[p]['R2'] for p in metrics])
}
pd.DataFrame([avg_metrics_all]).to_csv(os.path.join(output_dir, "final_test_avg_metrics.csv"), index=False)

print("\nProcessing completed. Metrics and plots have been saved.")


Processing product groups: 100%|██████████| 10/10 [00:17<00:00,  1.75s/it]


Processing completed. Metrics and plots have been saved.





In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name  

# Store results
xgboost_results = {}
metrics = {}
output_dir = 'final_version/output/2_weeks/xgboost'

def create_lagged_features(df, target_col, forecast_horizon=14):
    df = df.copy()
    df[target_col] = df[target_col].replace(0, 1e-6)  # Replace 0 with a small value to avoid log(0)
    df.dropna(inplace=True)  # Drop NaNs before log transformation
    df[f"log_{target_col}"] = np.log1p(df[target_col])
    df["is_active_day"] = (df[target_col] > 0).astype(int)
    df["rolling_mean_7"] = df[target_col].rolling(window=7, min_periods=1).mean()
    df["rolling_mean_14"] = df[target_col].rolling(window=14, min_periods=1).mean()
    
    for lag in [14, 21, 28, 35]:  # Weekly lags instead of daily
        df[f"lag_{lag}"] = df[target_col].shift(lag)
    
    for i in range(1, forecast_horizon + 1):  # Multi-step target
        df[f"target_{i}"] = df[target_col].shift(-i)
    
    df.dropna(inplace=True)  # Drop remaining NaNs
    X = df.drop(columns=[target_col, f"log_{target_col}"] + [f"target_{i}" for i in range(1, forecast_horizon + 1)])
    y = df[[f"target_{i}" for i in range(1, forecast_horizon + 1)]]
    
    return X, y

forecast_horizon = 14  # Predict 14 days ahead
for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)
    
    product_data = data[data['product_group'] == product_group].copy()
    product_data = product_data.set_index('date').asfreq('D').fillna(0)
    
    if len(product_data) < 50 + forecast_horizon:  # Ensure enough data for splitting
        continue
    
    target_column = 'transaction_count'
    
    X, y = create_lagged_features(product_data, target_column, forecast_horizon)
    
    # Ensure categorical columns are properly encoded
    X = X.copy()
    if 'product_group' in X.columns:
        X.drop(columns=['product_group'], inplace=True)
    if 'is_weekend' in X.columns:
        X['is_weekend'] = X['is_weekend'].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)  # 20% of 80%
    
    model = XGBRegressor(
        n_estimators=600,
        max_depth=6,
        learning_rate=0.1,
        colsample_bytree=0.8,
        subsample=0.7,
        objective='reg:squarederror',
        reg_alpha=0.1,
        reg_lambda=1.5,
        random_state=42,
        enable_categorical=True
    )
    model.fit(X_train, y_train, verbose=False)
    
    y_pred = np.clip(model.predict(X_test), 0, 1e6)  # Clip predictions to prevent extreme values
    y_test_actual = np.clip(y_test, 0, 1e6)  # Clip actual values to prevent extreme values
    
    valid_mask = np.isfinite(y_test_actual).all(axis=1) & np.isfinite(y_pred).all(axis=1)
    y_test_actual = y_test_actual[valid_mask]
    y_pred = y_pred[valid_mask]
    
    if len(y_test_actual) == 0 or len(y_pred) == 0:
        print(f"Warning: No valid predictions for {product_group}")
        continue
    
    y_test_actual = y_test_actual.to_numpy()

    mae_per_day = []
    rmse_per_day = []
    mape_per_day = []
    r2_per_day = []

    for i in range(14):  # Evaluate each day separately
        mae_per_day.append(mean_absolute_error(y_test_actual[:, i], y_pred[:, i]))
        rmse_per_day.append(np.sqrt(mean_squared_error(y_test_actual[:, i], y_pred[:, i])))
        mape_per_day.append(np.mean(np.abs((y_test_actual[:, i] - y_pred[:, i]) / np.maximum(y_test_actual[:, i], 1e-8))) * 100)
        r2_per_day.append(r2_score(y_test_actual[:, i], y_pred[:, i]))

    metrics[product_group] = {
        'MAE': np.mean(mae_per_day),
        'RMSE': np.mean(rmse_per_day),
        'MAPE': np.mean(mape_per_day),
        'R2': np.mean(r2_per_day)
    }


    xgboost_results[product_group] = model
    
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, y_test_actual[:, 0], label="Actual", alpha=0.7)
    plt.plot(X_test.index, y_pred[:, 0], linestyle="--", label="Predicted", alpha=0.7)
    plt.legend()
    plt.title(f"Predicted vs Actual for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}.png"))
    plt.close()
    
    # Save per-group metrics
    metrics_df = pd.DataFrame([metrics[product_group]])
    metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)
    
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

avg_metrics_all = {
    'MAE': np.mean([metrics[p]['MAE'] for p in metrics]),
    'RMSE': np.mean([metrics[p]['RMSE'] for p in metrics]),
    'MAPE': np.mean([metrics[p]['MAPE'] for p in metrics]),
    'R2': np.mean([metrics[p]['R2'] for p in metrics])
}
pd.DataFrame([avg_metrics_all]).to_csv(os.path.join(output_dir, "final_test_avg_metrics.csv"), index=False)

print("\nProcessing completed. Metrics and plots have been saved.")


Processing product groups: 100%|██████████| 10/10 [02:54<00:00, 17.47s/it]


Processing completed. Metrics and plots have been saved.





In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name  

# Store results
xgboost_results = {}
metrics = {}
best_params_dict = {}
output_dir = 'final_version/output/2_weeks/xgboost/finetuned'

def create_lagged_features(df, target_col, forecast_horizon=14):
    df = df.copy()
    df[f"log_{target_col}"] = np.log1p(df[target_col])
    df["is_active_day"] = (df[target_col] > 0).astype(int)
    df["rolling_mean_7"] = df[target_col].rolling(window=7, min_periods=1).mean()
    df["rolling_mean_14"] = df[target_col].rolling(window=14, min_periods=1).mean()

    for lag in [14, 21, 28, 35]:  # Weekly lags instead of daily
        df[f"lag_{lag}"] = df[target_col].shift(lag)
    
    for i in range(1, forecast_horizon + 1):  # Multi-step target
        df[f"target_{i}"] = df[target_col].shift(-i)
    
    df.dropna(inplace=True)  # Drop remaining NaNs
    X = df.drop(columns=[target_col, f"log_{target_col}"] + [f"target_{i}" for i in range(1, forecast_horizon + 1)])
    y = df[[f"target_{i}" for i in range(1, forecast_horizon + 1)]]
    
    return X, y

# Function to optimize XGBoost hyperparameters using Optuna
def objective(trial, X_train, X_val, y_train, y_val):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),  # Fixed
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),  # Fixed
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),  # Fixed
    }
    
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    
    y_pred = model.predict(X_val)
    return mean_absolute_error(y_val, y_pred)

forecast_horizon = 14  # Predict 14 days ahead
for product_group in tqdm(data['product_group'].unique(), desc="Processing product groups"):
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)
    
    product_data = data[data['product_group'] == product_group].copy()
    product_data = product_data.set_index('date').asfreq('D').fillna(0)
    
    if len(product_data) < 50 + forecast_horizon:  # Ensure enough data for splitting
        continue
    
    target_column = 'transaction_count'
    
    X, y = create_lagged_features(product_data, target_column, forecast_horizon)
    
    # Ensure categorical columns are properly encoded
    X = X.copy()
    if 'product_group' in X.columns:
        X.drop(columns=['product_group'], inplace=True)
    if 'is_weekend' in X.columns:
        X['is_weekend'] = X['is_weekend'].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)  # 20% of 80%

    # Run Optuna optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X_train, X_val, y_train, y_val), n_trials=30)
    
    best_params = study.best_params
    best_params_dict[product_group] = best_params

    # Train final model using best hyperparameters
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=False)
    
    y_pred = np.clip(model.predict(X_test), 0, 1e6)  # Clip predictions
    y_test_actual = np.clip(y_test, 0, 1e6)  # Clip actual values
    
    valid_mask = np.isfinite(y_test_actual).all(axis=1) & np.isfinite(y_pred).all(axis=1)
    y_test_actual = y_test_actual[valid_mask]
    y_pred = y_pred[valid_mask]

    if len(y_test_actual) == 0 or len(y_pred) == 0:
        print(f"Warning: No valid predictions for {product_group}")
        continue
    
    mae = mean_absolute_error(y_test_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    mape = np.mean(np.abs((y_test_actual - y_pred) / np.maximum(y_test_actual, 1e-8))) * 100
    r2 = r2_score(y_test_actual, y_pred)
    
    metrics[product_group] = {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
    xgboost_results[product_group] = model
    
    # Save best hyperparameters
    pd.DataFrame([best_params]).to_csv(os.path.join(group_output_dir, f"{sanitized_group}_best_params.csv"), index=False)
    
    # Plot Predictions vs Actuals
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, y_test_actual.mean(axis=1), label="Actual", alpha=0.7)
    plt.plot(X_test.index, y_pred.mean(axis=1), label="Predicted", alpha=0.7, linestyle='--')
    plt.legend()
    plt.title(f"Predicted vs Actual for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}.png"))
    plt.close()
    
    # Plot residuals
    residuals = y_test_actual.mean(axis=1) - y_pred.mean(axis=1)
    plt.figure(figsize=(12, 6))
    plt.plot(X_test.index, residuals, label="Residuals", alpha=0.7)
    plt.axhline(0, linestyle='--', color='r', alpha=0.7)
    plt.legend()
    plt.title(f"Residuals for {product_group}")
    plt.savefig(os.path.join(group_output_dir, f"{sanitized_group}_residuals.png"))
    plt.close()
    
    # Save per-group metrics
    metrics_df = pd.DataFrame([metrics[product_group]])
    metrics_df.to_csv(os.path.join(group_output_dir, f"{sanitized_group}_metrics.csv"), index=False)
    
# Save all metrics
summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))

# Save best hyperparameters
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient="index")
best_params_df.to_csv(os.path.join(output_dir, "final_best_hyperparameters.csv"))

print("\n✅ Processing completed. Hyperparameter tuning, model training, and evaluation completed.")


Processing product groups:   0%|          | 0/10 [00:00<?, ?it/s][I 2025-02-26 19:53:52,331] A new study created in memory with name: no-name-0e5ce4cf-d545-484e-97ce-6466a0543622
[I 2025-02-26 19:54:19,856] Trial 0 finished with value: 177.82888793945312 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.08688068850842119, 'colsample_bytree': 0.5478100781447883, 'subsample': 0.6646115184176141, 'reg_alpha': 0.009995833014120594, 'reg_lambda': 7.276134780430636}. Best is trial 0 with value: 177.82888793945312.
[I 2025-02-26 19:54:28,126] Trial 1 finished with value: 241.7052001953125 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.2900669515396802, 'colsample_bytree': 0.6723174512809134, 'subsample': 0.95992230518158, 'reg_alpha': 0.11190990667619331, 'reg_lambda': 0.002272230345889901}. Best is trial 0 with value: 177.82888793945312.
[I 2025-02-26 19:55:22,551] Trial 2 finished with value: 196.4676971435547 and parameters: {'n_estimators


✅ Processing completed. Hyperparameter tuning, model training, and evaluation completed.





In [None]:
import pandas as pd
import os

# Load the final metrics summary file
output_dir = "final_version/output/2_weeks/xgboost/finetuned/"
summary_file = os.path.join(output_dir, "final_metrics_summary.csv")

# Read the metrics summary
summary_df = pd.read_csv(summary_file)

# Compute average metrics
avg_metrics = {
    'MAE': summary_df['MAE'].mean(),
    'RMSE': summary_df['RMSE'].mean(),
    'MAPE': summary_df['MAPE'].mean(),
    'R2': summary_df['R2'].mean()
}

# Save as final_test_avg_metrics.csv
avg_metrics_df = pd.DataFrame([avg_metrics])
avg_metrics_path = os.path.join(output_dir, "final_test_avg_metrics.csv")
avg_metrics_df.to_csv(avg_metrics_path, index=False)

print(f"✅ Saved average metrics to: {avg_metrics_path}")


✅ Saved average metrics to: final_version/output/2_weeks/xgboost/finetuned/final_test_avg_metrics.csv


## Google Trends

In [None]:
import pandas as pd

# Load preprocessed data
data = pd.read_csv('data/data_with_keywords.csv')
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data.columns

Index(['date', 'product_group', 'transaction_count', 'avg_price',
       'sales_channel', 'most_common_age_bin', 'unique_customers', 'std_price',
       'unique_articles_sold', 'median_age', 'club_member_ratio',
       'fashion_news_subscribers', 'first_purchase_days_ago',
       'recent_purchase_days_ago', 'product_type_name', 'colour_group_name',
       'graphical_appearance_name'],
      dtype='object')

In [None]:
import pandas as pd

# Load the keywords CSV file into a DataFrame
keywords_df = pd.read_csv('data/external/keywords_trends.csv')

# Display the first few rows of the DataFrame to check the data
keywords_df.head()

keywords_list = pd.read_csv('data/external/keywords.csv')['Keyword'].tolist()

In [None]:
def get_trends_file(keywords, trends_folder='data/external/google_trends_v2'):
    for keyword in keywords:
        file_name = f"{keyword.replace(' ', '_')}_trend_data.csv"
        file_path = os.path.join(trends_folder, file_name)
        if os.path.exists(file_path):
            return file_path
        else:
            print(f"Warning: Trend file for {keyword} not found.")

def load_and_process_trends(trend_file, product_data):
    """
    Loads and processes the Google Trends data by matching the date and aligning with product_data's index.
    
    Parameters:
    trend_file (str): Path to the Google Trends CSV file.
    product_data (pd.DataFrame): Product data with a datetime index to match with trend data.
    
    Returns:
    pd.Series: A Series containing the trend values, reindexed to match product_data.
    """
    # Load the Google Trends data
    trend_data = pd.read_csv(trend_file)

    # Drop the 'isPartial' column
    trend_data = trend_data.drop(columns=['isPartial'])

    # Convert 'date' to datetime format to align with the product_data dates
    trend_data['date'] = pd.to_datetime(trend_data['date'])

    # add lagged days
    trend_data['date'] = trend_data['date'] + pd.Timedelta(weeks=36)

    # Set 'date' as the index
    trend_data.set_index('date', inplace=True)

    # Reindex the trend data to align with product_data's index (assumed to be datetime)
    trend_data = trend_data.reindex(product_data.index, method='ffill')

    return trend_data

def add_trends_to_product_data(product_data, trend_data, trend_column_name):
    """
    Adds the trend data to the product_data based on the trend column name.

    Parameters:
    product_data (pd.DataFrame): The product data.
    trend_data (pd.DataFrame): The trend data containing the trend values.
    trend_column_name (str): The name of the column to store the trend data in product_data.
    
    Returns:
    pd.DataFrame: The updated product data with the trend column added.
    """
    # Add the trend data to the product_data DataFrame
    product_data[trend_column_name] = trend_data[trend_column_name]  # Add dynamic trend column

    return product_data


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from modules.utils import process_name  

data = pd.read_csv('data/data_with_keywords.csv')
keywords_list = pd.read_csv('data/external/keywords.csv')['Keyword'].tolist()

data = process_data(data)

# Output directory
output_dir = "final_version/output/google_trends/1_day/xgboost"
os.makedirs(output_dir, exist_ok=True)

# Store results
xgboost_results = {}
metrics = {}

product_groups = data['product_group'].unique()

for product_group in tqdm(product_groups, desc="Processing product groups"):
    sanitized_group = process_name(product_group)
    group_output_dir = os.path.join(output_dir, sanitized_group)
    os.makedirs(group_output_dir, exist_ok=True)
    product_data = data[data['product_group'] == product_group].copy()
    product_data = product_data.set_index('date').asfreq('D').fillna(0)
    matched_keywords = [data[column].iloc[0] for column in ['product_type_name', 'colour_group_name', 'graphical_appearance_name'] if data[column].iloc[0] in keywords_list]
    
    if matched_keywords:
        for keyword in matched_keywords:
            trend_file = get_trends_file([keyword])
            if trend_file:
                trend_data = load_and_process_trends(trend_file, product_data)
                product_data = add_trends_to_product_data(product_data, trend_data, keyword)

    if len(product_data) < 50:
        continue

    target_column = 'transaction_count'
    X, y = create_lagged_features(product_data, target_column, lag=1)
    if 'product_group' in X.columns:
        X.drop(columns=['product_group'], inplace=True)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)
    model = XGBRegressor(n_estimators=600, max_depth=6, learning_rate=0.1, colsample_bytree=0.8, subsample=0.7, objective='reg:squarederror', reg_alpha=0.1, reg_lambda=1.5, random_state=42, enable_categorical=True)
    model.fit(X_train, y_train, verbose=False)
    y_pred = np.clip(model.predict(X_test), 0, 1e6)
    y_test_actual = np.clip(y_test, 0, 1e6)
    
    mae = mean_absolute_error(y_test_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
    mape = np.mean(np.abs((y_test_actual - y_pred) / np.maximum(y_test_actual, 1e-8))) * 100
    r2 = r2_score(y_test_actual, y_pred)
    
    metrics[product_group] = {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
    xgboost_results[product_group] = model

summary_df = pd.DataFrame.from_dict(metrics, orient="index")
summary_df.to_csv(os.path.join(output_dir, "final_metrics_summary.csv"))
avg_metrics = { 'MAE': np.mean([metrics[p]['MAE'] for p in metrics]), 'RMSE': np.mean([metrics[p]['RMSE'] for p in metrics]), 'MAPE': np.mean([metrics[p]['MAPE'] for p in metrics]), 'R2': np.mean([metrics[p]['R2'] for p in metrics]) }
pd.DataFrame([avg_metrics]).to_csv(os.path.join(output_dir, "final_test_avg_metrics.csv"), index=False)
print("\nProcessing completed. Metrics and plots have been saved.")


Processing product groups:   0%|          | 0/10 [00:00<?, ?it/s]


KeyError: 'product_type_name'