## Setup

In [7]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import numpy as np

# Project-local preprocessing helper (implementation lives in utils/preprocessing.py)
from utils.preprocessing import preprocess_data

# Columns the model predicts, plus the parameters for the engineered
# time features built later from those columns
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
lag_steps = [1, 2, 3]  # shift each target by 1, 2 and 3 rows
rolling_windows = [7, 30]  # rolling statistics over 7 and 30 rows

# Read the imputed training set and the test set from disk
train_data = pd.read_csv("data/imputed_train_v2.csv")
test_data = pd.read_csv("data/test.csv")

# Run both frames through the same preprocessing step
# (per the original note this integrates holiday data -- see preprocess_data)
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Function to add lag and rolling stats only for training data
def add_time_features(df, target_cols, lags, rolls):
    """Return a copy of ``df`` with lag and rolling-statistic columns added.

    For every column named in ``target_cols`` this adds:

    * ``{col}_lag_{lag}`` for each ``lag`` in ``lags``: the column shifted
      down by ``lag`` rows.
    * ``{col}_roll_mean_{roll}`` and ``{col}_roll_std_{roll}`` for each
      ``roll`` in ``rolls``: mean and standard deviation over a window of
      ``roll`` rows, computed on the column shifted by one row so the
      current row's own value is never part of its window.

    Shifting is positional, so rows are assumed to already be in
    chronological order (TODO confirm against the caller). Rows without
    enough history receive NaN in the new columns. The input frame is not
    modified.
    """
    out = df.copy()
    for target in target_cols:
        series = out[target]

        # Plain lags: the value observed `step` rows earlier
        for step in lags:
            out[f'{target}_lag_{step}'] = series.shift(step)

        # Shift once so every rolling window ends at the previous row
        previous = series.shift(1)
        for window in rolls:
            windowed = previous.rolling(window)
            out[f'{target}_roll_mean_{window}'] = windowed.mean()
            out[f'{target}_roll_std_{window}'] = windowed.std()
    return out

# Add lag and rolling statistics features to the training data.
# test_data is deliberately not passed through add_time_features.
train_data = add_time_features(train_data, target_columns, lag_steps, rolling_windows)

# Drop every row that contains a NaN in any column; this removes the leading
# rows whose lag/rolling features have no history to be computed from.
train_data.dropna(inplace=True)

# Define features and target variables
X_train = train_data.drop(columns=target_columns + ['id'], errors='ignore')
y_train = train_data[target_columns]

# Align the test features with the training feature set. Without this the
# test frame lacks the lag/rolling columns the model was fitted on, and
# XGBoost's feature validation rejects it at predict time. reindex keeps the
# training column order, drops columns the model never saw, and fills the
# absent lag/rolling columns with NaN, which XGBoost treats as missing values.
# NOTE(review): those features are therefore all-missing at inference, so the
# cross-validation scores (which do see them) likely overstate test accuracy.
X_test = test_data.drop(columns=['id'], errors='ignore').reindex(columns=X_train.columns)

# Hyperparameters for the base XGBoost regressor
xgb_params = {
    'objective': 'reg:absoluteerror',
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 10,
    'random_state': 123,  # fixed seed for reproducible runs
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# MultiOutputRegressor fits one clone of the base model per target column,
# which is how a single-output regressor is used for multi-target prediction
multi_target_model = MultiOutputRegressor(estimator=xgb_model)

# Time-ordered cross-validation: each split validates on rows that come
# after the rows it trains on
tscv = TimeSeriesSplit(n_splits=5)
mae_values = []  # one MAE entry per (fold, target) pair

for fold, (train_index, val_index) in enumerate(tscv.split(X_train)):
    # Positional slices of the features and targets for this fold
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_val_fold = y_train.iloc[val_index]

    # Refit on this fold's training rows, then score its validation rows
    multi_target_model.fit(X_train_fold, y_train_fold)
    y_val_pred = multi_target_model.predict(X_val_fold)

    # Report per-target metrics; prediction column i corresponds to
    # target_columns[i]
    print(f"Fold {fold + 1} metrics:")
    for i, target in enumerate(target_columns):
        actual = y_val_fold[target]
        predicted = y_val_pred[:, i]

        mse = mean_squared_error(actual, predicted)
        mae = mean_absolute_error(actual, predicted)
        r2 = r2_score(actual, predicted)
        mae_values.append(mae)

        print(f"  {target}: MSE={mse:.3f}, MAE={mae:.3f}, R²={r2:.3f}")
    print("\n")

# Unweighted mean of every per-fold, per-target MAE collected above
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets and folds: {average_mae}\n")

# Refit on the complete training set, then predict the test set for submission
multi_target_model.fit(X_train, y_train)
y_test_pred = multi_target_model.predict(X_test)

# Build the submission frame: one column per target, with the test ids
# placed in front as the first column
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions.insert(0, 'id', test_data['id'].values)

# Write the predictions (id first, then the targets) without the index
test_predictions.to_csv("data/test_predictions.csv", index=False)


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Fold 1 metrics:
  valeur_NO2: MSE=20.515, MAE=3.192, R²=0.802
  valeur_CO: MSE=0.374, MAE=0.332, R²=-126.578
  valeur_O3: MSE=39.921, MAE=4.704, R²=0.921
  valeur_PM10: MSE=9.585, MAE=2.345, R²=0.824
  valeur_PM25: MSE=4.686, MAE=1.564, R²=0.840




  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Fold 2 metrics:
  valeur_NO2: MSE=21.907, MAE=3.208, R²=0.837
  valeur_CO: MSE=0.388, MAE=0.356, R²=-129.796
  valeur_O3: MSE=37.441, MAE=4.420, R²=0.928
  valeur_PM10: MSE=9.170, MAE=2.171, R²=0.845
  valeur_PM25: MSE=3.216, MAE=1.264, R²=0.878




  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Fold 3 metrics:
  valeur_NO2: MSE=20.515, MAE=3.028, R²=0.829
  valeur_CO: MSE=0.221, MAE=0.230, R²=-79.704
  valeur_O3: MSE=43.027, MAE=4.681, R²=0.930
  valeur_PM10: MSE=7.967, MAE=2.021, R²=0.856
  valeur_PM25: MSE=2.833, MAE=1.191, R²=0.877




  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Fold 4 metrics:
  valeur_NO2: MSE=18.094, MAE=2.744, R²=0.823
  valeur_CO: MSE=0.189, MAE=0.226, R²=-90.616
  valeur_O3: MSE=40.498, MAE=4.362, R²=0.906
  valeur_PM10: MSE=7.281, MAE=1.923, R²=0.875
  valeur_PM25: MSE=2.480, MAE=1.067, R²=0.872




  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Fold 5 metrics:
  valeur_NO2: MSE=14.970, MAE=2.611, R²=0.842
  valeur_CO: MSE=0.185, MAE=0.199, R²=-58.210
  valeur_O3: MSE=33.675, MAE=4.191, R²=0.920
  valeur_PM10: MSE=6.557, MAE=1.787, R²=0.867
  valeur_PM25: MSE=2.538, MAE=1.085, R²=0.874


Average MAE across all targets and folds: 2.1960132336261666



  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
