## Setup

In [36]:
# Import the core libraries (a few cell-specific imports appear in later cells)
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Project-local preprocessing helper, applied identically to both frames.
# NOTE(review): the original comment says this step integrates holidays data;
# that cannot be confirmed from this cell — see utils/preprocessing.py.
from utils.preprocessing import preprocess_data

# Read each CSV and hand it straight to the shared preprocessing step so the
# training and test frames go through exactly the same transformation.
train_data = pd.read_csv("data/imputed_train_v2.csv").pipe(preprocess_data)
test_data = pd.read_csv("data/test.csv").pipe(preprocess_data)

In [37]:
# Columns the model has to predict; everything else (except 'id') is a feature.
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

# Training features/labels. errors='ignore' keeps the drop tolerant of a frame
# that has no 'id' column.
X_train = train_data.drop(columns=target_columns + ['id'], errors='ignore')
y_train = train_data[target_columns]
X_test = test_data.drop(columns=['id'], errors='ignore')

# The two frames come from different CSV files, so nothing guarantees they
# expose the same feature columns in the same order. Fail early with a readable
# message if the test frame lacks a feature the model will be trained on ...
missing_features = [col for col in X_train.columns if col not in X_test.columns]
if missing_features:
    raise ValueError(f"Test data is missing training features: {missing_features}")

# ... then keep exactly the training features, in training order, so the model
# is asked to predict on the same layout it was fitted on. This is a no-op when
# the frames already agree.
X_test = X_test[X_train.columns]

## Training & Prediction

In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

# Hold out 20% of the labelled rows as a validation set (fixed seed for
# a reproducible split).
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=123,
)

# Base learner: gradient-boosted trees optimised for absolute error.
xgb_model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=123,
)

# MultiOutputRegressor handles the five targets by wrapping the single-output
# base learner; fit it on the training portion of the split only.
multi_target_model = MultiOutputRegressor(xgb_model)
multi_target_model.fit(X_train_part, y_train_part)

# Score the held-out rows. Predictions come back as a 2-D array whose columns
# are indexed below in the same order as target_columns.
y_val_pred = multi_target_model.predict(X_val)

# Per-target validation report; the MAE of each target is kept for averaging.
# NOTE(review): the recorded run of this cell shows a strongly negative R² for
# valeur_CO while the other targets sit around 0.67-0.80 — worth investigating.
mae_values = []
for i, target in enumerate(target_columns):
    actual = y_val[target]
    predicted = y_val_pred[:, i]

    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mae_values.append(mae)

    print(f"Metrics for {target} on Validation Set:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R² Score: {r2}\n")

# Unweighted mean of the per-target MAEs.
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets: {average_mae}\n")

# Create test set predictions.
# NOTE(review): this reuses the model fitted on X_train_part (80% of the
# labelled data); it is not refitted on the full training set before predicting.
y_test_pred = multi_target_model.predict(X_test)

# Assemble the output frame: one column per target, with the test ids placed
# in the first column.
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions.insert(0, 'id', test_data['id'].values)

# Persist the predictions (id first, then the targets) without the row index.
test_predictions.to_csv("data/test_predictions.csv", index=False)


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Metrics for valeur_NO2 on Validation Set:
  Mean Squared Error (MSE): 36.807527171597634
  Mean Absolute Error (MAE): 4.041980803596558
  R² Score: 0.6774671064699793

Metrics for valeur_CO on Validation Set:
  Mean Squared Error (MSE): 0.13023823451499988
  Mean Absolute Error (MAE): 0.24509519482209582
  R² Score: -42.29026461267626

Metrics for valeur_O3 on Validation Set:
  Mean Squared Error (MSE): 98.44999291752525
  Mean Absolute Error (MAE): 7.338291461023343
  R² Score: 0.8037361964911571

Metrics for valeur_PM10 on Validation Set:
  Mean Squared Error (MSE): 18.398766829602756
  Mean Absolute Error (MAE): 3.1021304147854534
  R² Score: 0.6669904305199332

Metrics for valeur_PM25 on Validation Set:
  Mean Squared Error (MSE): 7.917553595640809
  Mean Absolute Error (MAE): 2.0252514860197603
  R² Score: 0.6660835983364891

Average MAE across all targets: 3.3505498720494415



  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
