## Setup

In [59]:
# import all necessary libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

from utils.preprocessing import preprocess_data

# Read the raw train/test CSVs (both reads happen before any preprocessing).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/imputed_train.csv", "data/test.csv")
)

# Run each frame through the project's preprocessing helper; per the original
# note this step integrates the holidays data.
train_data, test_data = map(preprocess_data, (train_data, test_data))

In [49]:
# Columns the model has to predict (one regression target per entry).
target_columns = [
    'valeur_NO2',
    'valeur_CO',
    'valeur_O3',
    'valeur_PM10',
    'valeur_PM25',
]

# Training features: everything except the targets and the row identifier.
# errors='ignore' keeps this tolerant of frames that lack some of those columns.
X_train = train_data.drop(columns=[*target_columns, 'id'], errors='ignore')

# Training targets, in the same order as target_columns.
y_train = train_data.loc[:, target_columns]

# Test features: only the row identifier is removed.
X_test = test_data.drop(columns='id', errors='ignore')

In [63]:
# Summarize the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40991 entries, 0 to 40990
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   is_holiday              40991 non-null  int64  
 1   is_jour_ferie           40991 non-null  int64  
 2   precipitation           40966 non-null  float64
 3   wind_speed              40986 non-null  float64
 4   temperature             40991 non-null  float64
 5   humidity                40991 non-null  float64
 6   pressure                40991 non-null  float64
 7   visibility              40940 non-null  float64
 8   global_solar_radiation  40986 non-null  float64
 9   Year                    40991 non-null  int32  
 10  Month                   40991 non-null  int32  
 11  Weekday                 40991 non-null  int32  
 12  Day                     40991 non-null  int32  
 13  Hour                    40991 non-null  int32  
 14  is_weekend              40991 non-null

## Training & Prediction

In [57]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# MultiOutputRegressor is used below but was never imported in this notebook,
# so the cell raised NameError on a fresh kernel (Restart & Run All).
from sklearn.multioutput import MultiOutputRegressor

# Hold out 20% of the training rows for validation; fixed seed for reproducibility
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Initialize the base XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=123
)

# Wrap the XGBoost model in MultiOutputRegressor for multi-target prediction
# (one independent regressor is fit per target column)
multi_target_model = MultiOutputRegressor(xgb_model)

# Train the model on the partial training set
multi_target_model.fit(X_train_part, y_train_part)

# Make predictions on the validation set; columns follow the order of target_columns
y_val_pred = multi_target_model.predict(X_val)

# Initialize list to store MAE for each target
mae_values = []

# Evaluate the model for each target column
for i, target in enumerate(target_columns):
    mse = mean_squared_error(y_val[target], y_val_pred[:, i])
    mae = mean_absolute_error(y_val[target], y_val_pred[:, i])
    r2 = r2_score(y_val[target], y_val_pred[:, i])

    mae_values.append(mae)

    print(f"Metrics for {target} on Validation Set:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R² Score: {r2}\n")

# Calculate the average MAE across all targets
# (unweighted mean; the targets are on different scales, so large-valued targets dominate)
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets: {average_mae}\n")

# Create test set predictions
# NOTE: the model used here was fit on X_train_part only (the 80% split), not on all of X_train.
y_test_pred = multi_target_model.predict(X_test)

# Create DataFrame for predictions with the id column included
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions['id'] = test_data['id'].values  # Add the id column from test_data

# Save test predictions with id to a CSV file
test_predictions = test_predictions[['id'] + target_columns]  # Reorder to have id first
test_predictions.to_csv("data/test_predictions.csv", index=False)



  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


Metrics for valeur_NO2 on Validation Set:
  Mean Squared Error (MSE): 69.9193638608098
  Mean Absolute Error (MAE): 5.797898813220963
  R² Score: 0.6840108299764677

Metrics for valeur_CO on Validation Set:
  Mean Squared Error (MSE): 0.003753000609980354
  Mean Absolute Error (MAE): 0.033867883230161676
  R² Score: 0.6368224731524509

Metrics for valeur_O3 on Validation Set:
  Mean Squared Error (MSE): 132.81592358107116
  Mean Absolute Error (MAE): 8.885468875781644
  R² Score: 0.809128018367318

Metrics for valeur_PM10 on Validation Set:
  Mean Squared Error (MSE): 38.62157142983751
  Mean Absolute Error (MAE): 4.470272616488838
  R² Score: 0.6840890318302979

Metrics for valeur_PM25 on Validation Set:
  Mean Squared Error (MSE): 17.213057918008808
  Mean Absolute Error (MAE): 2.922368988058865
  R² Score: 0.7333877938515186

Average MAE across all targets: 4.421975435356094



  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
