## Setup

In [2]:
# import all necessary libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the datasets
train_data = pd.read_csv("data/imputed_train_v2.csv")
test_data = pd.read_csv("data/test.csv")

from utils.preprocessing import preprocess_data
from utils.lagger import create_lags

# Integrate feature engineering
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [4]:
train_data.columns

Index(['id', 'valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10',
       'valeur_PM25', 'is_holiday', 'is_jour_ferie', 'precipitation',
       'wind_speed', 'temperature', 'humidity', 'pressure', 'visibility',
       'global_solar_radiation', 'car_flow', 'Year', 'Month', 'Day',
       'is_weekend', 'DayOfYear', 'HourOfDay', 'Weekday', 'DayOfYear_sin',
       'DayOfYear_cos', 'HourOfDay_sin', 'HourOfDay_cos', 'Weekday_sin',
       'Weekday_cos'],
      dtype='object')

In [9]:
# Create lags
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
columns_to_lag = [col for col in train_data.columns if col not in target_columns and col != 'id']
train_data = create_lags(train_data, columns_to_lag, 3)
test_data = create_lags(test_data, columns_to_lag, 3)

In [10]:
# Define target columns and separate features for training
X_train = train_data.drop(columns=target_columns + ['id'], errors='ignore')
y_train = train_data[target_columns]
X_test = test_data.drop(columns=['id'], errors='ignore')

## Training & Prediction

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

# Split the training data for validation
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Initialize the base XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=123
)

# Wrap the XGBoost model in MultiOutputRegressor for multi-target prediction
multi_target_model = MultiOutputRegressor(xgb_model)

# Train the model on the partial training set
multi_target_model.fit(X_train_part, y_train_part)

# Make predictions on the validation set
y_val_pred = multi_target_model.predict(X_val)

# Initialize list to store MAE for each target
mae_values = []

# Evaluate the model for each target column
for i, target in enumerate(target_columns):
    mse = mean_squared_error(y_val[target], y_val_pred[:, i])
    mae = mean_absolute_error(y_val[target], y_val_pred[:, i])
    r2 = r2_score(y_val[target], y_val_pred[:, i])
    
    mae_values.append(mae)
    
    print(f"Metrics for {target} on Validation Set:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R² Score: {r2}\n")

# Calculate the average MAE across all targets
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets: {average_mae}\n")

Metrics for valeur_NO2 on Validation Set:
  Mean Squared Error (MSE): 30.12347422591089
  Mean Absolute Error (MAE): 3.7735780215216064
  R² Score: 0.7360373800727043

Metrics for valeur_CO on Validation Set:
  Mean Squared Error (MSE): 0.000706728301314382
  Mean Absolute Error (MAE): 0.018804985331888194
  R² Score: 0.7650885295927199

Metrics for valeur_O3 on Validation Set:
  Mean Squared Error (MSE): 78.46019126359869
  Mean Absolute Error (MAE): 6.59976280782262
  R² Score: 0.8435866260109809

Metrics for valeur_PM10 on Validation Set:
  Mean Squared Error (MSE): 16.303034086272714
  Mean Absolute Error (MAE): 2.986491065923359
  R² Score: 0.7049222693798469

Metrics for valeur_PM25 on Validation Set:
  Mean Squared Error (MSE): 6.622948546477786
  Mean Absolute Error (MAE): 1.8950875234924058
  R² Score: 0.7206825163444375

Average MAE across all targets: 3.0547448808183755



In [17]:
test_data = pd.read_csv("data/test.csv")
test_data.head()

Unnamed: 0,id
0,2024-09-03 23
1,2024-09-04 00
2,2024-09-04 01
3,2024-09-04 02
4,2024-09-04 03


In [18]:
# create test set preditions
y_test_pred = multi_target_model.predict(X_test)

# Create DataFrame for predictions with the id column included
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions['id'] = test_data['id'].values  # Add the id column from test_data

# Save test predictions with id to a CSV file
test_predictions = test_predictions[['id'] + target_columns]  # Reorder to have id first
test_predictions.to_csv("submissions/test_predictions.csv", index=False)
