## Setup

In [1]:
# import all necessary libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the datasets
train_data = pd.read_csv("data/imputed_train_v2.csv")
test_data = pd.read_csv("data/test.csv")

from utils.preprocessing import preprocess_data
from utils.lagger import create_lags_advanced

# Integrate feature engineering
train_data = preprocess_data(train_data, add_filter_outliers=False)
test_data = preprocess_data(test_data, add_filter_outliers=False)

In [2]:
train_data.columns

Index(['id', 'valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10',
       'valeur_PM25', 'is_holiday', 'is_jour_ferie', 'precipitation',
       'wind_speed', 'temperature', 'humidity', 'pressure', 'visibility',
       'global_solar_radiation', 'car_flow', 'Year', 'Month', 'Day',
       'is_weekend', 'DayOfYear', 'HourOfDay', 'Weekday', 'DayOfYear_sin',
       'DayOfYear_cos', 'HourOfDay_sin', 'HourOfDay_cos', 'Weekday_sin',
       'Weekday_cos'],
      dtype='object')

In [3]:
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
lag_dict = {col:[1, 2, 24, 24*7, 24*365] for col in target_columns}
lag_dict.update({'global_solar_radiation': [3], 'temperature': [3], 'humidity': [3], 'pressure': [3], 'precipitation': [3]})
lag_dict

{'valeur_NO2': [1, 2, 24, 168, 8760],
 'valeur_CO': [1, 2, 24, 168, 8760],
 'valeur_O3': [1, 2, 24, 168, 8760],
 'valeur_PM10': [1, 2, 24, 168, 8760],
 'valeur_PM25': [1, 2, 24, 168, 8760],
 'global_solar_radiation': [3],
 'temperature': [3],
 'humidity': [3],
 'pressure': [3],
 'precipitation': [3]}

In [4]:
features = ['is_holiday', 'precipitation',
       'wind_speed', 'temperature', 'humidity', 'pressure', 'visibility',
       'global_solar_radiation', 'Year',
       'is_weekend', 'Weekday', 'DayOfYear_sin',
       'DayOfYear_cos', 'HourOfDay_sin', 'HourOfDay_cos', 'Weekday_sin',
       'Weekday_cos']
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']


In [6]:
# Create lags

target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
columns_to_lag = [col for col in train_data.columns if col not in target_columns and col != 'id']
train_data = create_lags_advanced(train_data, lag_dict)
test_data = create_lags_advanced(test_data, {key:value for key, value in lag_dict.items() if key not in target_columns})

In [9]:
# Define target columns and separate features for training
X_train = train_data[features]
y_train = train_data[target_columns]

## Training & Prediction

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit()


# Split the training data for validation
X_train_part, X_val, y_train_part, y_val = time_series_split(X_train, y_train, test_size=0.2, random_state=123)

# Initialize the base XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=123
)

# Wrap the XGBoost model in MultiOutputRegressor for multi-target prediction
multi_target_model = MultiOutputRegressor(xgb_model)

# Train the model on the partial training set
multi_target_model.fit(X_train_part, y_train_part)

# Make predictions on the validation set
y_val_pred = multi_target_model.predict(X_val)

# Initialize list to store MAE for each target
mae_values = []

# Evaluate the model for each target column
for i, target in enumerate(target_columns):
    mse = mean_squared_error(y_val[target], y_val_pred[:, i])
    mae = mean_absolute_error(y_val[target], y_val_pred[:, i])
    r2 = r2_score(y_val[target], y_val_pred[:, i])
    
    mae_values.append(mae)
    
    print(f"Metrics for {target} on Validation Set:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R² Score: {r2}\n")

# Calculate the average MAE across all targets
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets: {average_mae}\n")

Metrics for valeur_NO2 on Validation Set:
  Mean Squared Error (MSE): 54.85039199811919
  Mean Absolute Error (MAE): 4.636102644841404
  R² Score: 0.7510948191821226

Metrics for valeur_CO on Validation Set:
  Mean Squared Error (MSE): 0.0034822072587580486
  Mean Absolute Error (MAE): 0.025913145345265697
  R² Score: 0.6630427497753936

Metrics for valeur_O3 on Validation Set:
  Mean Squared Error (MSE): 92.94253715504057
  Mean Absolute Error (MAE): 7.051200684802714
  R² Score: 0.866431366943717

Metrics for valeur_PM10 on Validation Set:
  Mean Squared Error (MSE): 28.3854938572327
  Mean Absolute Error (MAE): 3.5404929525091737
  R² Score: 0.767264032809854

Metrics for valeur_PM25 on Validation Set:
  Mean Squared Error (MSE): 12.427747963750893
  Mean Absolute Error (MAE): 2.292364673309434
  R² Score: 0.8074992306680664

Average MAE across all targets: 3.5092148201615982



In [24]:
df_results = pd.DataFrame(index=X_train_part.columns)
for i, target in enumerate(target_columns):
    df_results[target] = multi_target_model.estimators_[i].feature_importances_ * 100

df_results = df_results.sort_values('valeur_O3', ascending=False)
df_results

Unnamed: 0,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
DayOfYear_cos_lag_2,3.878893,1.426030,3.795556,1.455715,1.434176
DayOfYear_cos_lag_1,5.257946,10.789968,3.395255,1.402177,1.709283
humidity,0.613887,0.534325,2.945261,1.096144,0.747280
DayOfYear_sin_lag_1,1.099108,1.192553,2.402415,1.481380,1.421804
DayOfYear_lag_3,1.183089,1.110124,2.041009,1.433556,1.613788
...,...,...,...,...,...
HourOfDay,0.821785,0.645364,0.608179,0.589643,0.629996
wind_speed,0.699159,0.512965,0.561860,0.449872,0.491435
is_jour_ferie,0.604871,0.663615,0.525703,0.598141,0.723746
precipitation,0.363071,0.338990,0.388577,0.527296,0.801401


In [17]:
test_data = pd.read_csv("data/test.csv")
test_data.head()

Unnamed: 0,id
0,2024-09-03 23
1,2024-09-04 00
2,2024-09-04 01
3,2024-09-04 02
4,2024-09-04 03


In [18]:
# create test set preditions
y_test_pred = multi_target_model.predict(X_test)

# Create DataFrame for predictions with the id column included
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions['id'] = test_data['id'].values  # Add the id column from test_data

# Save test predictions with id to a CSV file
test_predictions = test_predictions[['id'] + target_columns]  # Reorder to have id first
test_predictions.to_csv("submissions/test_predictions.csv", index=False)
