## Setup

In [2]:
# import all necessary libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the datasets
train_data = pd.read_csv("data/imputed_train.csv")
test_data = pd.read_csv("data/test.csv")

from utils.preprocessing import preprocess_data

# Integrate holidays data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Define target columns and separate features for training
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
X_train = train_data.drop(columns=target_columns + ['id'], errors='ignore')
y_train = train_data[target_columns]
X_test = test_data.drop(columns=['id'], errors='ignore')

In [3]:
# Import necessary libraries for CatBoost, Linear Regression, and stacking
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
import numpy as np

# Split the training data for validation (used in stacking)
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Define parameters for the CatBoost model
catboost_params = {
    'learning_rate': 0.1,
    'iterations': 700,
    'depth': 10,
    'random_seed': 123,
    'verbose': 100
}
catboost_model = CatBoostRegressor(**catboost_params)

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=123
)

# Wrap CatBoost and XGBoost in MultiOutputRegressor for multi-target predictions
multi_catboost_model = MultiOutputRegressor(catboost_model)
multi_xgb_model = MultiOutputRegressor(xgb_model)

# Train both models on the partial training set
multi_catboost_model.fit(X_train_part, y_train_part)
multi_xgb_model.fit(X_train_part, y_train_part)

# Generate predictions from both models on the validation set (for stacking)
catboost_val_preds = multi_catboost_model.predict(X_val)
xgb_val_preds = multi_xgb_model.predict(X_val)

# Stack the predictions as new features for the meta-model
stacked_val_features = np.hstack((catboost_val_preds, xgb_val_preds))

# Initialize the meta-model (using Linear Regression for simplicity)
meta_model = LinearRegression()

# Train the meta-model on the stacked features
meta_model.fit(stacked_val_features, y_val)

# Generate predictions from both models on the test set
catboost_test_preds = multi_catboost_model.predict(X_test)
xgb_test_preds = multi_xgb_model.predict(X_test)

# Stack the test predictions for the meta-model
stacked_test_features = np.hstack((catboost_test_preds, xgb_test_preds))

# Make final predictions on the test set using the meta-model
final_test_predictions = meta_model.predict(stacked_test_features)

# Create DataFrame for test predictions with the id column included
test_predictions = pd.DataFrame(final_test_predictions, columns=target_columns)
test_predictions['id'] = test_data['id'].values  # Add the id column from test_data

# Reorder to have 'id' as the first column and save the predictions
test_predictions = test_predictions[['id'] + target_columns]
test_predictions.to_csv("data/stacked_model_predictions.csv", index=False)
print("Stacked model test predictions saved to data/stacked_model_predictions.csv")


0:	learn: 10.4208670	total: 284ms	remaining: 3m 18s
100:	learn: 5.3753682	total: 18.5s	remaining: 1m 49s
200:	learn: 4.4090025	total: 36.9s	remaining: 1m 31s
300:	learn: 3.8127263	total: 1m	remaining: 1m 20s
400:	learn: 3.3923968	total: 1m 22s	remaining: 1m 1s
500:	learn: 3.0393703	total: 1m 37s	remaining: 38.6s
600:	learn: 2.7707141	total: 1m 57s	remaining: 19.3s
699:	learn: 2.5463535	total: 2m 17s	remaining: 0us
0:	learn: 0.0527255	total: 177ms	remaining: 2m 3s
100:	learn: 0.0255961	total: 18.7s	remaining: 1m 50s
200:	learn: 0.0204536	total: 37.7s	remaining: 1m 33s
300:	learn: 0.0176053	total: 1m 4s	remaining: 1m 24s
400:	learn: 0.0157738	total: 1m 22s	remaining: 1m 1s
500:	learn: 0.0141966	total: 1m 40s	remaining: 39.8s
600:	learn: 0.0128407	total: 1m 59s	remaining: 19.8s
699:	learn: 0.0118113	total: 2m 17s	remaining: 0us
0:	learn: 21.2106879	total: 131ms	remaining: 1m 31s
100:	learn: 8.7458961	total: 21.4s	remaining: 2m 7s
200:	learn: 6.9697835	total: 45.1s	remaining: 1m 51s
300:	l

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Stacked model test predictions saved to data/stacked_model_predictions.csv


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
