## Setup

In [3]:
# import all necessary libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Read the imputed training set and the test set from the local data directory
train_data = pd.read_csv("data/imputed_train.csv")
test_data = pd.read_csv("data/test.csv")

from utils.preprocessing import preprocess_data

# Run both frames through the project's shared preprocessing step
# (per the original note, this is where the holidays data gets integrated)
train_data = train_data.pipe(preprocess_data)
test_data = test_data.pipe(preprocess_data)

## CatBoost

## Hyperparameter tuning

In [4]:
## DO NOT RUN

from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for a multi-target CatBoost regressor.
# One CatBoostRegressor is fitted per target via MultiOutputRegressor, and the
# search scores each sampled configuration by negative mean absolute error.

# Define target columns and feature columns
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
feature_columns = [
    'is_holiday', 'is_jour_ferie', 'precipitation', 'wind_speed', 'temperature',
    'humidity', 'pressure', 'visibility', 'global_solar_radiation', 'Year', 'Month',
    'Weekday', 'Day', 'Hour', 'is_weekend', 'DayOfYear_sin', 'DayOfYear_cos',
    'HourOfDay_sin', 'HourOfDay_cos'
]

# Columns CatBoost should treat as categorical. They are listed by name and the
# positional indices are derived from feature_columns, so the indices cannot
# silently drift if feature_columns is reordered or extended.
# With the feature list above this resolves to [0, 1, 9, 10, 11, 12, 13, 14].
cat_feature_names = [
    'is_holiday', 'is_jour_ferie', 'Year', 'Month', 'Weekday', 'Day', 'Hour', 'is_weekend'
]
cat_features_index = [feature_columns.index(name) for name in cat_feature_names]

X_train = train_data[feature_columns]
y_train = train_data[target_columns]
X_test = test_data[feature_columns]

# Split the training data into training and validation sets.
# Only the training part is used by the search below; the validation part is
# held out and not touched in this cell.
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Define the parameter grid for CatBoost.
# The 'estimator__' prefix routes each parameter to the CatBoost model wrapped
# inside MultiOutputRegressor.
param_grid = {
    'estimator__learning_rate': [0.01, 0.03, 0.05, 0.1], # Learning rate
    'estimator__depth': [4, 6, 8, 10],                   # Depth of each tree
    'estimator__l2_leaf_reg': [1, 3, 5, 7],              # L2 regularization
    'estimator__bagging_temperature': [0.5, 1, 1.5],     # Subsampling randomness
}

# Initialize CatBoost with basic setup
catboost_model = CatBoostRegressor(
    random_seed=123,
    cat_features=cat_features_index,
    verbose=0
)

# Wrap CatBoost in MultiOutputRegressor
multi_target_model = MultiOutputRegressor(catboost_model)

# Setup RandomizedSearchCV with MultiOutputRegressor
random_search = RandomizedSearchCV(
    multi_target_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter combinations to try
    scoring='neg_mean_absolute_error',
    cv=3,       # 3-fold cross-validation
    random_state=123,
    n_jobs=-1   # Use all available cores
)

# Perform the hyperparameter tuning
random_search.fit(X_train_part, y_train_part)

# Best model after tuning
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)


## Training & Prediction

In [9]:
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Train a multi-target CatBoost model on an 80/20 split of the training data,
# report per-target validation metrics, then predict on the test set and write
# the predictions (with their ids) to a CSV file.

# Define target columns and feature columns
target_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
feature_columns = [
    'is_holiday', 'is_jour_ferie', 'precipitation', 'wind_speed', 'temperature',
    'humidity', 'pressure', 'visibility', 'global_solar_radiation', 'Year', 'Month',
    'Weekday', 'Day', 'Hour', 'is_weekend', 'DayOfYear_sin', 'DayOfYear_cos',
    'HourOfDay_sin', 'HourOfDay_cos'
]

# Positional indices (into feature_columns) of the columns considered categorical.
# NOTE(review): this list is not passed to CatBoostRegressor below, whereas the
# hyperparameter-tuning cell passes it as cat_features. Confirm which treatment
# is intended before relying on the tuned parameters.
cat_features_index = [0, 1, 9, 10, 11, 12, 13, 14]  # adjust this list based on your feature column indices

# Destination of the test-set predictions; shared by the write and the log
# message so the two cannot disagree.
predictions_path = "data/catboost_predictions.csv"

X_train = train_data[feature_columns]
y_train = train_data[target_columns]
X_test = test_data[feature_columns]

# Split the training data into training and validation sets
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Define the CatBoost parameters (acc to hyperparam tuning)
catboost_params = {
    'learning_rate': 0.1,
    'l2_leaf_reg': 1,
    'iterations': 700,
    'depth': 10,
    'bagging_temperature': 0.5,
    'verbose': 100,   # log training progress every 100 iterations
    'random_seed': 123
}

# Initialize the base CatBoost model
catboost_model = CatBoostRegressor(**catboost_params)

# Wrap the CatBoost model in MultiOutputRegressor for multi-target prediction
multi_target_model = MultiOutputRegressor(catboost_model)

# Train the model on the partial training set
multi_target_model.fit(X_train_part, y_train_part)

# Make predictions on the validation set; column i corresponds to target_columns[i]
y_val_pred = multi_target_model.predict(X_val)

# Initialize list to store MAE for each target
mae_values = []

# Evaluate the model for each target column
for i, target in enumerate(target_columns):
    mse = mean_squared_error(y_val[target], y_val_pred[:, i])
    mae = mean_absolute_error(y_val[target], y_val_pred[:, i])
    r2 = r2_score(y_val[target], y_val_pred[:, i])

    mae_values.append(mae)

    print(f"Metrics for {target} on Validation Set:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R² Score: {r2}\n")

# Calculate the average MAE across all targets
average_mae = sum(mae_values) / len(mae_values)
print(f"Average MAE across all targets: {average_mae}\n")

# Make predictions on the test set
y_test_pred = multi_target_model.predict(X_test)

# Create DataFrame for test predictions with the id column included
test_predictions = pd.DataFrame(y_test_pred, columns=target_columns)
test_predictions['id'] = test_data['id'].values  # Add the id column from test_data

# Reorder to have 'id' as the first column
test_predictions = test_predictions[['id'] + target_columns]

# Save test predictions with id to a CSV file and report the actual location
test_predictions.to_csv(predictions_path, index=False)
print(f"Test predictions saved to {predictions_path}")


0:	learn: 14.2817197	total: 35.7ms	remaining: 25s
100:	learn: 6.8325535	total: 3.65s	remaining: 21.7s
200:	learn: 5.4265905	total: 7.81s	remaining: 19.4s
300:	learn: 4.6124725	total: 12.8s	remaining: 16.9s
400:	learn: 4.0188104	total: 18.1s	remaining: 13.5s
500:	learn: 3.5942938	total: 22.6s	remaining: 8.97s
600:	learn: 3.2329877	total: 27s	remaining: 4.44s
699:	learn: 2.9422627	total: 30.3s	remaining: 0us
0:	learn: 0.0938885	total: 35.8ms	remaining: 25s
100:	learn: 0.0447919	total: 3.98s	remaining: 23.6s
200:	learn: 0.0339076	total: 13.7s	remaining: 34s
300:	learn: 0.0279403	total: 20.6s	remaining: 27.3s
400:	learn: 0.0238940	total: 26.3s	remaining: 19.6s
500:	learn: 0.0209824	total: 33.7s	remaining: 13.4s
600:	learn: 0.0188329	total: 42.3s	remaining: 6.97s
699:	learn: 0.0170188	total: 47.6s	remaining: 0us
0:	learn: 24.7665680	total: 56.7ms	remaining: 39.6s
100:	learn: 9.4791173	total: 6.29s	remaining: 37.3s
200:	learn: 7.6064031	total: 10.9s	remaining: 27s
300:	learn: 6.4515961	total