In [None]:
# Install the required packages
!pip install xgboost catboost optuna

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Load the two card data tables from CSV files in the working directory.
card_data_summed = pd.read_csv('card_data_summed.csv')
card_data_other = pd.read_csv('card_data_other.csv')

In [None]:
# Define train test validation split
def train_test_val_split(data, train_size=0.8, test_size=0.1, val_size=0.1, random_state=42):
    """Split ``data`` into train, test and validation subsets.

    The split is done in two stages with ``train_test_split``: first the
    training part is separated from a hold-out part, then the hold-out part is
    divided into test and validation according to ``test_size : val_size``.

    Parameters:
    - data: any object accepted by ``train_test_split``.
    - train_size (float): fraction of rows assigned to the training subset.
    - test_size (float): fraction of rows assigned to the test subset.
    - val_size (float): fraction of rows assigned to the validation subset.
    - random_state: seed forwarded to both ``train_test_split`` calls
      (default 42, the value that used to be hard-coded).

    Returns:
    - (train, test, val): the three subsets, in that order.

    Raises:
    - ValueError: if any fraction is not positive or the fractions do not
      sum to 1 (previously such input was silently reinterpreted).
    """
    fractions = (train_size, test_size, val_size)
    if any(fraction <= 0 for fraction in fractions):
        raise ValueError(
            f"train_size, test_size and val_size must all be positive, got {fractions}"
        )
    if abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError(
            f"train_size + test_size + val_size must equal 1, got {sum(fractions)}"
        )

    # Stage 1: everything that is not training data goes to the hold-out part.
    train, holdout = train_test_split(
        data, test_size=1-train_size, random_state=random_state
    )
    # Stage 2: share the hold-out part between test and validation.
    test, val = train_test_split(
        holdout, test_size=val_size/(test_size+val_size), random_state=random_state
    )
    return train, test, val


# Select the model inputs (pk* columns) and the regression targets (c* columns)
feature_columns = ['pk1', 'pk2', 'pk3', 'pk4', 'pk5']
target_columns = ['ca', 'cb', 'cc', 'cd', 'ce', 'cf']

X = card_data_summed[feature_columns]
y = card_data_summed[target_columns]

In [None]:
# Split the data
# Draw the row positions once and apply them to both frames, so that features
# and targets are aligned by construction instead of relying on two separate
# shuffles of X and y happening to produce the same order.
train_pos, test_pos, val_pos = train_test_val_split(np.arange(len(X)))

X_train, X_test, X_val = X.iloc[train_pos], X.iloc[test_pos], X.iloc[val_pos]
y_train, y_test, y_val = y.iloc[train_pos], y.iloc[test_pos], y.iloc[val_pos]

In [None]:
# Build and fit the XGBoost model
def create_xgb_model(params):
    """Fit a multi-output XGBoost regressor on the training split.

    Parameters:
    - params (dict): keyword arguments passed to ``XGBRegressor``.

    Returns:
    - The ``MultiOutputRegressor`` wrapping the ``XGBRegressor``, after it has
      been fitted on the notebook-level ``X_train`` / ``y_train``.
    """
    wrapped = MultiOutputRegressor(XGBRegressor(**params))
    wrapped.fit(X_train, y_train)
    return wrapped

# Build and fit the CatBoost model
def create_cb_model(params):
    """Fit a CatBoost regressor on the training split.

    Parameters:
    - params (dict): keyword arguments passed to ``CatBoostRegressor``.

    Returns:
    - The ``CatBoostRegressor`` after fitting on the notebook-level
      ``X_train`` / ``y_train`` with ``verbose=False``.
    """
    regressor = CatBoostRegressor(**params)
    # verbose=False is passed to fit() as well, regardless of what params holds.
    regressor.fit(X_train, y_train, verbose=False)
    return regressor

# Score a fitted model on a feature/target pair
def evaluate_multi_target(model, X, y):
    """Predict with ``model`` on ``X`` and score the predictions against ``y``.

    Parameters:
    - model: object exposing ``predict(X)``.
    - X: features to predict on.
    - y: true target values for ``X``.

    Returns:
    - (mae, rmse, r2): results of ``mean_absolute_error``,
      ``root_mean_squared_error`` and ``r2_score`` with their default settings.
    """
    predictions = model.predict(X)
    return (
        mean_absolute_error(y, predictions),
        root_mean_squared_error(y, predictions),
        r2_score(y, predictions),
    )

In [None]:
# Baseline XGBoost: fixed hyperparameters, scored on the validation split
xgb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',  # Standard regression objective
)

xgb_trained_model = create_xgb_model(xgb_params)
xgb_mae, xgb_rmse, xgb_r2 = evaluate_multi_target(xgb_trained_model, X_val, y_val)

print('XGBoost MAE:', xgb_mae)
print('XGBoost RMSE:', xgb_rmse)
print('XGBoost R²:', xgb_r2)

In [None]:
# Baseline CatBoost: fixed hyperparameters, scored on the validation split
cb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    loss_function='MultiRMSE',  # Suitable for multi-target regression
    verbose=False,
)

cb_trained_model = create_cb_model(cb_params)
cb_mae, cb_rmse, cb_r2 = evaluate_multi_target(cb_trained_model, X_val, y_val)

print('CatBoost MAE:', cb_mae)
print('CatBoost RMSE:', cb_rmse)
print('CatBoost R²:', cb_r2)

In [None]:
# Compare the baseline models: one bar-chart panel per validation metric
models = ['XGBoost', 'CatBoost']
mae = [xgb_mae, cb_mae]
rmse = [xgb_rmse, cb_rmse]
r2 = [xgb_r2, cb_r2]

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
panels = (
    (mae, 'Mean Absolute Error'),
    (rmse, 'Root Mean Squared Error'),
    (r2, 'R2 Score'),
)
for axis, (values, title) in zip(ax, panels):
    axis.bar(models, values, color=['blue', 'green'])
    axis.set_title(title)
plt.show()


In [None]:
# Import optuna
import optuna

In [None]:
# Optuna objective for XGBoost
def objective_xgb(trial):
    """
    Score one Optuna trial of XGBoost hyperparameters.

    Samples n_estimators, max_depth and learning_rate from the trial, fits a
    model through create_xgb_model and scores it on the validation split.

    Parameters:
    - trial: Optuna trial object used to sample the hyperparameters.

    Returns:
    - mae (float): validation Mean Absolute Error, the quantity to minimize.
    """
    # Sampled values first, then the settings that stay fixed for every trial
    search_point = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        objective='reg:squarederror',
        random_state=42,
        verbosity=0,  # Silent mode
    )

    candidate = create_xgb_model(search_point)

    # Only the first metric (MAE) drives the optimization
    val_mae = evaluate_multi_target(candidate, X_val, y_val)[0]
    return val_mae



# Create an Optuna study for XGBoost.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable; the timeout can still change how many trials finish in a run.
study_xgb = optuna.create_study(
    direction='minimize',
    study_name='XGBoost Optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the objective function
study_xgb.optimize(objective_xgb, n_trials=100, timeout=600)  # Adjust n_trials and timeout as needed

# Get the best hyperparameters and add back the settings that are fixed for every trial
xgb_best_params = {
    **study_xgb.best_params,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'verbosity': 0,
}

print("Best Hyperparameters for XGBoost:")
print(xgb_best_params)
print("\n")

# -----------------------------
# Train and Evaluate XGBoost with Best Hyperparameters
# -----------------------------
# Train the best XGBoost model
xgb_best_model = create_xgb_model(xgb_best_params)

# Validation metrics. The hyperparameters were selected on this same split,
# so these numbers are optimistic.
xgb_best_mae, xgb_best_rmse, xgb_best_r2 = evaluate_multi_target(xgb_best_model, X_val, y_val)
print('XGBoost Best MAE:', xgb_best_mae)
print('XGBoost Best RMSE:', xgb_best_rmse)
print('XGBoost Best R²:', xgb_best_r2)
print("\n")

# Held-out test metrics. The test split was not used for training or tuning,
# so this is the unbiased estimate of the tuned model's performance.
xgb_test_mae, xgb_test_rmse, xgb_test_r2 = evaluate_multi_target(xgb_best_model, X_test, y_test)
print('XGBoost Test MAE:', xgb_test_mae)
print('XGBoost Test RMSE:', xgb_test_rmse)
print('XGBoost Test R²:', xgb_test_r2)
print("\n")


In [None]:
# Optuna objective for CatBoost
def objective_cb(trial):
    """
    Score one Optuna trial of CatBoost hyperparameters.

    Samples n_estimators, max_depth and learning_rate from the trial, fits a
    model through create_cb_model and scores it on the validation split.

    Parameters:
    - trial: Optuna trial object used to sample the hyperparameters.

    Returns:
    - mae (float): validation Mean Absolute Error, the quantity to minimize.
    """
    # Sampled values first, then the settings that stay fixed for every trial
    search_point = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        loss_function='MultiRMSE',
        random_state=42,
        verbose=False,
    )

    candidate = create_cb_model(search_point)

    # Only the first metric (MAE) drives the optimization
    val_mae = evaluate_multi_target(candidate, X_val, y_val)[0]
    return val_mae



# Create an Optuna study for CatBoost.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable; the timeout can still change how many trials finish in a run.
study_cb = optuna.create_study(
    direction='minimize',
    study_name='CatBoost Optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the objective function
study_cb.optimize(objective_cb, n_trials=100, timeout=600)  # Adjust n_trials and timeout as needed

# Get the best hyperparameters and add back the settings that are fixed for every trial
cb_best_params = {
    **study_cb.best_params,
    'loss_function': 'MultiRMSE',
    'random_state': 42,
    'verbose': False,
}

print("Best Hyperparameters for CatBoost:")
print(cb_best_params)
print("\n")

# -----------------------------
# Train and Evaluate CatBoost with Best Hyperparameters
# -----------------------------
# Train the best CatBoost model
cb_best_model = create_cb_model(cb_best_params)

# Validation metrics. The hyperparameters were selected on this same split,
# so these numbers are optimistic.
cb_best_mae, cb_best_rmse, cb_best_r2 = evaluate_multi_target(cb_best_model, X_val, y_val)
print('CatBoost Best MAE:', cb_best_mae)
print('CatBoost Best RMSE:', cb_best_rmse)
print('CatBoost Best R²:', cb_best_r2)
print("\n")

# Held-out test metrics. The test split was not used for training or tuning,
# so this is the unbiased estimate of the tuned model's performance.
cb_test_mae, cb_test_rmse, cb_test_r2 = evaluate_multi_target(cb_best_model, X_test, y_test)
print('CatBoost Test MAE:', cb_test_mae)
print('CatBoost Test RMSE:', cb_test_rmse)
print('CatBoost Test R²:', cb_test_r2)
print("\n")


In [None]:
# Compare the tuned models: one bar-chart panel per validation metric
models = ['XGBoost', 'CatBoost']
mae = [xgb_best_mae, cb_best_mae]
rmse = [xgb_best_rmse, cb_best_rmse]
r2 = [xgb_best_r2, cb_best_r2]

fig, ax = plt.subplots(1, 3, figsize=(18, 6))

# (values, panel title, y-axis label) for each metric, in left-to-right order
panels = (
    (mae, 'Mean Absolute Error', 'MAE'),
    (rmse, 'Root Mean Squared Error', 'RMSE'),
    (r2, 'R² Score', 'R²'),
)
for axis, (values, title, ylabel) in zip(ax, panels):
    axis.bar(models, values, color=['skyblue', 'lightgreen'])
    axis.set_title(title)
    axis.set_ylabel(ylabel)

plt.tight_layout()
plt.show()