In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans

In [None]:
# Read the cleaned card dataset from the working directory
raw_data = pd.read_csv("./card_data_cleaned.csv")

# Cast the three label-like columns to pandas categoricals in one pass
raw_data = raw_data.astype({
    'gender': 'category',
    'region_city': 'category',
    'region_county': 'category',
})

# Partition on 'age': values below 7 form the young cohort, values above 6 the old one.
# NOTE(review): the two conditions are only complementary for integer-valued ages
# (rows with a missing age land in neither frame) -- confirm against the data.
# 'age' is removed from both frames once it has been used for the split.
young_data = raw_data.loc[raw_data['age'] < 7].drop(columns=['age'])
old_data = raw_data.loc[raw_data['age'] > 6].drop(columns=['age'])

In [None]:
# Preview the first ten rows of the young cohort
young_data.head(n=10)

In [None]:
# The five '*_tot' columns are the regression targets; every remaining column is a feature.
y_young = young_data[['cb03_tot', 'cc02_tot', 'cc03_tot', 'cc04_tot', 'cf18_tot']]
X_young = young_data.drop(columns=y_young.columns)

In [None]:
# Preview the first ten rows of the young-cohort targets
y_young.head(n=10)

In [None]:
# Preview the first ten rows of the young-cohort features
X_young.head(n=10)

In [None]:
# Same feature/target separation for the old cohort: the five '*_tot' columns are targets.
y_old = old_data[['cb03_tot', 'cc02_tot', 'cc03_tot', 'cc04_tot', 'cf18_tot']]
X_old = old_data.drop(columns=y_old.columns)

In [None]:
# Hold out 20% of the young cohort for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_young, y_young, test_size=0.2, random_state=42
)

# One XGBoost regressor is fitted per target column via MultiOutputRegressor.
# enable_categorical=True lets XGBoost consume the pandas 'category' columns directly.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.05,
    objective='reg:squarederror',
    random_state=42,
    enable_categorical=True,
    verbosity=0,
)
multi_xgb = MultiOutputRegressor(xgb_model)
multi_xgb.fit(X_train, y_train)

# Predict on the held-out young rows
y_pred_test = multi_xgb.predict(X_test)

# Evaluate on the held-out young rows (metrics are averaged uniformly over the targets)
mae = mean_absolute_error(y_test, y_pred_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used.
# Instead take the root of each target's MSE and average those, which works on every
# release and matches the per-target RMSE averaging of recent scikit-learn versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test, multioutput='raw_values')).mean()
r2 = r2_score(y_test, y_pred_test)
print(f'Young Test MAE: {mae:.4f}')
print(f'Young Test RMSE: {rmse:.4f}')
print(f'Young Test R²: {r2:.4f}')

In [None]:
# Scale reference: the mean of the per-target means of the held-out young targets
y_test_mean = y_test.mean(axis=0).mean()

# Young-test MAE relative to that reference (printed as a ratio, 4 decimals)
print("MAE percentage: {:.4f}".format(mae / y_test_mean))

In [None]:
# Apply the model trained on the young cohort to the old cohort
y_pred_old = multi_xgb.predict(X_old)

# Evaluate on old data (metrics are averaged uniformly over the targets)
mae_old = mean_absolute_error(y_old, y_pred_old)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used.
# Instead take the root of each target's MSE and average those, which works on every
# release and matches the per-target RMSE averaging of recent scikit-learn versions.
rmse_old = np.sqrt(mean_squared_error(y_old, y_pred_old, multioutput='raw_values')).mean()
r2_old = r2_score(y_old, y_pred_old)
print(f'Old Data MAE: {mae_old:.4f}')
print(f'Old Data RMSE: {rmse_old:.4f}')
print(f'Old Data R²: {r2_old:.4f}')

In [None]:
# Scale reference: the mean of the per-target means of the old-cohort targets
y_old_mean = y_old.mean(axis=0).mean()

# Old-cohort MAE relative to that reference (printed as a ratio, 4 decimals)
print("MAE percentage: {:.4f}".format(mae_old / y_old_mean))

In [None]:
# Wrap the raw prediction array so it aligns with y_old by row index and column name,
# then take residuals as actual minus predicted.
y_pred_old_df = pd.DataFrame(data=y_pred_old, index=y_old.index, columns=y_old.columns)
old_diff = y_old.sub(y_pred_old_df)

# A negative residual means the prediction exceeded the actual value.
# Count those cells over all targets and report their share of all cells.
negative_count = old_diff.lt(0).sum().sum()
print('Number of negative entries in old_diff: {}'.format(negative_count))
print('Percentage of negative entries in old_diff: {:.4f}'.format(negative_count / old_diff.size))

In [None]:
# Feature importances: one column per target, one row per input feature.
# The table is built in a single constructor call and indexed by feature name, so that
# the plot and any printout identify features directly instead of by bare position.
target_vars = ['cb03_tot', 'cc02_tot', 'cc03_tot', 'cc04_tot', 'cf18_tot']
importances = pd.DataFrame(
    {
        target: estimator.feature_importances_
        for target, estimator in zip(target_vars, multi_xgb.estimators_)
    },
    index=X_train.columns,
)

# Rank features by their mean importance across the per-target models
importances['Average'] = importances.mean(axis=1)
importances = importances.sort_values('Average', ascending=False)

# Plot feature importances as horizontal bars: importance on x, feature names on y,
# so the axis labels below describe what is actually drawn. The figure height grows
# with the number of features to keep the labels legible.
fig, ax = plt.subplots(figsize=(10, max(6, 0.25 * len(importances))))
ax.barh(
    importances.index.astype(str),
    importances['Average'],
    # Explicit per-bar colours avoid seaborn's deprecated `palette`-without-`hue` usage.
    color=sns.color_palette('viridis', n_colors=len(importances)),
)
ax.invert_yaxis()  # barh draws the first row at the bottom; put the top feature first
ax.set_title('Feature Importances (Average across Targets)')
ax.set_xlabel('Average Importance')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()

In [None]:
# Show the five highest-ranked rows of the importance table
print(importances.head(5))

# Name of the feature column at position 53 of X_old.
# NOTE(review): 53 is hard-coded -- presumably read off the ranking printed above; confirm.
X_old.columns.to_list()[53]