In [2]:
import os
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score)
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm

warnings.filterwarnings("ignore")


In [None]:
df_cleaned = pd.read_csv('../feature_engineering/clean_customer_dataNEW.csv')
df_cleaned

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std,total_transactions,active_months,avg_monthly_transaction_count,SPS,TSI,demographic_score
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,1.272727,0.939871,1.250000,0.452267,15.0,12.0,1.25,0.122274,0.485454,0.381798
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,1.285714,1.348381,1.357143,0.633324,19.0,14.0,1.36,0.087452,0.201175,0.168332
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,1.000000,0.732798,1.000000,0.000000,10.0,10.0,1.00,0.142663,0.633601,0.418683
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,1.266667,0.709893,1.312500,0.602080,21.0,16.0,1.31,0.111355,0.541213,0.428340
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,1.125000,0.871065,1.125000,0.353553,9.0,8.0,1.12,0.145064,0.539062,0.417684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939465,1185013,ES,0,53,2021-05-14,Y,KDB,1,116170.73,1,...,1.100000,1.455297,1.090909,0.301511,12.0,11.0,1.09,0.156184,0.223578,0.454709
939466,1168909,ES,1,43,2018-08-23,Y,KDB,1,5589.71,1,...,1.000000,1.409169,1.000000,0.000000,10.0,10.0,1.00,0.140508,0.295416,0.225875
939467,1173729,ES,0,33,2018-09-09,Y,KDB,1,19151.20,0,...,1.111111,0.724515,1.200000,0.421637,12.0,10.0,1.20,0.179521,0.579566,0.197636
939468,1164094,ES,1,54,2021-05-13,Y,KFC,0,13525.97,0,...,1.333333,1.033329,1.333333,1.154701,16.0,12.0,1.33,0.099246,0.639170,0.436374


In [None]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939470 entries, 0 to 939469
Data columns (total 38 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   customer_id                    939470 non-null  int64  
 1   residence_country              939470 non-null  object 
 2   gender                         939470 non-null  int64  
 3   age                            939470 non-null  int64  
 4   first_join_date                939470 non-null  object 
 5   residence_index                939470 non-null  object 
 6   channel_entrace                939470 non-null  object 
 7   activity_status                939470 non-null  int64  
 8   household_gross_income         939470 non-null  float64
 9   saving_account                 939470 non-null  int64  
 10  guarantees                     939470 non-null  int64  
 11  junior_account                 939470 non-null  int64  
 12  loans                         

In [None]:

# Define label columns
label_cols = [
    'saving_account', 'guarantees', 'junior_account', 'loans',
     'pension'
]

# Prepare features and labels
X_raw = df_cleaned.drop(columns=label_cols)
drop_cols = ['credit_card','direct_debit','customer_id', 'first_join_date', 'total_transactions',
             'avg_monthly_transaction_count', 'demographic_score', 'customer_segment','min_balance','max_balance','avg_balance']
X = X_raw.drop(columns=[col for col in drop_cols if col in X_raw.columns])
Y = df_cleaned[label_cols]

# Convert categorical columns
cat_cols = ['residence_country', 'residence_index', 'channel_entrace']
for col in cat_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

# Derive membership_days from first_join_date
if 'first_join_date' in df_cleaned.columns:
    join_date = pd.to_datetime(df_cleaned['first_join_date'], errors='coerce')
    X['membership_days'] = (pd.Timestamp.now() - join_date).dt.days

In [None]:

# Split into training and test sets first
X_train_full, X_test, Y_train_full, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# GridSearch hyperparameter space
param_grid = {
    'estimator__n_estimators': [100],
    'estimator__max_depth': [3, 5],
    'estimator__learning_rate': [0.1],
    'estimator__subsample': [1.0]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_macro_list = []
best_models = []

# KFold training on training split
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    print(f"\n Fold {fold + 1}/5")

    X_train, X_val = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
    Y_train, Y_val = Y_train_full.iloc[train_idx], Y_train_full.iloc[val_idx]

    base_model = XGBClassifier(
        tree_method='hist',
        eval_metric='logloss',
        use_label_encoder=False,
        enable_categorical=True,
        random_state=42
    )

    multi_model = MultiOutputClassifier(base_model, n_jobs=-1)

    grid_search = GridSearchCV(
        estimator=multi_model,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=3,
        verbose=1,
        n_jobs=-1,
        error_score='raise'
    )

    grid_search.fit(X_train, Y_train)

    best_model = grid_search.best_estimator_
    best_models.append(best_model)

    Y_pred = best_model.predict(X_val)
    f1_macro = f1_score(Y_val, Y_pred, average='macro')
    f1_macro_list.append(f1_macro)

    print(f"Fold {fold + 1} Best Parameters: {grid_search.best_params_}")
    print(f"Fold {fold + 1} Macro F1 Score: {f1_macro:.4f}")

# Summary
print("\n Average Macro F1 across folds:", np.mean(f1_macro_list))

# Save final model
final_model = best_models[-1]
joblib.dump(final_model, 'best_multilabel_model.pkl')
print(" Best model saved as best_multilabel_model.pkl")

# Predict on holdout test set
Y_test_pred = final_model.predict(X_test)





📂 Fold 1/5
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fold 1 Best Parameters: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__n_estimators': 100, 'estimator__subsample': 1.0}
Fold 1 Macro F1 Score: 0.4429

📂 Fold 2/5
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fold 2 Best Parameters: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 3, 'estimator__n_estimators': 100, 'estimator__subsample': 1.0}
Fold 2 Macro F1 Score: 0.4276

📂 Fold 3/5
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fold 3 Best Parameters: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__n_estimators': 100, 'estimator__subsample': 1.0}
Fold 3 Macro F1 Score: 0.4362

📂 Fold 4/5
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fold 4 Best Parameters: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__n_estimators': 100, 'estimator__subsample': 1.0}
Fold 4 Macro F1 Score: 0.4353

📂 Fold 5/5


In [None]:
# Model Evaluate uaing theesholds
custom_thresholds = {
    'saving_account': 0.3,
    'guarantees': 0.5,
    'junior_account': 0.5,
    'loans': 0.5,
    'pension': 0.3,
}


In [None]:
def predict_with_custom_threshold(model, X_input, label_cols, thresholds):
    """
    Apply per-label thresholds on predicted probabilities from MultiOutputClassifier.
    Returns binary predictions.
    """
    probas = model.predict_proba(X_input)
    preds = []

    for i, prob in enumerate(probas):
        threshold = thresholds[label_cols[i]]
        pred_label = (prob[:, 1] >= threshold).astype(int)
        preds.append(pred_label)

    return np.column_stack(preds)



In [None]:
# Apply prediction
Y_test_pred_custom = predict_with_custom_threshold(
    model=final_model,
    X_input=X_test,
    label_cols=label_cols,
    thresholds=custom_thresholds
)

# Evaluate
from sklearn.metrics import classification_report
print("\n Classification Report with Custom Thresholds:")
print(classification_report(Y_test, Y_test_pred_custom, target_names=label_cols))




📋 Classification Report with Custom Thresholds:
                precision    recall  f1-score   support

saving_account       0.45      0.99      0.62     84649
    guarantees       0.00      0.00      0.00         6
junior_account       0.72      0.85      0.78       198
         loans       1.00      1.00      1.00    112974
       pension       0.35      0.03      0.06      8876

     micro avg       0.66      0.95      0.78    206703
     macro avg       0.51      0.57      0.49    206703
  weighted avg       0.75      0.95      0.80    206703
   samples avg       0.62      0.76      0.66    206703



In [None]:
print("\n Evaluation with Custom Thresholds:\n")
Y_pred = final_model.predict(X_test)
Y_pred_proba = final_model.predict_proba(X_test)
for i, label in enumerate(label_cols):


    y_true = Y_test[label].values
    y_pred = Y_pred[:, i]
    y_proba = Y_pred_proba[i][:, 1]

    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_proba)

    print(f"   Product: {label}")
    print(f"   Accuracy: {acc:.4f}")
    print(f"   AUC: {auc:.4f}\n")


📊 Evaluation with Custom Thresholds:

   Product: saving_account
   Accuracy: 0.5683
   AUC: 0.5773

   Product: guarantees
   Accuracy: 1.0000
   AUC: 0.7315

   Product: junior_account
   Accuracy: 0.9995
   AUC: 0.9996

   Product: loans
   Accuracy: 0.9980
   AUC: 0.9999

   Product: pension
   Accuracy: 0.9528
   AUC: 0.8233



In [None]:
# defination the recommend function
def recommend_top_n_products_filtered(model, X_input, Y_current, label_cols, top_n=3):

    probas = model.predict_proba(X_input)
    probs_matrix = np.column_stack([p[:, 1] for p in probas])

    probs_df = pd.DataFrame(probs_matrix, columns=label_cols, index=Y_current.index)

    masked_probs_df = probs_df.mask(Y_current == 1, -1)

    recommendations = []
    for _, customer_probs in masked_probs_df.iterrows():
        top_products = customer_probs.sort_values(ascending=False).head(top_n).index.tolist()
        recommendations.append(top_products)

    result_df = pd.DataFrame({
        'customer_index': Y_current.index,
        'recommended_products': recommendations
    })

    return result_df


In [None]:

X_all = df_cleaned.drop(columns=label_cols)
Y_all = df_cleaned[label_cols]
customer_ids_all = df_cleaned['customer_id'].values


recommendations_all = recommend_top_n_products_filtered(
    model=final_model,
    X_input=X,
    Y_current=Y,
    label_cols=label_cols,
    top_n=3
)


recommendations_all['customer_id'] = customer_ids_all
recommendations_all = recommendations_all.drop(columns='customer_index')
recommendations_all = recommendations_all[['customer_id', 'recommended_products']]

# Save the recommendation csv
recommendations_all.to_csv('customer_data_recommendations.csv', index=False)

In [None]:
recommendations_all.head()

Unnamed: 0,customer_id,recommended_products
0,1375586,"[pension, loans, junior_account]"
1,1050611,"[pension, loans, junior_account]"
2,1050612,"[saving_account, pension, junior_account]"
3,1050613,"[saving_account, pension, junior_account]"
4,1050614,"[saving_account, pension, junior_account]"
