In [7]:
# Fichier : 13.modeling_chatbot_proxy_model.ipynb (Version Définitive)

import pandas as pd
import lightgbm as lgb
import joblib
import os

print("--- Entraînement du Modèle Proxy COMPLET ---")

# --- 1. Artefact locations, dataset and neutral expert model ---
# Every path used by this cell is declared here so inputs and outputs can be
# read at a glance.
input_path = "../../data/processed/dataset_clean_no_outliers.parquet"
model_expert_path = "../../models/step1_risk_model_NEUTRAL.pkl"
model_expert_cols_path = "../../models/step1_risk_model_columns.pkl"
output_dir = "../../models/"
proxy_model_path = os.path.join(output_dir, "chatbot_proxy_model_COMPLETE.pkl")
proxy_model_cols_path = os.path.join(output_dir, "chatbot_proxy_model_COMPLETE_columns.pkl")

df = pd.read_parquet(input_path)
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
model_expert_neutral, model_expert_columns = (
    joblib.load(model_expert_path),
    joblib.load(model_expert_cols_path),
)
print("✅ Données et modèle expert NEUTRE chargés.")

# --- 2. Regression target: scores produced by the expert model ---
# Raw columns fed to the expert model before one-hot encoding.
selected_features_for_risk_model = [
    'term',
    'annual_inc',
    'dti',
    'revol_util',
    'revol_bal',
    'loan_amnt',
    'emp_length',
    'home_ownership',
    'purpose',
    'verification_status',
    'mort_acc',
    'pub_rec',
    'open_acc',
    'total_acc',
]
# One-hot encode, then align on the column list saved with the expert model:
# columns absent from this encoding are created and filled with 0, extra ones
# are dropped.
X_expert = (
    pd.get_dummies(df[selected_features_for_risk_model], drop_first=True, dtype=float)
    .reindex(columns=model_expert_columns, fill_value=0)
)
# Column 1 of predict_proba is used as the target
# (presumably the positive/default class — confirm against the expert model).
y_target = model_expert_neutral.predict_proba(X_expert)[:, 1]
print("✅ Cible (scores neutres) générée.")

# --- 3. Inputs of the COMPLETE proxy ---
# Full list of fields of the detailed form.
features_for_proxy_complete = [
    'loan_amnt',
    'annual_inc',
    'purpose',
    'dti',
    'emp_length',
    'home_ownership',
    'revol_bal',
    'revol_util',
    'total_acc',
    'open_acc',
    'mort_acc',
    'pub_rec',
    'pub_rec_bankruptcies',
    'term',
    'verification_status',
]
X_proxy = pd.get_dummies(df[features_for_proxy_complete], drop_first=True, dtype=float)

# --- 4. Fit the proxy regressor and persist it with its column order ---
# fit() returns the fitted estimator itself, so it can be chained.
proxy_model = lgb.LGBMRegressor(random_state=42).fit(X_proxy, y_target)

# The encoded column list is saved next to the model so the same layout can be
# rebuilt when the model is loaded elsewhere.
for artefact, destination in (
    (proxy_model, proxy_model_path),
    (X_proxy.columns.tolist(), proxy_model_cols_path),
):
    joblib.dump(artefact, destination)
print("✅ Modèle proxy COMPLET et ses colonnes sauvegardés.")

--- Entraînement du Modèle Proxy COMPLET ---
✅ Données et modèle expert NEUTRE chargés.
✅ Cible (scores neutres) générée.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1536
[LightGBM] [Info] Number of data points in the train set: 366214, number of used features: 32
[LightGBM] [Info] Start training from score 0.192686
✅ Modèle proxy COMPLET et ses colonnes sauvegardés.


In [8]:
print("--- Entraînement du Modèle Proxy SIMPLIFIÉ ---")

# --- 1. Load the dataset and the neutral expert model ---
input_path = "../../data/processed/dataset_clean_no_outliers.parquet"
model_expert_path = "../../models/step1_risk_model_NEUTRAL.pkl"
model_expert_cols_path = "../../models/step1_risk_model_columns.pkl"
output_dir = "../../models/"

df = pd.read_parquet(input_path)
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
model_expert_neutral = joblib.load(model_expert_path)
model_expert_columns = joblib.load(model_expert_cols_path)
print("✅ Données et modèle expert NEUTRE chargés.")

# --- 2. Regression target: scores produced by the expert model ---
# Raw columns fed to the expert model before one-hot encoding.
selected_features_for_risk_model = [
    'term', 'annual_inc', 'dti', 'revol_util', 'revol_bal', 'loan_amnt',
    'emp_length', 'home_ownership', 'purpose', 'verification_status',
    'mort_acc', 'pub_rec', 'open_acc', 'total_acc'
]
X_expert = df[selected_features_for_risk_model]
X_expert = pd.get_dummies(X_expert, drop_first=True, dtype=float)
# Align on the column list saved with the expert model: missing dummy columns
# are created and filled with 0, unexpected ones are dropped.
X_expert = X_expert.reindex(columns=model_expert_columns, fill_value=0)
# Column 1 of predict_proba is used as the target
# (presumably the positive/default class — confirm against the expert model).
y_target = model_expert_neutral.predict_proba(X_expert)[:, 1]
print("✅ Cible (scores neutres) générée.")

# --- 3. Inputs of the SIMPLIFIED proxy ---
# Only the 7 fields of the final form are real inputs.
form_fields_simple = [
    'loan_amnt',
    'annual_inc',
    'term',
    'dti',
    'emp_length',
    'home_ownership',
    'purpose'
]
# 'verification_status' is pinned to a constant for this proxy. The constant is
# attached to a *selection* via assign() instead of being written into `df`:
# overwriting df['verification_status'] would destroy the real values that
# y_target was computed from and leak the change into every later cell that
# reads `df`.
FIXED_VERIFICATION_STATUS = 'Verified'
features_for_proxy_simple = form_fields_simple + ['verification_status']
X_proxy_raw = df[form_fields_simple].assign(verification_status=FIXED_VERIFICATION_STATUS)

# NOTE: with drop_first=True a single-level column encodes to zero dummy
# columns, so the pinned field adds no feature to the trained model. It is
# kept so the encoded column list stays identical to previously saved
# artefacts.
X_proxy = pd.get_dummies(X_proxy_raw, drop_first=True, dtype=float)

# --- 4. Fit the proxy regressor and persist it with its column order ---
proxy_model_simple = lgb.LGBMRegressor(random_state=42)
proxy_model_simple.fit(X_proxy, y_target)

# Saved under dedicated names so the COMPLETE proxy artefacts are not overwritten.
proxy_model_path = os.path.join(output_dir, "chatbot_proxy_model_SIMPLE.pkl")
proxy_model_cols_path = os.path.join(output_dir, "chatbot_proxy_model_SIMPLE_columns.pkl")

joblib.dump(proxy_model_simple, proxy_model_path)
joblib.dump(X_proxy.columns.tolist(), proxy_model_cols_path)
print("✅ Modèle proxy SIMPLIFIÉ et ses colonnes sauvegardés.")

--- Entraînement du Modèle Proxy SIMPLIFIÉ ---
✅ Données et modèle expert NEUTRE chargés.
✅ Cible (scores neutres) générée.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 816
[LightGBM] [Info] Number of data points in the train set: 366214, number of used features: 23
[LightGBM] [Info] Start training from score 0.192686
✅ Modèle proxy SIMPLIFIÉ et ses colonnes sauvegardés.
