In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the pre-processed dataset from parquet.
input_path = "../../data/processed/dataset_clean_no_outliers.parquet"
df = pd.read_parquet(input_path)

# Ordinal encoding of the target: the distinct grades, sorted ascending, are
# mapped to 1..n. Rows whose grade is missing get NaN in `grade_encoded`.
grade_order = sorted(df['grade'].dropna().unique())
grade_mapping = {grade: idx + 1 for idx, grade in enumerate(grade_order)}
df['grade_encoded'] = df['grade'].map(grade_mapping)


def _encode_category(series):
    """Return integer category codes for `series`, keeping missing values as NaN.

    `Series.cat.codes` marks missing values with the sentinel -1. Left as-is,
    that sentinel looks like a genuine category and is not removed by a later
    `dropna()`, so it is converted back to NaN here.
    """
    codes = series.astype('category').cat.codes
    return codes.mask(codes == -1)


# Additional encodings for the categorical predictors.
df['home_ownership_encoded'] = _encode_category(df['home_ownership'])
df['verification_status_encoded'] = _encode_category(df['verification_status'])
# Binary flag: 1 when the status is exactly 'Charged Off', 0 for anything else.
df['loan_status_encoded'] = (df['loan_status'] == 'Charged Off').astype(int)
df['purpose_encoded'] = _encode_category(df['purpose'])
# Any value other than 'w' / 'f' maps to NaN.
df['initial_list_status_encoded'] = df['initial_list_status'].map({'w': 1, 'f': 0})
df['application_type_encoded'] = _encode_category(df['application_type'])

# Explanatory variables used to predict the grade.
features = [
    'int_rate', 'term', 'revol_util', 'loan_amnt', 'dti',
    'installment', 'monthly_payment_calculated',
    'home_ownership_encoded', 'verification_status_encoded',
    'purpose_encoded', 'initial_list_status_encoded',
    'application_type_encoded'
]

# KNN cannot handle missing values: keep only rows where every feature AND the
# target are present. Selecting X and y with the same boolean mask keeps them
# aligned row-for-row, even if the index contains duplicate labels.
complete_rows = df[features].notna().all(axis=1) & df['grade_encoded'].notna()
X = df.loc[complete_rows, features]
y = df.loc[complete_rows, 'grade_encoded'].astype('int64')

# Train / test split (70% / 30%), stratified on the target. The split is done
# BEFORE scaling so that test rows play no part in fitting the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardization is required for KNN because it is distance-based. The scaler
# learns mean / std from the training rows only and is then applied unchanged
# to the test rows, which avoids leaking test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-fitted scaler; kept so that any later cell
# referring to `X_scaled` still finds it.
X_scaled = scaler.transform(X)

print(f"✅ Jeu d'entraînement : {X_train.shape}")
print(f"✅ Jeu de test : {X_test.shape}")

# KNN model with k=5 (scikit-learn's default, stated explicitly).
# n_jobs=-1 runs the neighbor search on all CPU cores; it changes the run time
# only, not the predictions.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)

# Predictions on the held-out test set.
y_pred = knn.predict(X_test)

# Evaluation: overall accuracy, then per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n🎯 Accuracy du modèle KNN : {accuracy:.4f}")

print("\n🧾 Rapport de classification :")
# `labels` pins the row order to the encoded grades (1..n) and `target_names`
# shows the original grade next to each row instead of the opaque code.
# zero_division=0 reports 0 (without a warning) for a grade that has no
# true or predicted samples.
print(classification_report(
    y_test,
    y_pred,
    labels=list(grade_mapping.values()),
    target_names=[str(grade) for grade in grade_order],
    zero_division=0,
))

# Confusion matrix: rows are the true grade, columns the predicted grade.
# `labels` fixes the row / column order to the encoded grades so that the tick
# labels below line up with the right cells (without it the heatmap shows
# positional indices 0..n-1, which match neither the codes nor the grades).
grade_codes = list(grade_mapping.values())
conf_matrix = confusion_matrix(y_test, y_pred, labels=grade_codes)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=grade_order,
    yticklabels=grade_order,
    ax=ax,
)
ax.set_title("Matrice de confusion - KNN Prédiction du grade")
ax.set_xlabel("Grade prédit")
ax.set_ylabel("Vrai grade")
fig.tight_layout()
plt.show()

✅ Jeu d'entraînement : (256204, 12)
✅ Jeu de test : (109803, 12)
