In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the pre-processed dataset from parquet.
input_path = "../../data/processed/dataset_clean_no_outliers.parquet"
df = pd.read_parquet(input_path)


def _category_codes(column):
    """Return the integer category codes of ``df[column]`` (pandas uses -1 for missing values)."""
    return df[column].astype('category').cat.codes


# Ordinal encoding of the grade: distinct non-null grades, sorted ascending,
# are numbered starting from 1.
grade_order = sorted(df['grade'].dropna().unique())
grade_mapping = dict(zip(grade_order, range(1, len(grade_order) + 1)))
df['grade_encoded'] = df['grade'].map(grade_mapping)

# Additional encodings. Categorical columns get arbitrary integer codes,
# loan_status becomes a 0/1 flag for 'Charged Off', and initial_list_status
# is mapped explicitly (any value other than 'w'/'f' becomes NaN).
list_status_codes = {'w': 1, 'f': 0}

df['home_ownership_encoded'] = _category_codes('home_ownership')
df['verification_status_encoded'] = _category_codes('verification_status')
df['loan_status_encoded'] = df['loan_status'].eq('Charged Off').astype(int)
df['purpose_encoded'] = _category_codes('purpose')
df['initial_list_status_encoded'] = df['initial_list_status'].map(list_status_codes)
df['application_type_encoded'] = _category_codes('application_type')

# Explanatory variables used to predict the grade.
features = [
    'int_rate', 'term', 'revol_util', 'loan_amnt', 'dti',
    'installment', 'monthly_payment_calculated',
    'home_ownership_encoded', 'verification_status_encoded',
    'purpose_encoded', 'initial_list_status_encoded',
    'application_type_encoded'
]

# Target column: the ordinal encoding of the grade.
target = 'grade_encoded'

# KNN cannot handle missing values. Drop incomplete rows on the features AND
# the target in a single pass: filtering X alone would leave rows whose grade
# is missing (NaN in grade_encoded) inside y, which then breaks the stratified
# split and the classifier. Doing it on one frame also keeps X and y aligned.
model_df = df[features + [target]].dropna()

X = model_df[features]
# With no NaN left the labels are whole numbers; force an integer dtype so the
# class labels are reported as 1, 2, ... even if the column was float.
y = model_df[target].astype('int64')

# Train / test split (70% / 30%), stratified on the grade so both sets keep
# the same class proportions. The split is done on the raw features, BEFORE
# scaling, so that no information from the test rows leaks into the scaler.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardisation is mandatory for KNN (distance-based model). The scaler
# learns mean / std on the training rows only and is then applied unchanged
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Whole feature matrix scaled with the train-only statistics; kept under its
# original name in case later cells rely on it.
X_scaled = scaler.transform(X)

print(f"âœ… Jeu d'entraÃ®nement : {X_train.shape}")
print(f"âœ… Jeu de test : {X_test.shape}")

# KNN classifier with k=5 neighbours; fit() returns the estimator itself,
# so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the grade of every row of the test set.
y_pred = knn.predict(X_test)

# Evaluation: overall accuracy first, then the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nðŸŽ¯ Accuracy du modÃ¨le KNN : {accuracy:.4f}")

print("\nðŸ§¾ Rapport de classification :")
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix. The class list is made explicit (sorted union of the true
# and predicted labels, the same set scikit-learn would infer) so that rows
# and columns can be labelled on the plot.
class_codes = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)

# Map the encoded classes back to the original grade labels for the ticks.
# Without explicit tick labels the heatmap numbers its axes 0..n-1, which
# matches neither the encoded classes (starting at 1) nor the grades.
code_to_grade = {code: grade for grade, code in grade_mapping.items()}
tick_labels = [code_to_grade.get(code, code) for code in class_codes]

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_title("Matrice de confusion - KNN PrÃ©diction du grade")
ax.set_xlabel("Grade prÃ©dit")
ax.set_ylabel("Vrai grade")
fig.tight_layout()
plt.show()

âœ… Jeu d'entraÃ®nement : (256204, 12)
âœ… Jeu de test : (109803, 12)
