In [1]:
import pandas as pd
import os

In [3]:
# Locations: the input comes from the cleaning stage, the outputs go to the
# processed-data folder (created on demand if it does not exist yet).
input_path = "../../data/cleaning/step5_installment_corrected.parquet"
output_dir = "../../data/processed/"
os.makedirs(output_dir, exist_ok=True)

# Both result files live directly inside output_dir.
output_path_clean, output_path_outliers = (
    os.path.join(output_dir, file_name)
    for file_name in (
        "dataset_clean_no_outliers.parquet",
        "dataset_only_outliers.parquet",
    )
)

# Load the full dataset into memory.
df = pd.read_parquet(input_path)

# Recompute the outlier flag from the 1st and 99th percentiles.
def _percentile_outlier_mask(frame):
    """Return a boolean mask of rows that are extreme in any monitored column.

    For each monitored column the 0.01 and 0.99 quantiles are computed on
    ``frame`` itself. A row is flagged when its value is strictly below the
    lower quantile or strictly above the upper one, so values equal to a
    cutoff are not flagged. The per-column masks are OR-ed together.
    """
    combined = None
    for column in (
        'loan_amnt',
        'annual_inc',
        'int_rate',
        'dti',
        'installment',
        'monthly_payment_calculated',
        'monthly_income',
    ):
        values = frame[column]
        low_cut = values.quantile(0.01)
        high_cut = values.quantile(0.99)
        out_of_range = (values < low_cut) | (values > high_cut)
        combined = out_of_range if combined is None else combined | out_of_range
    return combined


df['has_outlier'] = _percentile_outlier_mask(df)

# Split the rows on the outlier flag; each side is copied so that it is an
# independent frame rather than a selection of df.
_flag = df['has_outlier']
df_clean = df[~_flag].copy()
df_outliers = df[_flag].copy()

# Write both subsets to parquet without the index column.
for _frame, _destination in (
    (df_clean, output_path_clean),
    (df_outliers, output_path_outliers),
):
    _frame.to_parquet(_destination, index=False)

# Report where the files were written and how many rows each one holds.
print(f"✅ Dataset SANS outliers sauvegardé : {output_path_clean}")
print(f"✅ Dataset AVEC outliers sauvegardé : {output_path_outliers}")
print(f"📊 Taille sans outliers : {len(df_clean)} lignes")
print(f"📊 Taille avec outliers : {len(df_outliers)} lignes")

✅ Dataset SANS outliers sauvegardé : ../../data/processed/dataset_clean_no_outliers.parquet
✅ Dataset AVEC outliers sauvegardé : ../../data/processed/dataset_only_outliers.parquet
📊 Taille sans outliers : 366214 lignes
📊 Taille avec outliers : 29816 lignes
