In [1]:
from pathlib import Path
import pandas as pd
from sqlalchemy import create_engine, text

DB_URL = "postgresql+psycopg://neondb_owner:npg_9Pfvkmi7FJYe@ep-withered-queen-addpikwv-pooler.c-2.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require"
MODEL_NAME = "lgbm_vanilla" 
BASELINE_PATH = Path("../../data/application_train.csv")

engine = create_engine(DB_URL)


In [2]:
query = """
SELECT 
    i.created_at,
    i.model_name,
    i.features,
    o.prediction,
    o.prob,
    o.proba_defaut,
    o.proba_solvable,
    o.latency_ms,
    o.error
FROM ml_inputs i
JOIN ml_outputs o ON o.input_id = i.id 
WHERE i.model_name = :model_name
ORDER BY i.created_at DESC
LIMIT 5000;
"""

prod_df = pd.read_sql_query(
    text(query),
    engine,
    params={"model_name": MODEL_NAME},
)

print("Données de prod chargées :", prod_df.shape)
prod_df.head()


Données de prod chargées : (5000, 9)


Unnamed: 0,created_at,model_name,features,prediction,prob,proba_defaut,proba_solvable,latency_ms,error
0,2025-11-24 21:41:35.312848+00:00,lgbm_vanilla,"{'AGE': None, 'nb_loans': None, 'sum_debt': No...",non_solvable,0.679384,0.679384,0.320616,126,
1,2025-11-24 21:41:35.312848+00:00,lgbm_vanilla,"{'AGE': None, 'nb_loans': None, 'sum_debt': No...",solvable,0.556389,0.443611,0.556389,126,
2,2025-11-24 21:41:35.312848+00:00,lgbm_vanilla,"{'AGE': None, 'nb_loans': None, 'sum_debt': No...",solvable,0.624752,0.375248,0.624752,126,
3,2025-11-24 21:41:35.312848+00:00,lgbm_vanilla,"{'AGE': None, 'nb_loans': None, 'sum_debt': No...",solvable,0.577569,0.422431,0.577569,126,
4,2025-11-24 21:41:35.312848+00:00,lgbm_vanilla,"{'AGE': None, 'nb_loans': None, 'sum_debt': No...",non_solvable,0.711588,0.711588,0.288412,126,


In [3]:
prod_features = pd.json_normalize(prod_df["features"])

prod_full = prod_features

prod_full.head()

Unnamed: 0,AGE,nb_loans,sum_debt,AMT_CREDIT,DAYS_BIRTH,FLAG_EMAIL,FLAG_MOBIL,FLAG_PHONE,SK_ID_CURR,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,REGION_POPULATION_RELATIVE,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,WEEKDAY_APPR_PROCESS_START,LIVE_REGION_NOT_WORK_REGION,REGION_RATING_CLIENT_W_CITY,YEARS_BEGINEXPLUATATION_AVG
0,,,,312768.0,-13962,0,1,0,456250,24709.5,...,0.0,0.0,4.0,0.006629,0,0,TUESDAY,0,2,
1,,,,450000.0,-13968,0,1,1,456224,25128.0,...,0.0,0.0,2.0,0.01885,0,1,MONDAY,1,2,0.9896
2,,,,315000.0,-15922,0,1,1,456223,33205.5,...,0.0,0.0,1.0,0.026392,0,0,WEDNESDAY,0,2,0.9955
3,,,,622413.0,-11186,0,1,0,456222,31909.5,...,,,,0.035792,0,0,MONDAY,0,2,
4,,,,252022.5,-11708,0,1,1,456202,23112.0,...,0.0,0.0,2.0,0.009175,0,0,TUESDAY,0,2,0.9786


In [4]:
baseline_df = pd.read_csv(BASELINE_PATH)

baseline_features = baseline_df.drop(columns=["TARGET"])


common_cols = sorted(set(baseline_features.columns) & set(prod_features.columns))

ref_data = baseline_features[common_cols].copy()
cur_data = prod_features[common_cols].copy()

ref_data.head()


Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,SK_ID_CURR,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,100002,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.6192
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,100003,0.0714,Block,MONDAY,0.9851,0.796
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,100004,,,MONDAY,,
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,0,0,0,0,100006,,,WEDNESDAY,,
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,100007,,,THURSDAY,,


In [5]:
from evidently import Report
from evidently.presets import DataDriftPreset

data_drift_report = Report(
    metrics=[
        DataDriftPreset(),
    ]
)

data_drift_result = data_drift_report.run(
    current_data=cur_data,    
    reference_data=ref_data, 
)


In [6]:
REPORT_PATH = Path("reports/evidently/data_drift_report.html")
REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

data_drift_result.save_html(str(REPORT_PATH))