# Projet 8 - Pipeline complet de préparation & entraînement

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from joblib import dump

In [2]:
# Directory holding the raw CSV inputs, relative to the notebook location.
data_dir = Path('../data/raw/')

# Read the three raw tables in one pass; the unpacking order matches the
# order of the file names below.
train_raw, test_raw, bureau = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv', 'bureau.csv')
)

In [3]:

def preprocess(df, bureau_df=None, is_train=True):
    """Build the modelling feature table from a raw application frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw application data. The columns read are ``SK_ID_CURR``,
        ``DAYS_BIRTH``, ``AMT_INCOME_TOTAL``, ``CNT_CHILDREN``,
        ``DAYS_EMPLOYED``, ``EXT_SOURCE_1``/``2``/``3``,
        ``NAME_CONTRACT_TYPE``, ``NAME_EDUCATION_TYPE``,
        ``NAME_FAMILY_STATUS``, ``AMT_CREDIT`` and, when ``is_train`` is
        true, ``TARGET``.
    bureau_df : pandas.DataFrame, optional
        Records with ``SK_ID_CURR`` and ``CREDIT_DAY_OVERDUE`` columns.
        When omitted, ``historique_impayes`` is 0 for every row.
    is_train : bool, default True
        When true, ``TARGET`` is copied into a ``target`` column.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the columns ``id``, ``age``,
        ``revenu_annuel``, ``nombre_enfants``, ``anciennete``,
        ``score_client``, ``categorie_produit``, ``montant_credit``,
        ``taux_endettement``, ``niveau_education``, ``statut_familial``,
        ``historique_impayes`` and (training only) ``target``.

    Notes
    -----
    Every call fits fresh ``LabelEncoder`` instances, so the integer codes
    of the encoded columns depend on the category values present in ``df``.
    NOTE(review): codes produced by two separate calls (e.g. train and test)
    only line up when both frames contain the same set of values — confirm,
    or share fitted encoders between the calls.
    """
    df_result = pd.DataFrame()
    df_result["id"] = df["SK_ID_CURR"]
    # Day counts are negated before being converted to years.
    df_result["age"] = (-df["DAYS_BIRTH"] / 365).round().astype(int)
    df_result["revenu_annuel"] = df["AMT_INCOME_TOTAL"]
    df_result["nombre_enfants"] = df["CNT_CHILDREN"]

    # The raw value 365243 is meant to map to 0 years. After negation it
    # becomes -365243/365, so the previous `.replace(365243/365, 0)` never
    # matched and left about -1000.7 in the column. Test the raw value.
    days_employed = df["DAYS_EMPLOYED"]
    seniority_years = (-days_employed / 365).round(1)
    df_result["anciennete"] = seniority_years.mask(days_employed == 365243, 0.0)

    # Row-wise mean of the three external scores (NaNs are skipped by pandas).
    df_result["score_client"] = df[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
    df_result["categorie_produit"] = LabelEncoder().fit_transform(df["NAME_CONTRACT_TYPE"] + "_" + df["NAME_EDUCATION_TYPE"])
    df_result["montant_credit"] = df["AMT_CREDIT"]
    df_result["taux_endettement"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
    df_result["niveau_education"] = LabelEncoder().fit_transform(df["NAME_EDUCATION_TYPE"].fillna("Unknown"))
    df_result["statut_familial"] = LabelEncoder().fit_transform(df["NAME_FAMILY_STATUS"].fillna("Unknown"))

    if bureau_df is not None:
        # Count, per applicant, the bureau rows with a positive overdue value;
        # applicants without such rows get 0.
        nb_retards = bureau_df[bureau_df["CREDIT_DAY_OVERDUE"] > 0].groupby("SK_ID_CURR").size()
        df_result["historique_impayes"] = df["SK_ID_CURR"].map(nb_retards).fillna(0).astype(int)
    else:
        df_result["historique_impayes"] = 0

    if is_train:
        df_result["target"] = df["TARGET"]
    return df_result


In [4]:
# Run the same feature pipeline on both raw application frames; only the
# training call asks for the target column to be carried over.
df_train, df_test = (
    preprocess(raw_frame, bureau, is_train=keep_target)
    for raw_frame, keep_target in ((train_raw, True), (test_raw, False))
)

In [5]:
# Write both processed frames to disk while they still carry the `id` column.
train_csv_path = '../data/output/dataset_project_8_train.csv'
test_csv_path = '../data/output/dataset_project_8_test.csv'
df_train.to_csv(train_csv_path, index=False)
df_test.to_csv(test_csv_path, index=False)

# From here on the frames are used without the identifier column.
df_train, df_test = (frame.drop(columns='id') for frame in (df_train, df_test))

In [None]:
# Split the processed training frame into features and label.
X = df_train.drop(columns='target')
y = df_train['target']

# Class-imbalance weight = negatives / positives, derived from the labels
# actually being fitted rather than from hard-coded counts (19999 / 157),
# so it stays correct whichever training file is loaded.
n_positive = int((y == 1).sum())
n_negative = int((y == 0).sum())
if n_positive == 0:
    raise ValueError("No positive samples in 'target'; cannot derive scale_pos_weight.")
scale_pos_weight = n_negative / n_positive

# Only `import xgboost as xgb` is in scope, so the classifier must be
# reached through the module alias (a bare `XGBClassifier` raises NameError).
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)
xgb_model.fit(X, y)

# Persist the fitted model; `dump` returns the list of written file names,
# which is what the cell displays.
dump(xgb_model, '../models/model_project_8_xgb.joblib')

['../models/model_project_8_xgb.joblib']

Le modèle XGBoost a été entraîné et sauvegardé avec succès.