In [9]:
!pip install xgboost scikit-learn imbalanced-learn shap matplotlib seaborn pandas optuna joblib



In [11]:
import os
import warnings
from pathlib import Path

import joblib
import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder,
    OrdinalEncoder, LabelEncoder
)
from xgboost import XGBClassifier

# Ignore every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Configuration
# The data directory can be overridden with the ATTRITION_DATA_DIR environment
# variable so the notebook runs unchanged on another machine; when the variable
# is not set, the original hard-coded location is used.
DATA_DIR     = Path(os.environ.get(
    "ATTRITION_DATA_DIR",
    "/home/nada/Project_odoo_17_IA-main/ML_Risque de depart",
))
TRAIN_PATH   = DATA_DIR / "train_with_risk.csv"
TEST_PATH    = DATA_DIR / "test_with_predictions.csv"
OUTPUT_DIR   = DATA_DIR / "model"
RANDOM_STATE = 42

# Ordinal columns: each list gives the categories from lowest to highest rank,
# which is the order handed to the OrdinalEncoder.
ORDINAL_MAP = {
    'work_life_balance':    ['Poor', 'Fair', 'Good', 'Excellent'],
    'job_satisfaction':     ['Low', 'Medium', 'High', 'Very High'],
    'performance_rating':   ['Low', 'Below Average', 'Average', 'High'],
    'employee_recognition': ['Low', 'Medium', 'High', 'Very High'],
    'company_reputation':   ['Poor', 'Fair', 'Good', 'Excellent']
}

def load_data(path):
    """Read a CSV file and normalise its column names to snake_case.

    Args:
        path: Anything accepted by ``pandas.read_csv`` (file path or buffer).

    Returns:
        The loaded DataFrame whose column labels are stripped, lower-cased and
        have spaces and hyphens replaced by underscores.
    """
    print(f" Chargement des données depuis {path}")
    frame = pd.read_csv(path)

    # Build the cleaned header step by step: trim, lower-case, then turn
    # every separator character into an underscore.
    headers = frame.columns.str.strip().str.lower()
    for separator in (" ", "-"):
        headers = headers.str.replace(separator, "_")
    frame.columns = headers

    print(f" {len(frame)} lignes chargées, colonnes nettoyées")
    return frame

def validate_ordinal_columns(df, ordinal_map):
    """Check that every ordinal column only holds its declared categories.

    Columns listed in ``ordinal_map`` but absent from ``df`` are skipped, and
    missing values are ignored.

    Args:
        df: DataFrame to inspect.
        ordinal_map: Mapping of column name to its list of allowed categories.

    Raises:
        ValueError: If a column contains a value outside its allowed list.
    """
    print("Validation des colonnes ordinales")
    for column, allowed in ordinal_map.items():
        if column not in df.columns:
            continue
        # Missing values are dropped before comparing against the allowed set.
        unexpected = set(df[column].dropna()).difference(allowed)
        if unexpected:
            raise ValueError(f"Colonne {column} contient des valeurs inattendues : {unexpected}")
    print(" Toutes les valeurs ordinales sont conformes")

def preprocess_data(train_df, ordinal_map):
    """Split the training frame into features/target and build the preprocessor.

    Args:
        train_df: Training DataFrame; must contain a ``risk_level`` column.
        ordinal_map: Mapping of ordinal column name to its ordered categories.

    Returns:
        A tuple ``(X, y, preprocessor, le)``: the feature frame, the encoded
        target, an unfitted ``ColumnTransformer`` and the ``LabelEncoder`` used
        for the target.
    """
    print("  Prétraitement des données")
    validate_ordinal_columns(train_df, ordinal_map)

    # Target encoding: the class order is set by hand so that
    # Low -> 0, Medium -> 1, High -> 2.
    print("Encodage de la cible 'risk_level'")
    le = LabelEncoder()
    le.classes_ = np.array(['Low', 'Medium', 'High'])
    y = le.transform(train_df['risk_level'])
    mapping = list(zip(le.classes_, le.transform(le.classes_)))
    print(f"Mapping cible : {mapping}")

    # Identifier and target-related columns are excluded from the features;
    # only those actually present in the frame are dropped.
    drop_cols = [
        'employee_id', 'attrition', 'attrition_binary',
        'risk_score', 'risk_level'
    ]
    present = [name for name in drop_cols if name in train_df.columns]
    X = train_df.drop(columns=present)
    print(f" Colonn es supprimées : {drop_cols}")

    # Column groups, one per transformer of the preprocessor.
    numeric_features = [
        'age', 'years_at_company', 'monthly_income',
        'distance_from_home', 'number_of_promotions',
        'number_of_dependents'
    ]
    ordinal_features = list(ordinal_map.keys())
    nominal_features = [
        'job_role', 'job_level', 'company_size', 'education_level',
        'marital_status', 'overtime', 'remote_work',
        'leadership_opportunities', 'innovation_opportunities',
        'gender'
    ]

    # The category lists are passed in the same order as the ordinal columns.
    ordinal_encoder = OrdinalEncoder(categories=list(ordinal_map.values()))
    one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('ord', ordinal_encoder, ordinal_features),
        ('ohe', one_hot_encoder, nominal_features),
    ])
    print(" Préprocesseur configuré (numéral, ordinal, one-hot)")

    return X, y, preprocessor, le

def tune_xgboost(X, y, preprocessor, n_trials=50):
    """Search XGBoost hyper-parameters with Optuna, maximising macro F1.

    Each trial cross-validates a preprocessing -> SMOTE -> XGBClassifier
    pipeline with a 5-fold stratified split and ``f1_macro`` scoring.

    Args:
        X: Feature DataFrame.
        y: Encoded target.
        preprocessor: Transformer used as the first pipeline step.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict: The best hyper-parameters found (``study.best_params``).
    """
    print(f" Lancement de l’optimisation Optuna ({n_trials} essais)…")

    # The splitter does not depend on the trial, so it is built once.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    def objective(trial):
        """Return the mean cross-validated macro F1 for one set of parameters."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'random_state': RANDOM_STATE,
            # 'use_label_encoder' is intentionally not passed: recent XGBoost
            # releases no longer use it and only report it as an unused parameter.
        }
        # ImbPipeline is required so that SMOTE can be a pipeline step.
        pipeline = ImbPipeline([
            ('prep', preprocessor),
            ('smote', SMOTE(random_state=RANDOM_STATE)),
            ('clf', XGBClassifier(**params))
        ])
        return cross_val_score(pipeline, X, y, scoring='f1_macro', cv=cv).mean()

    # Seed the sampler so the sequence of suggested parameters is reproducible,
    # in line with the fixed RANDOM_STATE used by every other component.
    sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    print(" Optimisation terminée")
    print(f" Meilleurs paramètres : {study.best_params}")
    return study.best_params

def train_model(X, y, preprocessor, best_params):
    """Fit the final classifier on SMOTE-resampled data and build the inference pipeline.

    Args:
        X: Feature DataFrame.
        y: Encoded target.
        preprocessor: Transformer to fit on ``X``; it is fitted in place and
            reused as the first step of the returned pipeline.
        best_params: Hyper-parameters forwarded to ``XGBClassifier``.

    Returns:
        sklearn.pipeline.Pipeline: ``preprocessor`` followed by the fitted
        classifier. SMOTE is not part of it, so no resampling happens at
        prediction time.
    """
    print("  Entraînement du modèle avec SMOTE")
    # 1) Fit the preprocessor
    X_proc = preprocessor.fit_transform(X)
    print("  Préprocesseur ajusté")

    # 2) Oversample the training data with SMOTE
    X_res, y_res = SMOTE(random_state=RANDOM_STATE).fit_resample(X_proc, y)
    # shape[0] instead of len(): the transformed matrix may be a SciPy sparse
    # matrix, on which len() raises TypeError.
    print(f"   • Après SMOTE : {X_res.shape[0]} échantillons")

    # 3) Train the XGBClassifier
    # 'use_label_encoder' is intentionally not passed: recent XGBoost releases
    # no longer use it and only report it as an unused parameter.
    clf = XGBClassifier(
        **best_params,
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=RANDOM_STATE,
    )
    clf.fit(X_res, y_res)
    print("   • Classifieur entraîné")

    # 4) Inference pipeline (without SMOTE), assembled from the fitted steps
    inference_pipeline = Pipeline([
        ('prep', preprocessor),
        ('clf', clf)
    ])
    print(" Pipeline d’inférence prêt")

    return inference_pipeline
# Detailed evaluation of the model
from sklearn.metrics import (
    classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)

def evaluate_model(pipeline, X, y, label_encoder):
    """Print a classification report and return summary metrics.

    Args:
        pipeline: Fitted estimator exposing ``predict``.
        X: Features to predict on.
        y: True encoded labels. Assumes the labels are the integers
            ``0..n_classes-1`` in the order of ``label_encoder.classes_``
            (confirm against the caller).
        label_encoder: Encoder whose ``classes_`` provide the class names.

    Returns:
        dict: Model name, macro F1, accuracy, macro precision, macro recall
        and the F1 score of the first three classes (``F1_Low``,
        ``F1_Medium``, ``F1_High``), each rounded to 3 decimals.
    """
    print("\n📊 Évaluation du modèle sur l'ensemble d'entraînement complet")
    y_pred = pipeline.predict(X)

    # Passing the label ids explicitly keeps the report aligned with the class
    # names even when a class is absent from both y and y_pred, and lets the
    # per-class entries be looked up by name instead of by stringified integer.
    class_names = [str(name) for name in label_encoder.classes_]
    label_ids = list(range(len(class_names)))
    report_options = dict(labels=label_ids, target_names=class_names, zero_division=0)

    print(classification_report(y, y_pred, **report_options))
    report = classification_report(y, y_pred, output_dict=True, **report_options)
    per_class_f1 = [report[name]['f1-score'] for name in class_names]

    # zero_division=0 yields the same values as the default setting but
    # without emitting a warning when a class is never predicted.
    results = {
        'Modèle': 'XGBoost_Optuna',
        'F1_macro': round(f1_score(y, y_pred, average='macro', zero_division=0), 3),
        'Accuracy': round(accuracy_score(y, y_pred), 3),
        'Precision': round(precision_score(y, y_pred, average='macro', zero_division=0), 3),
        'Recall': round(recall_score(y, y_pred, average='macro', zero_division=0), 3),
        'F1_Low': round(per_class_f1[0], 3),
        'F1_Medium': round(per_class_f1[1], 3),
        'F1_High': round(per_class_f1[2], 3),
    }
    return results

def save_pipeline_and_encoder(pipeline, encoder):
    """Persist the inference pipeline and the label encoder under OUTPUT_DIR.

    The directory is created if needed. The pipeline is written to
    ``final_model.pkl`` and the encoder to ``label_encoder.pkl``.
    """
    print(f"Sauvegarde du pipeline et de l’encodeur dans {OUTPUT_DIR}")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # (file name, object) pairs, written in this order.
    artefacts = (
        ("final_model.pkl", pipeline),
        ("label_encoder.pkl", encoder),
    )
    for filename, artefact in artefacts:
        joblib.dump(artefact, OUTPUT_DIR / filename)

    print(" Sauvegarde terminée")

def main():
    """Run the whole training workflow end to end.

    Loads the training data, builds the preprocessor, tunes and trains the
    model, saves the artefacts, evaluates the model on the same training data
    and writes the metrics to a CSV file in OUTPUT_DIR.
    """
    print(" Démarrage du script d’entraînement")

    # Data loading and feature/target preparation.
    features, target, preprocessor, label_encoder = preprocess_data(
        load_data(TRAIN_PATH), ORDINAL_MAP
    )

    # Hyper-parameter search, final fit, then persistence of the artefacts.
    tuned_params = tune_xgboost(features, target, preprocessor, n_trials=50)
    model = train_model(features, target, preprocessor, tuned_params)
    save_pipeline_and_encoder(model, label_encoder)

    # Metrics are computed on the data the model was trained on.
    metrics = evaluate_model(model, features, target, label_encoder)

    # Store the metrics as a one-row CSV file.
    results_path = OUTPUT_DIR / "resultats_xgboost_optuna.csv"
    pd.DataFrame([metrics]).to_csv(results_path, index=False)
    print(f"\n Résultats de performance sauvegardés dans : {results_path}")

    print("Tout est terminé !")


if __name__ == "__main__":
    main()

 Démarrage du script d’entraînement
 Chargement des données depuis /content/drive/MyDrive/final_travail_Attrition/train_with_risk.csv


[I 2025-06-21 23:13:19,610] A new study created in memory with name: no-name-9b6dfad4-8d83-4e5c-a531-2158b2d8731d


 59598 lignes chargées, colonnes nettoyées
  Prétraitement des données
Validation des colonnes ordinales
 Toutes les valeurs ordinales sont conformes
Encodage de la cible 'risk_level'
Mapping cible : [(np.str_('Low'), np.int64(0)), (np.str_('Medium'), np.int64(1)), (np.str_('High'), np.int64(2))]
 Colonn es supprimées : ['employee_id', 'attrition', 'attrition_binary', 'risk_score', 'risk_level']
 Préprocesseur configuré (numéral, ordinal, one-hot)
 Lancement de l’optimisation Optuna (50 essais)…


[I 2025-06-21 23:14:41,068] Trial 0 finished with value: 0.9467530028560421 and parameters: {'n_estimators': 295, 'max_depth': 6, 'learning_rate': 0.108415554974834, 'subsample': 0.8144508285531676, 'colsample_bytree': 0.542326682425544, 'gamma': 3.8884209538222714}. Best is trial 0 with value: 0.9467530028560421.
[I 2025-06-21 23:16:11,749] Trial 1 finished with value: 0.9541530755090607 and parameters: {'n_estimators': 322, 'max_depth': 10, 'learning_rate': 0.2448514987917685, 'subsample': 0.7864809793808938, 'colsample_bytree': 0.9031930626295565, 'gamma': 2.8277719187149404}. Best is trial 1 with value: 0.9541530755090607.
[I 2025-06-21 23:17:16,234] Trial 2 finished with value: 0.9456766439786707 and parameters: {'n_estimators': 186, 'max_depth': 10, 'learning_rate': 0.23586040350611556, 'subsample': 0.8465624764701144, 'colsample_bytree': 0.7393433208160294, 'gamma': 4.949008729467493}. Best is trial 1 with value: 0.9541530755090607.
[I 2025-06-21 23:18:08,164] Trial 3 finished w

 Optimisation terminée
 Meilleurs paramètres : {'n_estimators': 478, 'max_depth': 5, 'learning_rate': 0.24624374866926696, 'subsample': 0.5935618872820965, 'colsample_bytree': 0.9915746129733698, 'gamma': 2.839930615243938}
  Entraînement du modèle avec SMOTE
  Préprocesseur ajusté
   • Après SMOTE : 66405 échantillons
   • Classifieur entraîné
 Pipeline d’inférence prêt
Sauvegarde du pipeline et de l’encodeur dans /content/drive/MyDrive/final_travail_Attrition/model
 Sauvegarde terminée

📊 Évaluation du modèle sur l'ensemble d'entraînement complet
              precision    recall  f1-score   support

         Low       0.99      0.98      0.99     22135
      Medium       0.96      0.97      0.96     17767
        High       0.99      0.98      0.98     19696

    accuracy                           0.98     59598
   macro avg       0.98      0.98      0.98     59598
weighted avg       0.98      0.98      0.98     59598


✅ Résultats de performance sauvegardés dans : /content/drive/My

In [14]:
# Report the version of each main library used by this notebook.
import matplotlib
import sklearn
import xgboost
import imblearn

# (display name, module) pairs, printed in this order.
for _label, _module in (
    ("pandas", pd),
    ("numpy", np),
    ("matplotlib", matplotlib),
    ("joblib", joblib),
    ("optuna", optuna),
    ("scikit-learn", sklearn),
    ("xgboost", xgboost),
    ("imblearn", imblearn),
):
    print(f"{_label}:", _module.__version__)

pandas: 2.2.2
numpy: 2.0.2
matplotlib: 3.10.0
joblib: 1.5.1
optuna: 4.4.0
scikit-learn: 1.6.1
xgboost: 2.1.4
imblearn: 0.13.0
