In [10]:
!pip install xgboost
!pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [11]:
"""
Ce notebook regroupe les trois étapes principales :
1. Préprocessing (chargement, nettoyage, feature engineering et pipeline)
2. Définition des modèles & évaluation
3. Entraînement, visualisation & sauvegarde
"""

## 1. Préprocessing

# Imports
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Constantes
RANDOM_STATE = 42

def load_data(filepath: str) -> pd.DataFrame:
    """
    Charge le CSV, convertit les trimestres en timestamps
    et trie par département et date.
    """
    df = pd.read_csv(filepath)
    df['quarter'] = pd.PeriodIndex(df['quarter'], freq='Q').to_timestamp()
    df = df.sort_values(['department', 'quarter']).reset_index(drop=True)
    return df

def build_time_features(df: pd.DataFrame, window: int = 4) -> pd.DataFrame:
    """
    Ajoute rolling mean et lags pour les variables clés,
    remplace les NaN par 0, et ajoute le numéro de trimestre.
    """
    # 1) Extraire l'année
    df['annee'] = df['quarter'].dt.year

    # 2) Numéro de trimestre
    df['quarter_num'] = df['quarter'].dt.quarter
    stats = [
        'departs_confirmes', 'candidats_en_cours',
        'postes_ouverts_actuels', 'effectif_actuel', 'turnover_month_pct'
    ]
    for feat in stats:
        # Rolling mean
        df[f'{feat}_rolling_mean'] = (
            df.groupby('department')[feat]
              .transform(lambda x: x.rolling(window, min_periods=1).mean())
        )
        # Lags
        for lag in range(1, window + 1):
            df[f'{feat}_lag_{lag}'] = df.groupby('department')[feat].shift(lag)
    # Remplir les NaN générés par les lags
    lag_cols = [c for c in df.columns if '_lag_' in c]
    df[lag_cols] = df[lag_cols].fillna(0)

    return df

def get_preprocessing_pipeline(df: pd.DataFrame, target: str) -> ColumnTransformer:
    """
    Crée et renvoie un ColumnTransformer pour l'encodage et la normalisation.
    """
    cat_cols = ['department']
    num_cols = [c for c in df.columns if c not in cat_cols + ['quarter', target]]
    return ColumnTransformer([
        ('ohe',   OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
        ('scale', StandardScaler(), num_cols)
    ])



In [12]:
##  2. Définition des modèles & évaluation

# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt

# Constantes
RANDOM_STATE = 42
N_ITER_SEARCH = 10

# 1. Définition des modèles et grilles
def get_models_and_grids():
    models = {
        "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
        "XGBoost":      XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1, objective='reg:squarederror'),
        "LightGBM":     LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1),
        "GradientBoost": GradientBoostingRegressor(random_state=RANDOM_STATE)
    }
    grids = {
        "RandomForest": {
            'model__n_estimators': [200, 500, 800],
            'model__max_depth':    [None, 10, 20],
            'model__max_features': ['sqrt', 'log2']
        },
        "XGBoost": {
            'model__n_estimators': [300, 600, 900],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth':    [3, 6, 9]
        },
        "LightGBM": {
            'model__n_estimators': [300, 600, 900],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__num_leaves':   [31, 63, 127]
        },
        "GradientBoost": {
            'model__n_estimators':  [300, 600],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth':     [3, 6]
        }
    }
    return models, grids

# 2. Fonction d'évaluation
def evaluate_models(df: pd.DataFrame, features: list, target: str, preprocessor):
    """
    Entraîne chaque modèle avec RandomizedSearchCV et renvoie :
      - un DataFrame des performances (R², MAE, RMSE, params)
      - un dict {nom_modèle: pipeline_optimal}
    """
    X, y = df[features], df[target]
    split_idx = int(0.8 * len(df))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    models, grids = get_models_and_grids()
    records, pipelines = [], {}
    tscv = TimeSeriesSplit(n_splits=5)

    for name, estimator in models.items():
        print(f"\n=== Optimisation de {name} ===")
        pipe = Pipeline([('pre', preprocessor), ('model', estimator)])
        search = RandomizedSearchCV(
            pipe, grids[name], n_iter=N_ITER_SEARCH,
            scoring='r2', cv=tscv, verbose=1,
            random_state=RANDOM_STATE, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_pipe = search.best_estimator_
        pipelines[name] = best_pipe

        y_pred = best_pipe.predict(X_test)
        records.append({
            'Modèle': name,
            'R²': r2_score(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'Params': search.best_params_
        })

    results_df = pd.DataFrame(records).sort_values('R²', ascending=False)
    return results_df, pipelines

# 3. Visualisation comparative
def plot_comparison(results_df: pd.DataFrame):
    plt.figure(figsize=(8,5))
    plt.barh(results_df['Modèle'], results_df['R²'])
    plt.gca().invert_yaxis()
    plt.xlabel('R²')
    plt.title('Comparaison des R²')
    plt.tight_layout()
    plt.show()


In [None]:
## 3. Entraînement, visualisation & sauvegarde

# Imports spécifiques
import os
import joblib

# Répertoire de sauvegarde
SAVE_DIR = "/home/nada/Project_odoo_17_IA-main/ML_prévision des effectifs/best_pipeline"

def main():
    # 3.1 Chargement et preprocessing
    df = load_data("/home/nada/Project_odoo_17_IA-main/ML_prévision des effectifs/dataset_Trimestrielle.csv")
    df = build_time_features(df)
    target = 'postes_a_lancer'
    features = [c for c in df.columns if c not in ['quarter', target]]
    # Supprime les lignes incomplètes
    df = df.dropna(subset=[target] + features)

    # 3.2 Création et sauvegarde du préprocesseur
    preprocessor = get_preprocessing_pipeline(df, target)
    preprocessor.fit(df[features])
    os.makedirs(SAVE_DIR, exist_ok=True)
    joblib.dump(preprocessor, os.path.join(SAVE_DIR, 'preprocessor.pkl'))

    # 3.3 Génération de features.txt AVANT One-Hot (raw features)
    stats = [
        'departs_confirmes','candidats_en_cours',
        'postes_ouverts_actuels','effectif_actuel','turnover_month_pct'
    ]
    window = 4

    raw_cols = (
        ['annee', 'department'] +    # année et département brut
        stats +                      # variables de base
        [f"{feat}_rolling_mean" for feat in stats] +
        [f"{feat}_lag_{lag}" for feat in stats for lag in range(1, window+1)] +
        ['quarter_num']
    )

    feat_path = os.path.join(SAVE_DIR, 'features.txt')
    with open(feat_path, 'w', encoding='utf-8') as f:
        for col in raw_cols:
            f.write(col + '\n')

    # 3.4 Entraînement et évaluation
    results_df, pipelines = evaluate_models(df, features, target, preprocessor)
    print(results_df)
    plot_comparison(results_df)

    # 3.5 Sauvegarde du meilleur modèle complet (préproc + modèle)
    best_name = results_df.iloc[0]['Modèle']
    best_pipe = pipelines[best_name]
    joblib.dump(best_pipe, os.path.join(SAVE_DIR, 'pipeline_complete.pkl'))

if __name__ == "__main__":
    main()



=== Optimisation de RandomForest ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits

=== Optimisation de XGBoost ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits

=== Optimisation de LightGBM ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.238414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1926
[LightGBM] [Info] Number of data points in the train set: 383, number of used features: 34
[LightGBM] [Info] Start training from score 6.765013
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.255053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1926
[LightGBM] [Info] Number of data points in the train set: 383, number of used features: 34
[LightGBM] [Info] Start training from score 6.765013
[LightGBM] [Info] Auto-choosing 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3621
[LightGBM] [Info] Number of data points in the train set: 1526, number of used features: 38
[LightGBM] [Info] Start training from score 6.672346




