# IMPORTATION DES LIBRAIRIES

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, average_precision_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

# Chargement du dataset

In [None]:
df = pd.read_csv('dataset/dataset_cleaned/df_comp_failure.csv')
print(f"Dimensions du dataset original: {df.shape}")
print(f"Distribution des classes:\n{df['failed_component'].value_counts(normalize=True)}")

Dimensions du dataset original: (2130141, 39)
Distribution des classes:
failed_component
comp2    0.308108
comp4    0.273969
comp3    0.210735
comp1    0.207188
Name: proportion, dtype: float64


# Préparation du tableau

- Echantillonage

In [21]:
df1, _ = train_test_split(
    df, 
    test_size=0.80,  # On garde 20 % seulement
    stratify=df['failed_component'], 
    random_state=42
)

In [22]:
print(f"\nDimensions après échantillonnage: {df1.shape}")
print(f"Distribution des classes après échantillonnage:\n{df1['failed_component'].value_counts(normalize=True)}")


Dimensions après échantillonnage: (426028, 39)
Distribution des classes après échantillonnage:
failed_component
comp2    0.308107
comp4    0.273970
comp3    0.210735
comp1    0.207188
Name: proportion, dtype: float64


- Conserver le datetime comme index

In [23]:
df1.set_index('datetime', inplace=True)

- Suppression des colonnes nons pertinentes

In [24]:
df1.drop(columns=['machineID'], inplace=True)
df1

Unnamed: 0_level_0,volt,rotate,pressure,vibration,age,error_count,maint_count,is_model1,is_model2,is_model3,...,cum_errors,cum_maint,hour,dayofweek,is_weekend,vibration_times_rotation,pressure_over_voltage,had_error_last_3h,had_maint_last_3h,failed_component
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-20 15:00:00,156.933583,518.628743,88.461325,52.704987,20,0.0,0.0,False,False,True,...,29.0,23.0,15,6,1,27334.321150,0.563686,0,0,comp4
2015-08-30 14:00:00,180.003294,303.816647,92.271160,48.078820,14,0.0,0.0,False,False,False,...,30.0,18.0,14,6,1,14607.145970,0.512608,0,0,comp4
2015-04-29 15:00:00,173.420724,433.390852,99.422111,36.626679,19,0.0,0.0,False,False,True,...,14.0,9.0,15,2,0,15873.667571,0.573300,0,0,comp4
2015-10-15 22:00:00,170.821505,407.234242,109.907427,32.869488,12,0.0,0.0,False,True,False,...,35.0,20.0,22,3,0,13385.581074,0.643405,0,0,comp3
2015-05-25 00:00:00,177.115278,444.672255,92.771332,35.973546,7,0.0,0.0,True,False,False,...,16.0,15.0,0,0,0,15996.437942,0.523791,0,0,comp2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-10-25 14:00:00,158.039850,439.074264,97.528655,39.319402,17,0.0,0.0,False,False,False,...,32.0,18.0,14,6,1,17264.137448,0.617114,0,0,comp4
2015-04-28 21:00:00,162.837909,354.859914,94.046900,47.923420,20,0.0,0.0,False,True,False,...,14.0,8.0,21,1,0,17006.100678,0.577549,0,0,comp4
2015-04-13 13:00:00,170.820515,370.254344,93.765490,29.744269,2,0.0,0.0,True,False,False,...,14.0,9.0,13,0,0,11012.944822,0.548912,0,0,comp2
2015-07-12 04:00:00,176.775231,399.227799,125.698252,34.990378,15,0.0,0.0,True,False,False,...,25.0,13.0,4,6,1,13969.131767,0.711063,0,0,comp4


- Conversion des booléens en entiers

In [25]:
df1 = df1.astype({col: 'int' for col in df1.select_dtypes(include='bool').columns})

- Normalisation des données

In [26]:
numeric_cols = df1.select_dtypes(include=['int64', 'float64']).columns

In [27]:
# On garde uniquement les colonnes ayant plus que 2 valeurs distinctes (on exclu les bools et 'machineID')
norm_cols = [col for col in numeric_cols if df1[col].nunique() > 2]

In [28]:
rbscaler=RobustScaler()
df_n=rbscaler.fit_transform(df1[norm_cols])
df_n

array([[-0.66590701,  1.01709348, -0.8527992 , ...,  0.75      ,
         2.15008893, -0.23488504],
       [ 0.4270746 , -1.97119347, -0.58957704, ...,  0.75      ,
        -0.74687665, -0.68238695],
       [ 0.11520993, -0.16866506, -0.09551644, ..., -0.25      ,
        -0.45859041, -0.15065862],
       ...,
       [-0.00798106, -1.04696757, -0.48633347, ..., -0.75      ,
        -1.5649904 , -0.36432207],
       [ 0.27413756, -0.643913  ,  1.71990705, ...,  0.75      ,
        -0.89210174,  1.05629408],
       [ 0.58373715,  0.30595751, -1.21210734, ..., -0.25      ,
         0.0259123 , -1.1940424 ]])

In [29]:
df_n = pd.DataFrame(df_n, columns=norm_cols, index=df1.index)

In [30]:
df_norm=df1.copy()
df_norm[norm_cols] = df_n
df_norm

Unnamed: 0_level_0,volt,rotate,pressure,vibration,age,error_count,maint_count,is_model1,is_model2,is_model3,...,cum_errors,cum_maint,hour,dayofweek,is_weekend,vibration_times_rotation,pressure_over_voltage,had_error_last_3h,had_maint_last_3h,failed_component
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-20 15:00:00,-0.665907,1.017093,-0.852799,1.691872,0.857143,0.0,0.0,0,0,1,...,0.50,0.714286,0.230769,0.75,1,2.150089,-0.234885,0,0,comp4
2015-08-30 14:00:00,0.427075,-1.971193,-0.589577,1.053817,0.000000,0.0,0.0,0,0,0,...,0.55,0.357143,0.153846,0.75,1,-0.746877,-0.682387,0,0,comp4
2015-04-29 15:00:00,0.115210,-0.168665,-0.095516,-0.525698,0.714286,0.0,0.0,0,0,1,...,-0.25,-0.285714,0.230769,-0.25,0,-0.458590,-0.150659,0,0,comp4
2015-10-15 22:00:00,-0.007934,-0.532534,0.628916,-1.043902,-0.285714,0.0,0.0,0,1,0,...,0.80,0.500000,0.769231,0.00,0,-1.024930,0.463539,0,0,comp3
2015-05-25 00:00:00,0.290248,-0.011728,-0.555020,-0.615781,-1.000000,0.0,0.0,1,0,0,...,-0.15,0.142857,-0.923077,-0.75,0,-0.430645,-0.584416,0,0,comp2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-10-25 14:00:00,-0.613495,-0.089602,-0.226336,-0.154310,0.428571,0.0,0.0,0,0,0,...,0.65,0.357143,0.153846,0.75,1,-0.142091,0.233203,0,0,comp4
2015-04-28 21:00:00,-0.386176,-1.261122,-0.466891,1.032384,0.857143,0.0,0.0,0,1,0,...,-0.25,-0.357143,0.692308,-0.50,0,-0.200825,-0.113432,0,0,comp4
2015-04-13 13:00:00,-0.007981,-1.046968,-0.486333,-1.474942,-1.714286,0.0,0.0,1,0,0,...,-0.25,-0.285714,0.076923,-0.75,0,-1.564990,-0.364322,0,0,comp2
2015-07-12 04:00:00,0.274138,-0.643913,1.719907,-0.751382,0.142857,0.0,0.0,1,0,0,...,0.30,0.000000,-0.615385,0.75,1,-0.892102,1.056294,0,0,comp4


- Gestion des valeurs manquantes

In [31]:
df_norm.isnull().sum().sum()

480

In [32]:
df_norm.dropna(inplace=True)

In [33]:
df_norm.isnull().sum().sum()

0

# MODELISATION

- Séparation en variables indépendantes et variable cibl

In [34]:
X=df_norm.drop(columns=['failed_component'])
y=df_norm['failed_component']

- Séparation en train et test

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- MODELISATION AVEC LIGHTGBM

In [37]:
print("\n--- LIGHTGBM ---")
# Définition des hyperparamètres à optimiser
param_grid_lgbm = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 300, 400],
    'num_leaves': [31, 50, 100, 150],
    'max_depth': [5, 7, 9, -1], 
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5],
    'boosting_type': ['gbdt', 'dart'],
    'importance_type': ['split', 'gain'],
    'verbosity':[-1]
}

# Utiliser cette configuration pour le RandomizedSearchCV
tscv = TimeSeriesSplit(n_splits=5)

# Recherche des meilleurs hyperparamètres
lgbm_base = LGBMClassifier(class_weight='balanced', random_state=42)
random_search_lgbm = RandomizedSearchCV(
    lgbm_base, 
    param_grid_lgbm, 
    n_iter=10, 
    cv=tscv, 
    scoring='f1_macro',
    random_state=42, 
    n_jobs=-1, 
    verbose=1
)

print("Optimisation des hyperparamètres en cours...")
random_search_lgbm.fit(X_train, y_train)
print(f"Meilleurs paramètres: {random_search_lgbm.best_params_}")

# Utilisation du meilleur modèle
best_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
y_proba_lgbm = best_lgbm.predict_proba(X_test)[:, 1]

# Évaluation
print("\nRapport de classification:")
print(classification_report(y_test, y_pred_lgbm))

print("\nMatrice de confusion:")
print(confusion_matrix(y_test, y_pred_lgbm))


--- LIGHTGBM ---
Optimisation des hyperparamètres en cours...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Meilleurs paramètres: {'verbosity': -1, 'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0, 'num_leaves': 150, 'n_estimators': 400, 'min_child_samples': 50, 'max_depth': -1, 'learning_rate': 0.05, 'importance_type': 'gain', 'colsample_bytree': 0.9, 'boosting_type': 'dart'}

Rapport de classification:
              precision    recall  f1-score   support

       comp1       0.99      1.00      1.00     17701
       comp2       1.00      0.99      1.00     26201
       comp3       1.00      1.00      1.00     17950
       comp4       1.00      1.00      1.00     23342

    accuracy                           1.00     85194
   macro avg       1.00      1.00      1.00     85194
weighted avg       1.00      1.00      1.00     85194


Matrice de confusion:
[[17668    19     1    13]
 [   61 26034    36    70]
 [   10    21 17903    16]
 [   28    22    29 23263]]
