# IMPORTATION DES LIBRAIRIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold,TimeSeriesSplit
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

# Chargement du dataset

In [None]:
df = pd.read_csv('dataset/dataset_cleaned/df_time_to_failure.csv')
print(f"Dimensions du dataset original: {df.shape}")

Dimensions du dataset original: (245106, 40)


# Préparation du tableau

- Features Temporelles granulaires

In [3]:
df['datetime']=pd.to_datetime(df['datetime'])

In [4]:
df['day_of_year'] = df['datetime'].dt.dayofyear.astype(int)
df['week_of_year'] = df['datetime'].dt.isocalendar().week.astype(int)
df['month'] = df['datetime'].dt.month.astype(int)
df['is_month_start'] = df['datetime'].dt.is_month_start.astype(int)
df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)

- Saisonnalité annuelle

In [6]:
df['sin_day'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['cos_day'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

- Conserver le datetime comme index

In [7]:
df.set_index('datetime', inplace=True)

- Suppression des colonnes nons pertinentes

In [8]:
df.drop(columns=['machineID','next_failure'], inplace=True)
df

Unnamed: 0_level_0,volt,rotate,pressure,vibration,age,error_count,maint_count,is_model1,is_model2,is_model3,...,had_error_last_3h,had_maint_last_3h,time_to_failure_days,day_of_year,week_of_year,month,is_month_start,is_month_end,sin_day,cos_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 06:00:00,176.217853,418.504078,113.077935,45.087686,18,0.0,0.0,False,False,True,...,0,0,4.000000,1,1,1,1,0,1.721336e-02,0.999852
2015-01-01 06:00:00,191.071984,421.317773,139.248508,42.785023,10,0.0,0.0,True,False,False,...,0,0,0.875000,1,1,1,1,0,1.721336e-02,0.999852
2015-01-01 06:00:00,196.874272,439.064185,97.176565,39.689021,5,0.0,0.0,True,False,False,...,0,0,0.875000,1,1,1,1,0,1.721336e-02,0.999852
2015-01-01 06:00:00,168.359381,437.767679,86.006390,42.415429,17,0.0,0.0,False,False,True,...,0,0,0.875000,1,1,1,1,0,1.721336e-02,0.999852
2015-01-01 06:00:00,153.466527,387.953725,89.362896,35.457726,20,0.0,0.0,False,False,False,...,0,0,12.000000,1,1,1,1,0,1.721336e-02,0.999852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 04:00:00,180.104575,450.600419,100.092875,35.060056,2,0.0,0.0,False,True,False,...,0,0,0.083333,365,53,12,0,1,6.432491e-16,1.000000
2015-12-31 05:00:00,181.998731,449.718546,89.244777,41.117843,20,0.0,0.0,False,False,True,...,0,0,0.041667,365,53,12,0,1,6.432491e-16,1.000000
2015-12-31 05:00:00,184.053315,583.692900,100.567118,45.926909,2,0.0,0.0,False,True,False,...,0,0,0.041667,365,53,12,0,1,6.432491e-16,1.000000
2015-12-31 05:00:00,210.361451,538.381700,136.617842,54.532230,14,0.0,0.0,False,False,True,...,0,0,0.041667,365,53,12,0,1,6.432491e-16,1.000000


- Conversion des booléens en entiers

In [9]:
df = df.astype({col: 'int' for col in df.select_dtypes(include='bool').columns})

- Normalisation des données

In [10]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [11]:
# On garde uniquement les colonnes ayant plus que 2 valeurs distinctes (on exclu les bools et 'machineID')
norm_cols = [col for col in numeric_cols if df[col].nunique() > 2]

In [12]:
rbscaler=RobustScaler()
df_n=rbscaler.fit_transform(df[norm_cols])
df_n

array([[ 0.24111651, -0.36899248,  0.85865182, ..., -0.83333333,
        -0.03656315,  0.6994648 ],
       [ 0.94290196, -0.32998335,  2.67441159, ..., -0.83333333,
        -0.03656315,  0.6994648 ],
       [ 1.21703182, -0.08394665, -0.24461291, ..., -0.83333333,
        -0.03656315,  0.6994648 ],
       ...,
       [ 0.61130398,  1.92118944, -0.00937043, ...,  1.        ,
        -0.04876291,  0.6995689 ],
       [ 1.85423546,  1.29299384,  2.4918914 , ...,  1.        ,
        -0.04876291,  0.6995689 ],
       [ 1.9370215 ,  0.12794637,  0.72904722, ...,  1.        ,
        -0.04876291,  0.6995689 ]])

In [13]:
df_n = pd.DataFrame(df_n, columns=norm_cols, index=df.index)

In [14]:
df_norm=df.copy()
df_norm[norm_cols] = df_n
df_norm

Unnamed: 0_level_0,volt,rotate,pressure,vibration,age,error_count,maint_count,is_model1,is_model2,is_model3,...,had_error_last_3h,had_maint_last_3h,time_to_failure_days,day_of_year,week_of_year,month,is_month_start,is_month_end,sin_day,cos_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 06:00:00,0.241117,-0.368992,0.858652,0.642875,0.500,0.0,0.0,0,0,1,...,0,0,-0.422619,-0.972067,-0.961538,-0.833333,1,0,-0.036563,0.699465
2015-01-01 06:00:00,0.942902,-0.329983,2.674412,0.325699,-0.500,0.0,0.0,1,0,0,...,0,0,-0.869048,-0.972067,-0.961538,-0.833333,1,0,-0.036563,0.699465
2015-01-01 06:00:00,1.217032,-0.083947,-0.244613,-0.100755,-1.125,0.0,0.0,1,0,0,...,0,0,-0.869048,-0.972067,-0.961538,-0.833333,1,0,-0.036563,0.699465
2015-01-01 06:00:00,-0.130158,-0.101921,-1.019619,0.274789,0.375,0.0,0.0,0,0,1,...,0,0,-0.869048,-0.972067,-0.961538,-0.833333,1,0,-0.036563,0.699465
2015-01-01 06:00:00,-0.833773,-0.792543,-0.786739,-0.683589,0.750,0.0,0.0,0,0,0,...,0,0,0.720238,-0.972067,-0.961538,-0.833333,1,0,-0.036563,0.699465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 04:00:00,0.424745,0.075992,-0.042274,-0.738365,-1.500,0.0,0.0,0,1,0,...,0,0,-0.982143,1.061453,1.038462,1.000000,0,1,-0.048763,0.699569
2015-12-31 05:00:00,0.514235,0.063766,-0.794934,0.096056,0.750,0.0,0.0,0,0,1,...,0,0,-0.988095,1.061453,1.038462,1.000000,0,1,-0.048763,0.699569
2015-12-31 05:00:00,0.611304,1.921189,-0.009370,0.758473,-1.500,0.0,0.0,0,1,0,...,0,0,-0.988095,1.061453,1.038462,1.000000,0,1,-0.048763,0.699569
2015-12-31 05:00:00,1.854235,1.292994,2.491891,1.943800,0.000,0.0,0.0,0,0,1,...,0,0,-0.988095,1.061453,1.038462,1.000000,0,1,-0.048763,0.699569


- Gestion des valeurs manquantes

In [15]:
df_norm.isnull().sum().sum()

368

In [16]:
df_norm.dropna(inplace=True)

In [17]:
df_norm.isnull().sum().sum()

0

# MODELISATION

- Séparation en variables indépendantes et variable cibl

In [18]:
X=df_norm.drop(columns=['time_to_failure_days'])
y=df_norm['time_to_failure_days']

- Séparation en train et test

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- Choix du model

In [20]:
print("\n--- XGBOOST REGRESSOR ---")

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

tscv = TimeSeriesSplit(n_splits=5)

xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42)

random_search_xgb = RandomizedSearchCV(
    xgb_base,
    param_distributions=param_grid_xgb,
    n_iter=10,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search_xgb.fit(X_train, y_train)
print(f"Meilleurs paramètres: {random_search_xgb.best_params_}")

best_xgb = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Évaluation
print("MAE :", mean_absolute_error(y_test, y_pred_xgb))
print("RMSE:", mean_squared_error(y_test, y_pred_xgb, squared=False))
print("R²  :", r2_score(y_test, y_pred_xgb))


--- XGBOOST REGRESSOR ---
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Meilleurs paramètres: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
MAE : 0.14196034252930537
RMSE: 0.19087969892219592
R²  : 0.8914689145888968
