In [2]:
import pandas as pd
import numpy as np
import joblib
import optuna

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==========================
# 1️⃣  CHARGEMENT & NETTOYAGE
# ==========================
df = pd.read_csv('../projet_final/flights_sample_3m.csv')

# Filtrer un seul aéroport
airport = 'JFK'
df = df[df['ORIGIN'] == airport]

# Supprimer les vols annulés
df = df[df['CANCELLED'] == 0]
df.drop(columns=['CANCELLED', 'CANCELLATION_CODE'], inplace=True)

# Colonnes temporelles au format hhmm
time_hhmm_cols = ['WHEELS_ON','WHEELS_OFF', 'ARR_TIME', 'CRS_ARR_TIME','CRS_DEP_TIME','DEP_TIME']

# Colonnes temporelles en minutes
time_cols = ['TAXI_IN', 'AIR_TIME', 'ELAPSED_TIME', 'ARR_DELAY',
             'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
             'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']

# Conversion des colonnes hhmm en entiers (si elles sont lues comme float)
for col in time_hhmm_cols:
    df[col] = df[col].fillna(0).astype(int)

# Fonction pour soustraire des minutes d'un format hhmm
def subtract_minutes(hhmm, minutes):
    if pd.isna(hhmm) or pd.isna(minutes):
        return np.nan  # Évite les erreurs si des valeurs sont NaN

    hours = hhmm // 100
    mins = hhmm % 100
    total_mins = hours * 60 + mins - minutes
    
    if total_mins < 0:  # Gérer les cas où l'heure passe à la veille
        total_mins += 24 * 60

    new_hours = (total_mins // 60) % 24  # Pour éviter des erreurs sur 24h
    new_mins = total_mins % 60
    return new_hours * 100 + new_mins  # Retourne en format hhmm

# Remplacement des valeurs nulles pour ARR_TIME avec CRS_ARR_TIME
df['ARR_TIME'] = df['ARR_TIME'].fillna(df['CRS_ARR_TIME'])

# Remplacement des valeurs nulles pour WHEELS_ON
df['WHEELS_ON'] = df.apply(lambda row: subtract_minutes(row['CRS_ARR_TIME'], row['TAXI_IN']) 
                           if pd.notna(row['TAXI_IN']) and pd.isna(row['WHEELS_ON']) 
                           else row['WHEELS_ON'], axis=1)

# Remplacement des valeurs nulles restantes pour WHEELS_ON et ARR_TIME avec la médiane
for col in time_hhmm_cols:
    df[col] = df[col].fillna(df[col].median()).astype(int)

# Remplacement des valeurs manquantes pour les durées en minutes
df[time_cols] = df[time_cols].fillna(df[time_cols].median())

# Convertir la date
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

# Extraire des nouvelles features temporelles
df['YEAR'] = df['FL_DATE'].dt.year
df['MONTH'] = df['FL_DATE'].dt.month
df['DAY'] = df['FL_DATE'].dt.day
df['DAY_OF_WEEK'] = df['FL_DATE'].dt.dayofweek
df['HOUR'] = df['CRS_DEP_TIME'] // 100
df['IS_WEEKEND'] = df['DAY_OF_WEEK'].apply(lambda x: 1 if x in [5, 6] else 0)
df['SEASON'] = df['MONTH'].map({12: 'Winter', 1: 'Winter', 2: 'Winter',
                                 3: 'Spring', 4: 'Spring', 5: 'Spring',
                                 6: 'Summer', 7: 'Summer', 8: 'Summer',
                                 9: 'Autumn', 10: 'Autumn', 11: 'Autumn'})

# ==========================
# 2️⃣  SELECTION DES FEATURES
# ==========================
features = ['AIRLINE_CODE', 'DEST', 'CRS_DEP_TIME','DEP_TIME','DEP_DELAY', 'YEAR', 'MONTH', 
            'DAY', 'DAY_OF_WEEK', 'HOUR', 'IS_WEEKEND', 'SEASON', 'DELAY_DUE_CARRIER', 
            'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY', 
            'DELAY_DUE_LATE_AIRCRAFT', 'WHEELS_OFF', 'WHEELS_ON', 'ELAPSED_TIME', 'CRS_ELAPSED_TIME']
target = 'ARR_DELAY'

X = df[features]
y = df[target]

# Séparer en train/val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ==========================
# 3️⃣  PIPELINE DE TRANSFORMATION
# ==========================
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# ==========================
# 4️⃣  OPTIMISATION DES HYPERPARAMÈTRES (GRIDSEARCH + OPTUNA)
# ==========================
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500, step=50)
    max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    
    return mean_absolute_error(y_val, y_pred)

# Lancer Optuna pour trouver les meilleurs paramètres
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Augmente le nombre d'essais si nécessaire

best_params = study.best_params
print("\nMeilleurs hyperparamètres trouvés par Optuna:", best_params)

# ==========================
# 5️⃣  ENTRAÎNEMENT FINAL
# ==========================
best_model = RandomForestRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

print("\n🔄 Entraînement du modèle final avec les meilleurs hyperparamètres...")
pipeline.fit(X_train, y_train)
print("✅ Modèle final entraîné avec succès !")

# ==========================
# 6️⃣  ÉVALUATION DU MODÈLE
# ==========================
y_pred = pipeline.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print(f"\n📊 **Évaluation du modèle final:**")
print(f"MAE (Erreur absolue moyenne) : {mae:.2f} min")
print(f"MSE (Erreur quadratique moyenne) : {mse:.2f}")
print(f"RMSE (Racine de l'erreur quadratique moyenne) : {rmse:.2f} min")
print(f"R² Score : {r2:.4f}")

# ==========================
# 7️⃣  SAUVEGARDE DU MODÈLE
# ==========================
joblib.dump(pipeline, "best_flight_delay_model2.pkl")
print("\n✅ Modèle sauvegardé sous 'best_flight_delay_model2.pkl'")


[I 2025-03-17 18:39:11,830] A new study created in memory with name: no-name-6e45a0a9-7f8e-417c-8bba-dfc0940b647a
[I 2025-03-17 18:41:36,560] Trial 0 finished with value: 7.422996880662696 and parameters: {'n_estimators': 350, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 0 with value: 7.422996880662696.
[I 2025-03-17 18:41:46,323] Trial 1 finished with value: 17.652378344333144 and parameters: {'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 7.422996880662696.
[I 2025-03-17 18:43:03,361] Trial 2 finished with value: 9.429762614417559 and parameters: {'n_estimators': 400, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 7.422996880662696.
[I 2025-03-17 18:44:12,557] Trial 3 finished with value: 10.985803789690799 and parameters: {'n_estimators': 250, 'max_depth': 20, 'min_samp


Meilleurs hyperparamètres trouvés par Optuna: {'n_estimators': 300, 'max_depth': 25, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': None}

🔄 Entraînement du modèle final avec les meilleurs hyperparamètres...
✅ Modèle final entraîné avec succès !

📊 **Évaluation du modèle final:**
MAE (Erreur absolue moyenne) : 1.85 min
MSE (Erreur quadratique moyenne) : 19.34
RMSE (Racine de l'erreur quadratique moyenne) : 4.40 min
R² Score : 0.9937

✅ Modèle sauvegardé sous 'best_flight_delay_model2.pkl'
