In [1]:
import pandas as pd
from os.path import join
from sklearn.model_selection import train_test_split

In [2]:
raw_data_path = join("..", "data", "raw")

In [3]:
sample_100000_path = join(raw_data_path, "balanced_sample_100000.parquet")
sample_250000_path = join(raw_data_path, "balanced_sample_250000.parquet")

In [4]:
df = pd.read_parquet(sample_100000_path)

df.isna().sum()

YEAR                       0
QUARTER                    0
MONTH                      0
DAY_OF_MONTH               0
DAY_OF_WEEK                0
                       ...  
SECURITY_DELAY         50000
LATE_AIRCRAFT_DELAY    50000
FIRST_DEP_TIME         98476
TOTAL_ADD_GTIME        98476
LONGEST_ADD_GTIME      98476
Length: 64, dtype: int64

In [5]:
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'UNIQUE_CARRIER', 'AIRLINE_ID', 'CARRIER', 'TAIL_NUM', 'FL_NUM',
       'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID',
       'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_FIPS',
       'ORIGIN_STATE_NM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEST_STATE_ABR', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_WAC',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15',
       'DEP_DELAY_GROUP', 'DEP_TIME_BLK', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'ARR_DELAY_NEW', 'ARR_DEL15', 'ARR_DELAY_GROUP', 'ARR_TIME_BLK',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'FLIGHTS', 'DISTANCE',
       'DISTANCE_GROUP', 'CARRIER_DELAY', 'WEATHER_DELAY

In [6]:
features = [
    "MONTH",  # num
    "DAY_OF_MONTH",  # num
    "DAY_OF_WEEK",  # num
    "UNIQUE_CARRIER",  # cat
    # "TAIL_NUM", #
    "FL_NUM",  # cat
    "ORIGIN",  # cat
    "DEST",  # cat
    # celle-ci plutôt que DEP_TIME_BLK pour la comparer DEP_TIME
    "CRS_DEP_TIME",  # num
    "DEP_TIME",  # num
    # celle-ci pour avoir une plus grande finesse que DEP_DEL15
    "DEP_DELAY",  # num
    "TAXI_OUT",  # num
    "WHEELS_OFF",  # num
    "CRS_ARR_TIME",  # num
    "CRS_ELAPSED_TIME",  # num
    "DISTANCE",  # num
]
target = ["ARR_DEL15"]


In [None]:
cat_features = ["UNIQUE_CARRIER", "FL_NUM", "ORIGIN", "DEST"]
num_features = [feat for feat in features if feat not in cat_features]
print(f"cat: {cat_features}")
print(f"num: {num_features}")

cat: ['UNIQUE_CARRIER', 'FL_NUM', 'ORIGIN', 'DEST']
num: ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE']


In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
X = df[features]
y = df[target]

In [10]:
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [11]:
num_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    [("num", num_pipeline, num_features), ("cat", cat_pipeline, cat_features)]
)

In [12]:
X_processed = preprocessor.fit_transform(X_train)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Crée un pipeline complet avec préprocessing + modèle
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", lr)
])

# Mets à jour le param_grid pour le bon nom d’étape
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],
    "classifier__penalty": ["l1", "l2"],
    "classifier__solver": ["liblinear"],
}

grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

grid_search.fit(X_train, y_train.values.ravel())

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],
    "classifier__penalty": ["l1", "l2"],
    "classifier__solver": ["liblinear"],
}
grid_search = GridSearchCV(
    lr,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

In [19]:
grid_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {grid_search.best_params_}")
print(f"Meilleur score CV : {grid_search.best_score_:.4f}")

ValueError: Invalid parameter 'classifier' for estimator LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42). Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'].