In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb

In [2]:
# Import dataset
print("Loading dataset...")
df = pd.read_csv("./assets/flights_usa_2019.csv")  # sep = ";"
pd.set_option('display.max_columns', None)
print("...Done.")
print()

Loading dataset...
...Done.



In [3]:
#  Merge airports list into dataframe

airports = pd.read_csv('./assets/airports_list.csv')
airports.head(1)

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US"


In [4]:
df = pd.merge(df, airports[['ORIGIN_AIRPORT_ID','NAME']], left_on='ORIGIN_AIRPORT_ID', right_on='ORIGIN_AIRPORT_ID')
df.head(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32,NAME
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1643.0,0.0,0.0,1600-1659,1732,1720.0,0.0,1700-1759,0.0,,47.0,37.0,83.0,1,,,,,,,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...


In [5]:
# Merge carriers list into dataframe
carriers = pd.read_csv('./assets/CARRIER_DECODE.csv')
carriers.drop_duplicates(subset=['OP_UNIQUE_CARRIER'], inplace=True)
df = pd.merge(df, carriers, on='OP_UNIQUE_CARRIER', how='left')

df.head(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32,NAME,AIRLINE_ID,CARRIER_NAME
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1643.0,0.0,0.0,1600-1659,1732,1720.0,0.0,1700-1759,0.0,,47.0,37.0,83.0,1,,,,,,,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,20363,Endeavor Air Inc.


In [6]:
df = df.drop([ 'Unnamed: 32'], axis=1)
df = df.rename(columns={'NAME': 'ORIGIN_AIRPORT_NAME'})

df.head(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,ORIGIN_AIRPORT_NAME,AIRLINE_ID,CARRIER_NAME
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1643.0,0.0,0.0,1600-1659,1732,1720.0,0.0,1700-1759,0.0,,47.0,37.0,83.0,1,,,,,,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,20363,Endeavor Air Inc.


In [7]:

# 1. Gérer les valeurs manquantes
cols_to_drop = ['CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
df = df.drop(cols_to_drop, axis=1)

to_median = ['DEP_DELAY_NEW', 'DEP_TIME', 'DEP_DEL15', 'ARR_TIME', 'ARR_DELAY_NEW', 'ACTUAL_ELAPSED_TIME', 'CRS_ELAPSED_TIME']
df[to_median] = df[to_median].fillna(df[to_median].mean())

In [8]:
print(df.shape)
pd.concat([df.isnull().sum(), 100 * df.isnull().sum()/len(df)], axis=1).rename(columns={0:'Missing Records', 1:'Percentage (%)'})

(8467550, 29)


Unnamed: 0,Missing Records,Percentage (%)
MONTH,0,0.0
DAY_OF_MONTH,0,0.0
DAY_OF_WEEK,0,0.0
OP_UNIQUE_CARRIER,0,0.0
TAIL_NUM,20307,0.239821
OP_CARRIER_FL_NUM,0,0.0
ORIGIN_AIRPORT_ID,0,0.0
ORIGIN,0,0.0
ORIGIN_CITY_NAME,0,0.0
DEST_AIRPORT_ID,0,0.0


In [9]:
# 2. Feature engineering
cols_to_drop = ['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'DISTANCE_GROUP', 'CANCELLED', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'AIRLINE_ID', 'CARRIER_NAME']
df = df.drop(cols_to_drop, axis=1)

df["DEP_DEL15"] = (df["DEP_DEL15"] > 0).astype(int)
# df['FLIGHT_DURATION'] = df['CRS_ARR_TIME'] - df['CRS_DEP_TIME']

# # 3. Prétraitement des données
# cat_cols = ['OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN', 'DEST', 'DEP_TIME_BLK', 'ARR_TIME_BLK']
# num_cols = ['CRS_DEP_TIME', 'DEP_TIME', 'CRS_ARR_TIME', 'ARR_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE']



In [13]:
# Séparer les données en X (variables indépendantes) et y (variables cibles)
X = df.drop(['DEP_DELAY_NEW', 'DEP_DEL15', 'ACTUAL_ELAPSED_TIME', 'TAIL_NUM', 'DEP_TIME', 'DEP_TIME_BLK', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW', 'ARR_TIME_BLK', 'ORIGIN_AIRPORT_NAME'], axis=1)
y_classification = df['DEP_DEL15']
y_regression = df['DEP_DELAY_NEW']

In [14]:
X.columns


Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
       'CRS_ELAPSED_TIME', 'DISTANCE'],
      dtype='object')

In [12]:
fgbgr

NameError: name 'fgbgr' is not defined

In [15]:
df['DEP_DEL15'].isna().is_unique

False

In [16]:
# Automatically detect names of numeric/categorical columns
num_cols = []
cat_cols = []

for nom_colonne, dtype in X.dtypes.items():
    if ('float' in str(dtype)) or ('int' in str(dtype)):
        num_cols.append(nom_colonne)
    else:
        cat_cols.append(nom_colonne)

print('numeric features detected : ', num_cols)
print('categorical features détected : ', cat_cols)

numeric features detected :  ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_FL_NUM', 'CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE']
categorical features détected :  ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST']


In [17]:
# Séparer les données en ensembles d'entraînement et de test
X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg = train_test_split(X, y_classification, y_regression, test_size=0.2, random_state=42)


In [18]:
# Create pipeline for numeric features
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # no missing values in categorical data, so we only need the OHE

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Preprocessings on train set
from sklearn.calibration import LabelEncoder

print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
encoder = LabelEncoder()
y_train_class = encoder.fit_transform(y_train_class)
y_train_reg = y_train_reg.values.reshape(-1,1) # Convert to 2D numpy array
print("...Done")

# Preprocessings on test set
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Preparing labels...")
y_test_class = encoder.transform(y_test_class)
y_test_reg = y_test_reg.values.reshape(-1,1) # Convert to 2D numpy array
print("...Done")


Performing preprocessings on train set...
...Done.
  (0, 0)	0.4681675751041413
  (0, 1)	0.3697471691085774
  (0, 2)	-1.4627761201281468
  (0, 3)	-0.7790055303219219
  (0, 4)	0.3635644004706472
  (0, 5)	-0.9595583364050911
  (0, 6)	-0.95943699627236
  (0, 21)	1.0
  (0, 65)	1.0
  (0, 380)	1.0
  (1, 0)	0.4681675751041413
  (1, 1)	0.3697471691085774
  (1, 2)	-1.4627761201281468
  (1, 3)	-0.02095695545888056
  (1, 4)	0.3433542569481696
  (1, 5)	-0.40445958261125153
  (1, 6)	-0.23194034691197998
  (1, 11)	1.0
  (1, 78)	1.0
  (1, 165)	1.0
  (2, 0)	-1.4495664726828819
  (2, 1)	0.4837277134790427
  (2, 2)	-0.46041206485375574
  (2, 3)	-0.8233624611841254
  (2, 4)	0.7273469838752431
  (2, 5)	0.17771715917253145
  (2, 6)	0.0011892611785054349
  (2, 17)	1.0
  (2, 79)	1.0
  (2, 365)	1.0
  (3, 0)	0.7877899164019785
  (3, 1)	-1.5679220851893327
  (3, 2)	-1.4627761201281468
  (3, 3)	1.0964964951081684
  (3, 4)	-1.633197779550135
  (3, 5)	0.3943410630920786
  (3, 6)	0.419499834560724
  (3, 23)	1.0
  (3

In [None]:
# X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg

In [19]:
X.head(1)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE
0,1,6,7,9E,3280,ATL,CSG,1645,47.0,83.0


In [20]:
# Entraînement et évaluation des modèles de classification
from sklearn.metrics import f1_score, precision_score, recall_score


clf1 = LogisticRegression(max_iter=1000)
clf2 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf3 = RandomForestClassifier(n_jobs=-1)

for clf in [clf1, clf2, clf3]:
    clf.fit(X_train, y_train_class)
    y_class_pred = clf.predict(X_test)
    acc = accuracy_score(y_test_class, y_class_pred)
    print(f"{clf.__class__.__name__} accuracy: {acc}")
    print()
    f1 = f1_score(y_test_class, y_class_pred)
    print(f"{clf.__class__.__name__} F1-score: {f1}")
    print()
    precision = precision_score(y_test_class, y_class_pred)
    print(f"{clf.__class__.__name__} Precision: {precision}")
    print()
    recall= recall_score(y_test_class, y_class_pred)
    print(f"{clf.__class__.__name__} Recall: {recall}")
    print()



LogisticRegression accuracy: 0.7935553967794698

LogisticRegression F1-score: 0.016872131737604608

LogisticRegression Precision: 0.4581551618814905

LogisticRegression Recall: 0.008594314001856371

XGBClassifier accuracy: 0.8038635732886136

XGBClassifier F1-score: 0.16055962859908363

XGBClassifier Precision: 0.6813375372670142

XGBClassifier Recall: 0.09100232619432316



In [None]:
# Filtrage des vols retardés pour la régression
from sklearn.base import r2_score


X_train_reg = X_train[y_train_class == 1]
X_test_reg = X_test[y_test_class == 1]
y_reg_train = y_train_reg[y_train_class == 1]
y_reg_test = y_test_reg[y_test_class == 1]

# Entraînement et évaluation des modèles de régression
reg1 = xgb.XGBRegressor(use_label_encoder=False, eval_metric='rmse')
reg2 = RandomForestRegressor(n_jobs=-1)

for reg in [reg1, reg2]:
    reg.fit(X_train_reg, y_reg_train)
    y_reg_pred = reg.predict(X_test_reg)
    mse = mean_squared_error(y_reg_test, y_reg_pred)
    rmse = np.sqrt(mse)
    print(f"{reg.__class__.__name__} RMSE: {rmse}")
    print()
    r2 = r2_score(y_reg_test, y_reg_pred)
    print(f"{reg.__class__.__name__} R2 Score: {r2}")
