# LIBRAIRIES IMPORTATION

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import io
pio.renderers.default = "vscode"  # or "notebook"

import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from IPython.display import display
from PIL import Image
pal = sns.color_palette()

from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier 
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor, SGDClassifier, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


 # File reading and basic exploration 

In [2]:
# # Import dataset
# print("Loading dataset...")
# df = pd.read_csv("./assets/echantillon_df_10.csv")
# pd.set_option('display.max_columns', None)
# print("...Done.")
# print()

# Import dataset
print("Loading dataset...")
df = pd.read_csv("./assets/flights_usa_2019.csv")  # sep = ";"
pd.set_option('display.max_columns', None)
print("...Done.")
print()

Loading dataset...
...Done.



In [3]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 9351550

Display of dataset: 


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1643.0,0.0,0.0,1600-1659,1732,1720.0,0.0,1700-1759,0.0,,47.0,37.0,83.0,1,,,,,,
1,1,7,1,9E,N8970D,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1712.0,0.0,1700-1759,0.0,,47.0,32.0,83.0,1,,,,,,
2,1,8,2,9E,N820AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1719.0,0.0,1700-1759,0.0,,47.0,39.0,83.0,1,,,,,,
3,1,9,3,9E,N840AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1717.0,0.0,1700-1759,0.0,,47.0,37.0,83.0,1,,,,,,
4,1,10,4,9E,N8969A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1721.0,0.0,1700-1759,0.0,,47.0,41.0,83.0,1,,,,,,



Basics statistics: 


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
count,9351550.0,9351550.0,9351550.0,9351550,9329631,9351550.0,9351550.0,9351550,9351550,9351550.0,9351550,9351550,9351550.0,9184434.0,9184400.0,9184400.0,9351550,9351550.0,9174672.0,9153384.0,9351550,9351550.0,173639,9351414.0,9153384.0,9351550.0,9351550.0,1766679.0,1766679.0,1766679.0,1766679.0,1766679.0,0.0
unique,,,,17,5891,,,360,352,,360,352,,,,,19,,,,19,,4,,,,,,,,,,
top,,,,WN,N485HA,,,ATL,"Chicago, IL",,ATL,"Chicago, IL",,,,,0600-0659,,,,2100-2159,,B,,,,,,,,,,
freq,,,,1715215,4557,,,498732,533869,,498763,533827,,,,,700179,,,,589883,,90694,,,,,,,,,,
mean,6.53974,15.75711,3.9208,,,2553.356,12648.69,,,12648.63,,,1330.823,1334.684,14.41948,0.1892718,,1484.798,1460.514,14.4567,,0.01856794,,141.859,136.6522,801.5248,3.679667,21.31858,3.921179,16.7769,0.09332822,27.78371,
std,3.126851,8.774593,1.995751,,,1797.446,1524.394,,,1524.37,,,493.7206,508.5849,48.56696,0.3917244,,522.5834,544.6019,48.29737,,0.1349932,,72.42755,72.67892,593.8144,2.337186,66.52283,32.29774,40.75528,3.274139,53.77315,
min,1.0,1.0,1.0,,,1.0,10135.0,,,10135.0,,,1.0,1.0,0.0,0.0,,1.0,1.0,0.0,,0.0,,1.0,15.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,4.0,8.0,2.0,,,1024.0,11292.0,,,11292.0,,,913.0,913.0,0.0,0.0,,1100.0,1044.0,0.0,,0.0,,90.0,84.0,369.0,2.0,0.0,0.0,0.0,0.0,0.0,
50%,7.0,16.0,4.0,,,2154.0,12889.0,,,12889.0,,,1322.0,1326.0,0.0,0.0,,1515.0,1502.0,0.0,,0.0,,124.0,119.0,640.0,3.0,0.0,0.0,2.0,0.0,3.0,
75%,9.0,23.0,6.0,,,3910.0,13931.0,,,13931.0,,,1738.0,1747.0,7.0,0.0,,1921.0,1917.0,8.0,,0.0,,171.0,167.0,1034.0,5.0,18.0,0.0,20.0,0.0,34.0,



Percentage of missing values: 


MONTH                    0.000000
DAY_OF_MONTH             0.000000
DAY_OF_WEEK              0.000000
OP_UNIQUE_CARRIER        0.000000
TAIL_NUM                 0.234389
OP_CARRIER_FL_NUM        0.000000
ORIGIN_AIRPORT_ID        0.000000
ORIGIN                   0.000000
ORIGIN_CITY_NAME         0.000000
DEST_AIRPORT_ID          0.000000
DEST                     0.000000
DEST_CITY_NAME           0.000000
CRS_DEP_TIME             0.000000
DEP_TIME                 1.787041
DEP_DELAY_NEW            1.787404
DEP_DEL15                1.787404
DEP_TIME_BLK             0.000000
CRS_ARR_TIME             0.000000
ARR_TIME                 1.891430
ARR_DELAY_NEW            2.119071
ARR_TIME_BLK             0.000000
CANCELLED                0.000000
CANCELLATION_CODE       98.143206
CRS_ELAPSED_TIME         0.001454
ACTUAL_ELAPSED_TIME      2.119071
DISTANCE                 0.000000
DISTANCE_GROUP           0.000000
CARRIER_DELAY           81.108169
WEATHER_DELAY           81.108169
NAS_DELAY     

In [4]:
# columns to drop : Drop the columns with > 80% null values
df.drop(columns = ["CANCELLATION_CODE", "LATE_AIRCRAFT_DELAY" , "Unnamed: 32", "SECURITY_DELAY", "CARRIER_DELAY", "NAS_DELAY", "WEATHER_DELAY"], inplace = True)


In [5]:
df.isnull().any()

MONTH                  False
DAY_OF_MONTH           False
DAY_OF_WEEK            False
OP_UNIQUE_CARRIER      False
TAIL_NUM                True
OP_CARRIER_FL_NUM      False
ORIGIN_AIRPORT_ID      False
ORIGIN                 False
ORIGIN_CITY_NAME       False
DEST_AIRPORT_ID        False
DEST                   False
DEST_CITY_NAME         False
CRS_DEP_TIME           False
DEP_TIME                True
DEP_DELAY_NEW           True
DEP_DEL15               True
DEP_TIME_BLK           False
CRS_ARR_TIME           False
ARR_TIME                True
ARR_DELAY_NEW           True
ARR_TIME_BLK           False
CANCELLED              False
CRS_ELAPSED_TIME        True
ACTUAL_ELAPSED_TIME     True
DISTANCE               False
DISTANCE_GROUP         False
dtype: bool

In [6]:
# df.select_dtypes(exclude=["object", "int"]).count()
df.loc[:, df.isnull().any()].count()

TAIL_NUM               9329631
DEP_TIME               9184434
DEP_DELAY_NEW          9184400
DEP_DEL15              9184400
ARR_TIME               9174672
ARR_DELAY_NEW          9153384
CRS_ELAPSED_TIME       9351414
ACTUAL_ELAPSED_TIME    9153384
dtype: int64

In [7]:
# Drop the rest of the null rows

'''
Since, rest of the null values are very lest ~2%, it is better to drop these rows. 
But still to be sure, we can check the information loss.
'''

rows_after_dropping_null = df.dropna(axis=0).shape[0]
rows_before_dropping_null = df.shape[0]

perc_info_loss = (rows_before_dropping_null - rows_after_dropping_null)*100/rows_before_dropping_null
print("Percentage information loss after dropping null values is {:.3f}%".format(perc_info_loss))

Percentage information loss after dropping null values is 2.119%


In [8]:
# for col in df.select_dtypes(exclude=["object", "int"]).columns:
#     df[col] = df[col].fillna(method='ffill') # Attention car plusieurs compagnies
#     df[col] = df[col].fillna(method='bfill') #

# df.select_dtypes(exclude=["object", "int"]).isnull().any()

In [9]:
for col in df.loc[:, df.isnull().any()].iloc[:, 1:].columns:
    df[[col]] = df[[col]].apply(lambda x: x.fillna(x.median()))
    
df.loc[:, df.isnull().any()].count().any()


True

In [10]:
df.dropna(axis=0, inplace=True)
df.isnull().any()

MONTH                  False
DAY_OF_MONTH           False
DAY_OF_WEEK            False
OP_UNIQUE_CARRIER      False
TAIL_NUM               False
OP_CARRIER_FL_NUM      False
ORIGIN_AIRPORT_ID      False
ORIGIN                 False
ORIGIN_CITY_NAME       False
DEST_AIRPORT_ID        False
DEST                   False
DEST_CITY_NAME         False
CRS_DEP_TIME           False
DEP_TIME               False
DEP_DELAY_NEW          False
DEP_DEL15              False
DEP_TIME_BLK           False
CRS_ARR_TIME           False
ARR_TIME               False
ARR_DELAY_NEW          False
ARR_TIME_BLK           False
CANCELLED              False
CRS_ELAPSED_TIME       False
ACTUAL_ELAPSED_TIME    False
DISTANCE               False
DISTANCE_GROUP         False
dtype: bool

## Normalize dataset 

Let's normalize our dataset before going anywhere:

In [11]:
df["DEP_DELAY"] = df["DEP_DELAY_NEW"].apply(lambda x : 1 if x > 0 else 0)
df["ARR_DELAY"] = df["ARR_DELAY_NEW"].apply(lambda x : 1 if x > 0 else 0)
df.loc[:, ["DEP_DELAY_NEW", "ARR_DELAY_NEW"]] = df.select_dtypes(exclude = ["object"]).loc[:, ["DEP_DELAY_NEW", "ARR_DELAY_NEW"]]\
                                                  .apply(lambda x: np.where(x > 0, np.log(x), 0))

df.head()


divide by zero encountered in log



Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,ARR_DELAY
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1643.0,0.0,0.0,1600-1659,1732,1720.0,0.0,1700-1759,0.0,47.0,37.0,83.0,1,0,0
1,1,7,1,9E,N8970D,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1712.0,0.0,1700-1759,0.0,47.0,32.0,83.0,1,0,0
2,1,8,2,9E,N820AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1719.0,0.0,1700-1759,0.0,47.0,39.0,83.0,1,0,0
3,1,9,3,9E,N840AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1717.0,0.0,1700-1759,0.0,47.0,37.0,83.0,1,0,0
4,1,10,4,9E,N8969A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1645,1640.0,0.0,0.0,1600-1659,1732,1721.0,0.0,1700-1759,0.0,47.0,41.0,83.0,1,0,0


In [12]:
# import math

# for index, row in df.iterrows():
#     df.at[index, 'CRS_DEP_TIME'] = math.floor(row['CRS_DEP_TIME'] / 100)

# Définir une fonction pour formater les valeurs en heures
def format_heure(valeur):
    heure = str(valeur // 100).zfill(2)
    minute = str(valeur % 100).zfill(2)
    return "{} h {}".format(heure, minute)

# Définir une fonction pour convertir en l'heure en minutes 
def format_minutes(format_heure):
    minutes = format_heure // 100 * 60 + format_heure % 100
    return minutes

# Appliquer la fonction à ces colonnes

df['DEP_TIME'] = df['DEP_TIME'].astype(int).apply(format_minutes)
df['ARR_TIME'] = df['ARR_TIME'].astype(int).apply(format_minutes)
df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].apply(format_minutes)
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].apply(format_minutes)

# Afficher le DataFrame résultant
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,ARR_DELAY
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1005,1003,0.0,0.0,1600-1659,1052,1040,0.0,1700-1759,0.0,47.0,37.0,83.0,1,0,0
1,1,7,1,9E,N8970D,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1005,1000,0.0,0.0,1600-1659,1052,1032,0.0,1700-1759,0.0,47.0,32.0,83.0,1,0,0
2,1,8,2,9E,N820AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1005,1000,0.0,0.0,1600-1659,1052,1039,0.0,1700-1759,0.0,47.0,39.0,83.0,1,0,0
3,1,9,3,9E,N840AY,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1005,1000,0.0,0.0,1600-1659,1052,1037,0.0,1700-1759,0.0,47.0,37.0,83.0,1,0,0
4,1,10,4,9E,N8969A,3280,10397,ATL,"Atlanta, GA",11150,CSG,"Columbus, GA",1005,1000,0.0,0.0,1600-1659,1052,1041,0.0,1700-1759,0.0,47.0,41.0,83.0,1,0,0


In [13]:
# Get the list of all column names from headers
column_headers = df.columns.tolist()
print("The Column Header :", column_headers, f"\nTotal: {len(list(df.columns))} columns")

The Column Header : ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST', 'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_TIME_BLK', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW', 'ARR_TIME_BLK', 'CANCELLED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'DISTANCE_GROUP', 'DEP_DELAY', 'ARR_DELAY'] 
Total: 28 columns


In [14]:
# Choose the columns you want to have as your features
features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST', 'DEST_CITY_NAME', 
                 'CRS_DEP_TIME', 'DEP_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 
                 'DISTANCE_GROUP'
                ]

X = df.loc[:,features_list] # We add feature_list into our loc 


Using <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html" target="_blank">`select_dtypes`</a> from Pandas, filter out all categorical variables. 

In [15]:
df_num = X.select_dtypes(exclude = ["object"]); df_num.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,CRS_DEP_TIME,DEP_TIME,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP
0,1,6,7,3280,10397,11150,1005,1003,47.0,37.0,83.0,1
1,1,7,1,3280,10397,11150,1005,1000,47.0,32.0,83.0,1
2,1,8,2,3280,10397,11150,1005,1000,47.0,39.0,83.0,1
3,1,9,3,3280,10397,11150,1005,1000,47.0,37.0,83.0,1
4,1,10,4,3280,10397,11150,1005,1000,47.0,41.0,83.0,1


## ACP

Dans le sous-module `decomposition`, nous allons importer la fonction [`PCA()`](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html). Le fonctionnement de celle-ci est similaire à toutes les autres dans `scikit-learn`. 

1. On créé d'abord un objet (nommé ici `pca`) qui va contenir le résultat de l'ACP. Dans la fonction `PCA()`, on pourra aussi indiquer les paramètres tels que le nombre de composants à retenir (`n_components` - ici on garde tout).
2. Ensuite, on ajuste (*fit*  en anglais) sur des données avec la fonction `fit()` de l'objet précédemment créé. Dans cette fonction, nous devons donc passer les données à utiliser (variables uniquement quantitatives).

Si l'on souhaite une ACP normée (ce qui est notre cas ici), il nous faut standardiser les données en amont, avec par exemple la fonction `scale()` du sous-module `preprocessing` (importée aussi ici). Bien évidemment, il ne faut passer que des variables quantitatives, sans données manquantes.

In [16]:
# Import Standard Scaler
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
# StandardScaler will substract mean and divide by standard deviation to each observation
sc = StandardScaler()

# Apply StandardScaler to X
X = sc.fit_transform(df_num)

# Visualize first five rows 
# Standard scaler transform X as numpy array. Therefore you can't use .head()
X[:5]

array([[-1.7730186 , -1.11209136,  1.54250252,  0.40355687, -1.47686981,
        -0.98296403,  0.6580584 ,  0.6282336 , -1.30915283, -1.37886621,
        -1.20947301, -1.14604024],
       [-1.7730186 , -0.99810926, -1.46367096,  0.40355687, -1.47686981,
        -0.98296403,  0.6580584 ,  0.61831956, -1.30915283, -1.44828245,
        -1.20947301, -1.14604024],
       [-1.7730186 , -0.88412716, -0.96264205,  0.40355687, -1.47686981,
        -0.98296403,  0.6580584 ,  0.61831956, -1.30915283, -1.35109971,
        -1.20947301, -1.14604024],
       [-1.7730186 , -0.77014505, -0.46161313,  0.40355687, -1.47686981,
        -0.98296403,  0.6580584 ,  0.61831956, -1.30915283, -1.37886621,
        -1.20947301, -1.14604024],
       [-1.7730186 , -0.65616295,  0.03941578,  0.40355687, -1.47686981,
        -0.98296403,  0.6580584 ,  0.61831956, -1.30915283, -1.32333321,
        -1.20947301, -1.14604024]])

In [17]:
# import PCA 
from sklearn.decomposition import PCA

# Instanciate PCA 
# We can even specify how many components we want within the class
pca = PCA()

# Fit pca to our model 
PC = pca.fit_transform(X)

# Visualize first five rows
PC[:5]

array([[-2.69127282e+00, -8.90825493e-01,  3.66709436e-01,
        -2.67468546e+00,  2.99459474e-01, -1.01728772e+00,
        -9.03477717e-02, -8.87603250e-01, -1.66160476e-01,
        -1.21170853e-01, -3.46023825e-02, -1.61388797e-03],
       [-2.73435251e+00, -8.67487497e-01, -1.99057323e+00,
        -1.62679493e+00, -7.19518923e-01, -1.22317770e+00,
        -3.80368888e-02,  2.56290797e-01, -2.24479468e-01,
        -1.60854896e-01, -6.02172550e-03, -7.86718213e-04],
       [-2.68580461e+00, -8.70591715e-01, -1.55031122e+00,
        -1.71253696e+00, -5.54759616e-01, -1.27553304e+00,
        -4.66022797e-02,  7.61538171e-02, -1.68398033e-01,
        -1.09701615e-01, -4.73721551e-02, -6.82420300e-03],
       [-2.69753827e+00, -8.72771521e-01, -1.11555567e+00,
        -1.79263116e+00, -3.85835050e-01, -1.32496070e+00,
        -5.50568178e-02, -1.18028848e-01, -1.80986391e-01,
        -1.22296705e-01, -3.54407420e-02, -4.27048867e-03],
       [-2.66908422e+00, -8.75567602e-01, -6.7712914

In [18]:
# Here are our eigen vectors
pca.components_

array([[-3.24560520e-03,  2.91019730e-04,  3.25146337e-03,
        -2.10172054e-01,  5.44720792e-02,  6.42343330e-02,
        -1.32089507e-02, -1.52662604e-02,  4.88101492e-01,
         4.82448327e-01,  4.89878387e-01,  4.87160586e-01],
       [ 1.53721123e-03,  1.90464355e-03, -5.19396437e-03,
         8.93515682e-03,  4.43136706e-02, -4.72170583e-02,
        -7.05523281e-01, -7.05401561e-01, -6.88697289e-03,
        -7.39830102e-03, -1.12924572e-02, -1.04831021e-02],
       [ 3.46307925e-01,  3.27905054e-01,  7.95570655e-01,
         3.48462522e-01, -9.10472975e-02, -5.53173389e-02,
        -1.40561934e-03, -2.47888298e-03,  4.57206219e-02,
         4.40696370e-02,  3.76715988e-02,  3.71605120e-02],
       [ 4.14919011e-01,  6.97755232e-01, -3.21100998e-01,
        -1.81693158e-01,  3.83554711e-01,  2.28129424e-01,
         7.00860965e-03,  6.46209172e-03, -4.33163259e-02,
        -4.52009856e-02, -3.04992160e-02, -2.76374969e-02],
       [-5.36241072e-01, -1.66022135e-02,  3.3908483

### Variance expliquée

L'objet `pca` comprend maintenant un certain nombre d'objets et de fonctions. Le premier objet est le tableau des variances expliquées (`explained_variance_` - proportionnelles aux valeurs propres) par chaque dimension, et le ratio (proportion) de variance expliquée par dimension (`explained_variance_ratio_`).

In [19]:
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

[4.08001721 1.9489784  1.04176968 0.99690313 0.99579316 0.9926552
 0.98185367 0.8251259  0.05665892 0.05225468 0.01838981 0.00960151]
[0.3400014  0.16241485 0.08681413 0.08307525 0.08298275 0.08272126
 0.08182113 0.06876048 0.00472158 0.00435456 0.00153248 0.00080013]


Bien évidemment, il est possible (et préférable) de faire un tableau récapitulatif, avec les variances expliquées, les proportions de variance expliquée simples et cumulées. Voici un petit code permettant de faire ceci.

In [20]:
eig = pd.DataFrame(
    {
        "Dimension" : ["Dim" + str(x + 1) for x in range(12)], 
        "Variance expliquée" : pca.explained_variance_,
        "% variance expliquée" : np.round(pca.explained_variance_ratio_ * 100, decimals=2),
        "% cum. var. expliquée" : np.round(np.cumsum(pca.explained_variance_ratio_ * 100), decimals=2)
    }
)
eig

Unnamed: 0,Dimension,Variance expliquée,% variance expliquée,% cum. var. expliquée
0,Dim1,4.080017,34.0,34.0
1,Dim2,1.948978,16.24,50.24
2,Dim3,1.04177,8.68,58.92
3,Dim4,0.996903,8.31,67.23
4,Dim5,0.995793,8.3,75.53
5,Dim6,0.992655,8.27,83.8
6,Dim7,0.981854,8.18,91.98
7,Dim8,0.825126,6.88,98.86
8,Dim9,0.056659,0.47,99.33
9,Dim10,0.052255,0.44,99.77


## Model I : Classification

In [21]:
 # Separate target variable Y from features X
## Choose the columns you want to have as your features

features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST', 
                 'DEST_CITY_NAME','CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 
                 'DISTANCE', 
                ]

target = ['DEP_DELAY']

print("Splitting dataset into X and y...")
X = df.loc[:, features_list] # We add feature_list into our loc 
y = df.loc[:, target] # We set a target variable for the molel 
print("...Done.")

Splitting dataset into X and y...
...Done.


In [22]:
# Divide dataset Train set & Test set 
## First we import train_test_split


print("Splitting dataset into train set and test set...")
## Then we use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                          random_state=0,
                                                          stratify=y
                                                   )

print("...Done.")   

Splitting dataset into train set and test set...
...Done.


In [23]:
y_train.value_counts()*100/len(y_train)


DEP_DELAY
0            65.755785
1            34.244215
Name: count, dtype: float64

In [24]:
X_train.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE
7016591,9,10,2,DL,1704,11697,FLL,11433,DTW,"Detroit, MI",1074,177.0,1127.0
8415821,11,12,2,F9,1045,11042,CLE,13204,MCO,"Orlando, FL",630,153.0,895.0
6874091,8,6,2,OO,3017,11423,DSM,13930,ORD,"Chicago, IL",579,89.0,299.0
1621355,3,26,2,WN,2359,10821,BWI,10529,BDL,"Hartford, CT",425,75.0,283.0
542809,1,20,7,DL,1069,11298,DFW,10397,ATL,"Atlanta, GA",959,120.0,731.0


In [26]:
# Encoding categorical features and standardizing numeric features
print("#### X_train BEFORE preprocessing ####")
print(X_train.head())
print()

print("Encoding categorical features and standardizing numerical features...")
## First let's import libraries
## StandardScaler to scale data (i.e apply Z-score)
## OneHotEncoder to encode categorical variables


numeric_features = ["MONTH","DAY_OF_MONTH","DAY_OF_WEEK","CRS_DEP_TIME","CRS_ELAPSED_TIME","DISTANCE"] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()


categorical_features = ["ORIGIN","DEST"]
categorical_transformer = OneHotEncoder(drop='first', handle_unknown = 'ignore')   # drop='first', handle_unknown = 'ignore'

# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)
print("...Done.")
print("#### X_train AFTER preprocessing ####")
print(X_train[0:5,:]) # print first 5 rows (not using iloc since now X_train became a numpy array)
print()

#### X_train BEFORE preprocessing ####
         MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER   
7016591      9            10            2                DL  \
8415821     11            12            2                F9   
6874091      8             6            2                OO   
1621355      3            26            2                WN   
542809       1            20            7                DL   

         OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  DEST_AIRPORT_ID DEST   
7016591               1704              11697    FLL            11433  DTW  \
8415821               1045              11042    CLE            13204  MCO   
6874091               3017              11423    DSM            13930  ORD   
1621355               2359              10821    BWI            10529  BDL   
542809                1069              11298    DFW            10397  ATL   

        DEST_CITY_NAME  CRS_DEP_TIME  CRS_ELAPSED_TIME  DISTANCE  
7016591    Detroit, MI          1074          

In [27]:
### Testing pipeline ###
print("--- Testing pipeline ---")

# Standardizing numeric features
print("Standardizing numerical features...")
print(X_test)
print()

X_test = feature_encoder.transform(X_test)

print("...Done.")
print(X_test[:5]) # print first 5 rows (not using iloc since now X_test became a numpy array)
print()

--- Testing pipeline ---
Standardizing numerical features...
         MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER   
2480011      4             5            5                DL  \
1465519      3            23            6                MQ   
4295881      7             5            5                F9   
1617645      3            22            5                OO   
5014285      8            18            7                DL   
...        ...           ...          ...               ...   
4607814      7            22            1                YX   
9071726     12            15            7                DL   
347772       1            14            1                UA   
1160305      3             5            2                DL   
4139904      6            21            5                WN   

         OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  DEST_AIRPORT_ID DEST   
2480011               1837              14869    SLC            13487  MSP  \
1465519               3535

In [28]:
# # Train model
# print("Train model...")
# # This code creates a Logistic Regression classifier object and then fits it to the training data 
# # (X_train and Y_train). The print statement is simply a notification that the fitting process is complete.
# classifier = LogisticRegression(solver='liblinear', max_iter=1000) # 'liblinear' plus adapté sur la classif binaire
# classifier.fit(X_train, y_train.values.ravel())
# print("...Done.")

# Train models using different classifiers and print their performance score
sgdc = SGDClassifier(random_state=0)
dtc = DecisionTreeClassifier(random_state=0)
knc = KNeighborsClassifier(n_neighbors=2)

# Let's see what we get with the wisdom of the crowd
vc = VotingClassifier([('SGD', sgdc), 
                            ('Tree', dtc), 
                            ('KNN', knc)], voting = 'hard')

for model in (sgdc, dtc, knc, vc):
    model.fit(X_train, y_train.values.ravel())
    print(model.__class__.__name__, model.score(X_test, y_test.values.ravel()))



SGDClassifier 0.6587179203184119


**Bagging**

In [None]:
model = BaggingClassifier(base_estimator = KNeighborsClassifier)


In [None]:
# # Predictions on training set
# print("Predictions on training set...")
# y_train_pred = classifier.predict(X_train)
# print("...Done.")
# print(y_train_pred[:5]) # print first 5 rows (not using iloc since now y_train became a numpy array)
# print()

Predictions on training set...
...Done.
[0 0 0 0 0]



In [None]:
# # Predictions on test set
# print("Predictions on test set...")
# y_test_pred = classifier.predict(X_test)
# print("...Done.")
# print(y_test_pred[:5])
# print()

--- Testing pipeline ---
Standardizing numerical features...
        MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER TAIL_NUM  \
392851      7            30            2                DL   N387DA   
114536      8            27            2                OH   N570NN   
23152       4            27            6                WN   N905WN   
295103      3             4            1                G4    334NV   
767255      4            26            5                WN   N7731A   
...       ...           ...          ...               ...      ...   
420310     12            18            3                WN   N8570W   
806557      4            30            2                UA   N19130   
496471      1             7            1                WN   N913WN   
172541     12            26            4                AA   N979UY   
469962      6             7            5                YX   N806MD   

        OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN   ORIGIN_CITY_NAME_x  \
392851    

In [None]:
# # Print scores
# print("accuracy on training set : ", accuracy_score(y_train, y_train_pred))
# print("accuracy on test set : ", accuracy_score(y_test, y_test_pred))
# print()

# print("f1-score on training set : ", f1_score(y_train, y_train_pred))
# print("f1-score on test set : ", f1_score(y_test, y_test_pred))
# print()

accuracy on training set :  0.7056390265271932
accuracy on test set :  0.7042463636847994

f1-score on training set :  0.39770583443610347
f1-score on test set :  0.39342908830421425



In [None]:
# # You can also check more performance metrics to better understand what your model is doing
# print("Confusion matrix on train set : ")
# print(confusion_matrix(y_train, y_train_pred))
# print()
# print("Confusion matrix on test set : ")
# print(confusion_matrix(y_test, y_test_pred))
# print()

Confusion matrix on train set : 
[[399028  40338]
 [151198  85120]]

Confusion matrix on test set : 
[[98746 11096]
 [39196 19883]]



## Model II : GLM (Poisson Regression)

In [None]:
 # Separate target variable Y from features X
## Choose the columns you want to have as your features
features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_CITY_NAME_x', 'DEST_AIRPORT_ID', 'DEST', 
                 'DEST_CITY_NAME','CRS_DEP_TIME', 'DEP_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 
                 'DISTANCE', 'DISTANCE_GROUP', 'DISPLAY_AIRPORT_NAME', 'NAME'
                ]

target = ['DEP_DELAY_NEW']

print("Splitting dataset into X and y...")
X = df.loc[:, features_list] # We add feature_list into our loc 
y = df.loc[:, target] # We set a target variable for the molel 
print("...Done.")

Splitting dataset into X and y...
...Done.


In [None]:
# Divide dataset Train set & Test set 
## First we import train_test_split


print("Splitting dataset into train set and test set...")
## Then we use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                          random_state=0
                                                   )

print("...Done.")   


Splitting dataset into train set and test set...
...Done.


In [None]:
# Encoding categorical features and standardizing numeric features
print("#### X_train BEFORE preprocessing ####")
print(X_train.head())
print()

print("Encoding categorical features and standardizing numerical features...")
## First let's import libraries
## StandardScaler to scale data (i.e apply Z-score)
## OneHotEncoder to encode categorical variables


numeric_features = ["MONTH","DAY_OF_MONTH","DAY_OF_WEEK","CRS_DEP_TIME","DEP_TIME","CRS_ELAPSED_TIME","ACTUAL_ELAPSED_TIME","DISTANCE"] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()


categorical_features = ["ORIGIN","DEST"]
categorical_transformer = OneHotEncoder(drop='first', handle_unknown = 'ignore')   # drop='first', handle_unknown = 'ignore'

# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)
print("...Done.")
print("#### X_train AFTER preprocessing ####")
print(X_train[0:5,:]) # print first 5 rows (not using iloc since now X_train became a numpy array)
print()

#### X_train BEFORE preprocessing ####
        MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER TAIL_NUM  \
166518      7            26            5                DL   N713TW   
364784      1             4            5                G4    314NV   
434001      5             3            5                AA   N771AN   
95693       9            26            4                DL   N345DN   
228223      4            12            5                DL   N106DU   

        OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN     ORIGIN_CITY_NAME_x  \
166518                520              12478    JFK           New York, NY   
364784                705              14679    SAN          San Diego, CA   
434001                102              12173    HNL           Honolulu, HI   
95693                2201              11433    DTW            Detroit, MI   
228223               2500              11298    DFW  Dallas/Fort Worth, TX   

        DEST_AIRPORT_ID DEST         DEST_CITY_NAME  CRS_DEP_TIME

In [None]:
### Testing pipeline ###
print("--- Testing pipeline ---")

# Standardizing numeric features
print("Standardizing numerical features...")
print(X_test)
print()

X_test = feature_encoder.transform(X_test)

print("...Done.")
print(X_test[:5]) # print first 5 rows (not using iloc since now X_test became a numpy array)
print()

In [None]:
# # Train model
# print("Train model...")
# # This code creates a Logistic Regression classifier object and then fits it to the training data 
# # (X_train and Y_train). The print statement is simply a notification that the fitting process is complete.
# regressor = LinearRegression() # This steps is the actual training !
# regressor.fit(X_train, y_train.values.ravel()) # .values.ravel()
# print("...Done.")

# Train models using different regressors and print their performance score
model_1 = SGDRegressor(random_state=0)
model_2 = DecisionTreeRegressor(random_state=0)
model_3 = KNeighborsRegressor(n_neighbors=2)

# Let's see what we get with the wisdom of the crowd
model_4 = VotingRegressor([('SGD', model_1), 
                            ('Tree', model_2), 
                            ('KNN', model_3)],
                            voting = 'mean')

for model in (model_1, model_2, model_3, model_4):
    model.fit(X_train, y_train.values.ravel())
    print(model.__class__.__name__, model.score(X_test, y_test.values.ravel()))

Train model...
...Done.


In [None]:
# # Predictions on training set
# print("Predictions on training set...")
# y_train_pred = regressor.predict(X_train)
# print("...Done.")
# print(y_train_pred[:5]) # print first 5 rows (not using iloc since now y_train became a numpy array)
# print()

Predictions on training set...
...Done.
[1.47846423 0.88514098 1.54095205 1.33061533 1.22666811]



In [None]:
# # Predictions on test set
# print("Predictions on test set...")
# y_test_pred = regressor.predict(X_test)
# print("...Done.")
# print(y_test_pred[:5])
# print()

--- Testing pipeline ---
Standardizing numerical features...
        MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER TAIL_NUM  \
153504      4            14            7                DL   N380DA   
544804      2            27            3                AA   N960NN   
233141      1             2            3                NK   N625NK   
84813       8             9            5                AS   N428AS   
54387       3            10            7                NK   N502NK   
...       ...           ...          ...               ...      ...   
485526      5             7            2                WN   N910WN   
494429      8            19            1                NK   N690NK   
759247      8             5            1                DL   N596NW   
438190      4            29            1                B6   N266JB   
615959      7             2            2                WN   N729SW   

        OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  \
153504                913      

In [None]:
# # Performance assessment
# print("--- Assessing the performances of the model ---")

# # Print R^2 scores
# print("D² the deviance of GLM on training set : ", regressor.score(X_train, y_train.values.ravel()))
# print("D² the deviance of GLM on test set : ", regressor.score(X_test, y_test.values.ravel()))

--- Assessing the performances of the model ---
D² the deviance of GLM on training set :  0.04780535995495472
D² the deviance of GLM on test set :  0.048515045088213804
