# LIBRAIRIES IMPORTATION

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import io
pio.renderers.default = "vscode"  # or "notebook"

import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from IPython.display import display
from PIL import Image
pal = sns.color_palette()

from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier 
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor, SGDClassifier, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


 # File reading and basic exploration 

In [2]:
# # Import dataset
# print("Loading dataset...")
# df = pd.read_csv("./assets/echantillon_df_10.csv")
# pd.set_option('display.max_columns', None)
# print("...Done.")
# print()

# Import dataset
print("Loading dataset...")
df = pd.read_csv("./assets/flights_usa_2019.csv")  # sep = ";"
pd.set_option('display.max_columns', None)
print("...Done.")
print()

Loading dataset...
...Done.



In [3]:
random_indices = np.random.choice(df.index, size=int(len(df)*0.1), replace=False)
df = df.loc[random_indices]

# Order by month
df = df.sort_values('MONTH')

In [4]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 935155

Display of dataset: 


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
477919,1,1,2,AA,N9677W,2297,12278,ICT,"Wichita, KS",11298,DFW,"Dallas/Fort Worth, TX",600,552.0,0.0,0.0,0600-0659,735,721.0,0.0,0700-0759,0.0,,95.0,89.0,328.0,2,,,,,,
14315,1,7,1,WN,N7874B,1827,14107,PHX,"Phoenix, AZ",11292,DEN,"Denver, CO",1230,1226.0,0.0,0.0,1200-1259,1420,1407.0,0.0,1400-1459,0.0,,110.0,101.0,602.0,3,,,,,,
402319,1,18,5,B6,N334JB,102,12478,JFK,"New York, NY",10792,BUF,"Buffalo, NY",1837,2017.0,100.0,1.0,1800-1859,2012,2201.0,109.0,2000-2059,0.0,,95.0,104.0,301.0,2,100.0,0.0,9.0,0.0,0.0,
356635,1,11,5,OO,N259SY,4027,10713,BOI,"Boise, ID",14869,SLC,"Salt Lake City, UT",900,858.0,0.0,0.0,0900-0959,1024,1018.0,0.0,1000-1059,0.0,,84.0,80.0,290.0,2,,,,,,
325924,1,29,2,DL,N977DL,1778,10397,ATL,"Atlanta, GA",12278,ICT,"Wichita, KS",2009,2005.0,0.0,0.0,2000-2059,2134,2139.0,5.0,2100-2159,0.0,,145.0,154.0,782.0,4,,,,,,



Basics statistics: 


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
count,935155.0,935155.0,935155.0,935155,932935,935155.0,935155.0,935155,935155,935155.0,935155,935155,935155.0,918301.0,918300.0,918300.0,935155,935155.0,917361.0,915197.0,935155,935155.0,17495,935134.0,915197.0,935155.0,935155.0,175546.0,175546.0,175546.0,175546.0,175546.0,0.0
unique,,,,17,5853,,,360,352,,360,352,,,,,19,,,,19,,3,,,,,,,,,,
top,,,,WN,N483HA,,,ATL,"Chicago, IL",,ATL,"Chicago, IL",,,,,0600-0659,,,,2100-2159,,B,,,,,,,,,,
freq,,,,171999,465,,,49589,53491,,49993,53271,,,,,69533,,,,59084,,9042,,,,,,,,,,
mean,6.539261,15.745332,3.918582,,,2554.259215,12650.636674,,,12647.597195,,,1330.364027,1334.464129,14.29872,0.18865,,1484.386401,1460.502912,14.32704,,0.018708,,141.734477,136.522751,800.609836,3.676524,21.091879,3.891749,16.931175,0.094386,27.6127,
std,3.12592,8.779985,1.995633,,,1797.737418,1524.441845,,,1524.884829,,,493.761747,508.443395,48.033008,0.39123,,522.376045,544.157442,47.715076,,0.135492,,72.266621,72.526898,592.686126,2.333522,65.479342,32.324025,40.963905,3.539018,53.249035,
min,1.0,1.0,1.0,,,1.0,10135.0,,,10135.0,,,1.0,1.0,0.0,0.0,,1.0,1.0,0.0,,0.0,,20.0,16.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,4.0,8.0,2.0,,,1025.0,11292.0,,,11292.0,,,911.0,913.0,0.0,0.0,,1100.0,1044.0,0.0,,0.0,,90.0,84.0,369.0,2.0,0.0,0.0,0.0,0.0,0.0,
50%,7.0,16.0,4.0,,,2154.0,12889.0,,,12889.0,,,1320.0,1326.0,0.0,0.0,,1514.0,1501.0,0.0,,0.0,,123.0,118.0,640.0,3.0,0.0,0.0,2.0,0.0,3.0,
75%,9.0,23.0,6.0,,,3911.0,13931.0,,,13931.0,,,1738.0,1747.0,7.0,0.0,,1920.0,1917.0,7.0,,0.0,,171.0,167.0,1032.0,5.0,18.0,0.0,20.0,0.0,34.0,



Percentage of missing values: 


MONTH                    0.000000
DAY_OF_MONTH             0.000000
DAY_OF_WEEK              0.000000
OP_UNIQUE_CARRIER        0.000000
TAIL_NUM                 0.237394
OP_CARRIER_FL_NUM        0.000000
ORIGIN_AIRPORT_ID        0.000000
ORIGIN                   0.000000
ORIGIN_CITY_NAME         0.000000
DEST_AIRPORT_ID          0.000000
DEST                     0.000000
DEST_CITY_NAME           0.000000
CRS_DEP_TIME             0.000000
DEP_TIME                 1.802268
DEP_DELAY_NEW            1.802375
DEP_DEL15                1.802375
DEP_TIME_BLK             0.000000
CRS_ARR_TIME             0.000000
ARR_TIME                 1.902786
ARR_DELAY_NEW            2.134192
ARR_TIME_BLK             0.000000
CANCELLED                0.000000
CANCELLATION_CODE       98.129187
CRS_ELAPSED_TIME         0.002246
ACTUAL_ELAPSED_TIME      2.134192
DISTANCE                 0.000000
DISTANCE_GROUP           0.000000
CARRIER_DELAY           81.228139
WEATHER_DELAY           81.228139
NAS_DELAY     

In [5]:
# columns to drop : Drop the columns with > 80% null values
df.drop(columns = ["CANCELLATION_CODE", "LATE_AIRCRAFT_DELAY" , "Unnamed: 32", "SECURITY_DELAY", "CARRIER_DELAY", "NAS_DELAY", "WEATHER_DELAY"], inplace = True)


In [6]:
df.isnull().any()

MONTH                  False
DAY_OF_MONTH           False
DAY_OF_WEEK            False
OP_UNIQUE_CARRIER      False
TAIL_NUM                True
OP_CARRIER_FL_NUM      False
ORIGIN_AIRPORT_ID      False
ORIGIN                 False
ORIGIN_CITY_NAME       False
DEST_AIRPORT_ID        False
DEST                   False
DEST_CITY_NAME         False
CRS_DEP_TIME           False
DEP_TIME                True
DEP_DELAY_NEW           True
DEP_DEL15               True
DEP_TIME_BLK           False
CRS_ARR_TIME           False
ARR_TIME                True
ARR_DELAY_NEW           True
ARR_TIME_BLK           False
CANCELLED              False
CRS_ELAPSED_TIME        True
ACTUAL_ELAPSED_TIME     True
DISTANCE               False
DISTANCE_GROUP         False
dtype: bool

In [7]:
# df.select_dtypes(exclude=["object", "int"]).count()
df.loc[:, df.isnull().any()].count()

TAIL_NUM               932935
DEP_TIME               918301
DEP_DELAY_NEW          918300
DEP_DEL15              918300
ARR_TIME               917361
ARR_DELAY_NEW          915197
CRS_ELAPSED_TIME       935134
ACTUAL_ELAPSED_TIME    915197
dtype: int64

In [8]:
# Drop the rest of the null rows

'''
Since, rest of the null values are very lest ~2%, it is better to drop these rows. 
But still to be sure, we can check the information loss.
'''

rows_after_dropping_null = df.dropna(axis=0).shape[0]
rows_before_dropping_null = df.shape[0]

perc_info_loss = (rows_before_dropping_null - rows_after_dropping_null)*100/rows_before_dropping_null
print("Percentage information loss after dropping null values is {:.3f}%".format(perc_info_loss))

Percentage information loss after dropping null values is 2.134%


In [9]:
# for col in df.select_dtypes(exclude=["object", "int"]).columns:
#     df[col] = df[col].fillna(method='ffill') # Attention car plusieurs compagnies
#     df[col] = df[col].fillna(method='bfill') #

# df.select_dtypes(exclude=["object", "int"]).isnull().any()

In [10]:
for col in df.loc[:, df.isnull().any()].iloc[:, 1:].columns:
    df[[col]] = df[[col]].apply(lambda x: x.fillna(x.median()))
    
df.loc[:, df.isnull().any()].count().any()


True

In [11]:
df.dropna(axis=0, inplace=True)
df.isnull().any()

MONTH                  False
DAY_OF_MONTH           False
DAY_OF_WEEK            False
OP_UNIQUE_CARRIER      False
TAIL_NUM               False
OP_CARRIER_FL_NUM      False
ORIGIN_AIRPORT_ID      False
ORIGIN                 False
ORIGIN_CITY_NAME       False
DEST_AIRPORT_ID        False
DEST                   False
DEST_CITY_NAME         False
CRS_DEP_TIME           False
DEP_TIME               False
DEP_DELAY_NEW          False
DEP_DEL15              False
DEP_TIME_BLK           False
CRS_ARR_TIME           False
ARR_TIME               False
ARR_DELAY_NEW          False
ARR_TIME_BLK           False
CANCELLED              False
CRS_ELAPSED_TIME       False
ACTUAL_ELAPSED_TIME    False
DISTANCE               False
DISTANCE_GROUP         False
dtype: bool

## Normalize dataset 

Let's normalize our dataset before going anywhere:

In [12]:
df["DEP_DELAY"] = df["DEP_DELAY_NEW"].apply(lambda x : 1 if x > 0 else 0)
df["ARR_DELAY"] = df["ARR_DELAY_NEW"].apply(lambda x : 1 if x > 0 else 0)
df.loc[:, ["DEP_DELAY_NEW", "ARR_DELAY_NEW"]] = df.select_dtypes(exclude = ["object"]).loc[:, ["DEP_DELAY_NEW", "ARR_DELAY_NEW"]]\
                                                  .apply(lambda x: np.where(x > 0, np.log(x), 0))

df.head()


divide by zero encountered in log



Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,ARR_DELAY
477919,1,1,2,AA,N9677W,2297,12278,ICT,"Wichita, KS",11298,DFW,"Dallas/Fort Worth, TX",600,552.0,0.0,0.0,0600-0659,735,721.0,0.0,0700-0759,0.0,95.0,89.0,328.0,2,0,0
14315,1,7,1,WN,N7874B,1827,14107,PHX,"Phoenix, AZ",11292,DEN,"Denver, CO",1230,1226.0,0.0,0.0,1200-1259,1420,1407.0,0.0,1400-1459,0.0,110.0,101.0,602.0,3,0,0
402319,1,18,5,B6,N334JB,102,12478,JFK,"New York, NY",10792,BUF,"Buffalo, NY",1837,2017.0,4.60517,1.0,1800-1859,2012,2201.0,4.691348,2000-2059,0.0,95.0,104.0,301.0,2,1,1
356635,1,11,5,OO,N259SY,4027,10713,BOI,"Boise, ID",14869,SLC,"Salt Lake City, UT",900,858.0,0.0,0.0,0900-0959,1024,1018.0,0.0,1000-1059,0.0,84.0,80.0,290.0,2,0,0
325924,1,29,2,DL,N977DL,1778,10397,ATL,"Atlanta, GA",12278,ICT,"Wichita, KS",2009,2005.0,0.0,0.0,2000-2059,2134,2139.0,1.609438,2100-2159,0.0,145.0,154.0,782.0,4,0,1


In [13]:
# import math

# for index, row in df.iterrows():
#     df.at[index, 'CRS_DEP_TIME'] = math.floor(row['CRS_DEP_TIME'] / 100)

# Définir une fonction pour formater les valeurs en heures
def format_heure(valeur):
    heure = str(valeur // 100).zfill(2)
    minute = str(valeur % 100).zfill(2)
    return "{} h {}".format(heure, minute)

# Définir une fonction pour convertir en l'heure en minutes 
def format_minutes(format_heure):
    minutes = format_heure // 100 * 60 + format_heure % 100
    return minutes

# Appliquer la fonction à ces colonnes

df['DEP_TIME'] = df['DEP_TIME'].astype(int).apply(format_minutes)
df['ARR_TIME'] = df['ARR_TIME'].astype(int).apply(format_minutes)
df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].apply(format_minutes)
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].apply(format_minutes)

# Afficher le DataFrame résultant
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY_NEW,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,ARR_DELAY
477919,1,1,2,AA,N9677W,2297,12278,ICT,"Wichita, KS",11298,DFW,"Dallas/Fort Worth, TX",360,352,0.0,0.0,0600-0659,455,441,0.0,0700-0759,0.0,95.0,89.0,328.0,2,0,0
14315,1,7,1,WN,N7874B,1827,14107,PHX,"Phoenix, AZ",11292,DEN,"Denver, CO",750,746,0.0,0.0,1200-1259,860,847,0.0,1400-1459,0.0,110.0,101.0,602.0,3,0,0
402319,1,18,5,B6,N334JB,102,12478,JFK,"New York, NY",10792,BUF,"Buffalo, NY",1117,1217,4.60517,1.0,1800-1859,1212,1321,4.691348,2000-2059,0.0,95.0,104.0,301.0,2,1,1
356635,1,11,5,OO,N259SY,4027,10713,BOI,"Boise, ID",14869,SLC,"Salt Lake City, UT",540,538,0.0,0.0,0900-0959,624,618,0.0,1000-1059,0.0,84.0,80.0,290.0,2,0,0
325924,1,29,2,DL,N977DL,1778,10397,ATL,"Atlanta, GA",12278,ICT,"Wichita, KS",1209,1205,0.0,0.0,2000-2059,1294,1299,1.609438,2100-2159,0.0,145.0,154.0,782.0,4,0,1


In [14]:
# Get the list of all column names from headers
column_headers = df.columns.tolist()
print("The Column Header :", column_headers, f"\nTotal: {len(list(df.columns))} columns")

The Column Header : ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST', 'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_TIME_BLK', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW', 'ARR_TIME_BLK', 'CANCELLED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'DISTANCE_GROUP', 'DEP_DELAY', 'ARR_DELAY'] 
Total: 28 columns


In [15]:
# Choose the columns you want to have as your features
features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST', 'DEST_CITY_NAME', 
                 'CRS_DEP_TIME', 'DEP_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 
                 'DISTANCE_GROUP'
                ]

X = df.loc[:,features_list] # We add feature_list into our loc 


Using <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html" target="_blank">`select_dtypes`</a> from Pandas, filter out all categorical variables. 

In [16]:
df_num = X.select_dtypes(exclude = ["object"]); df_num.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,CRS_DEP_TIME,DEP_TIME,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP
477919,1,1,2,2297,12278,11298,360,352,95.0,89.0,328.0,2
14315,1,7,1,1827,14107,11292,750,746,110.0,101.0,602.0,3
402319,1,18,5,102,12478,10792,1117,1217,95.0,104.0,301.0,2
356635,1,11,5,4027,10713,14869,540,538,84.0,80.0,290.0,2
325924,1,29,2,1778,10397,12278,1209,1205,145.0,154.0,782.0,4


## ACP

Dans le sous-module `decomposition`, nous allons importer la fonction [`PCA()`](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html). Le fonctionnement de celle-ci est similaire à toutes les autres dans `scikit-learn`. 

1. On créé d'abord un objet (nommé ici `pca`) qui va contenir le résultat de l'ACP. Dans la fonction `PCA()`, on pourra aussi indiquer les paramètres tels que le nombre de composants à retenir (`n_components` - ici on garde tout).
2. Ensuite, on ajuste (*fit*  en anglais) sur des données avec la fonction `fit()` de l'objet précédemment créé. Dans cette fonction, nous devons donc passer les données à utiliser (variables uniquement quantitatives).

Si l'on souhaite une ACP normée (ce qui est notre cas ici), il nous faut standardiser les données en amont, avec par exemple la fonction `scale()` du sous-module `preprocessing` (importée aussi ici). Bien évidemment, il ne faut passer que des variables quantitatives, sans données manquantes.

In [17]:
# Import Standard Scaler
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
# StandardScaler will substract mean and divide by standard deviation to each observation
sc = StandardScaler()

# Apply StandardScaler to X
X = sc.fit_transform(df_num)

# Visualize first five rows 
# Standard scaler transform X as numpy array. Therefore you can't use .head()
X[:5]

array([[-1.77333308, -1.67979903, -0.96163814, -0.14372574, -0.24439376,
        -0.88488484, -1.51131798, -1.52321183, -0.64618508, -0.65625343,
        -0.79688066, -0.71797481],
       [-1.77333308, -0.99628629, -1.46268217, -0.40514008,  0.95532045,
        -0.88881917, -0.19910032, -0.22072054, -0.43864192, -0.48930527,
        -0.33462805, -0.28946774],
       [-1.77333308,  0.25682039,  0.54149393, -1.3645863 , -0.11320576,
        -1.21667999,  1.03573014,  1.33631854, -0.64618508, -0.44756822,
        -0.8424311 , -0.71797481],
       [-1.77333308, -0.54061114,  0.54149393,  0.81850149, -1.27093982,
         1.45669719, -0.90567906, -0.90833015, -0.7983834 , -0.78146456,
        -0.86098869, -0.71797481],
       [-1.77333308,  1.50992708, -0.96163814, -0.43239391, -1.47821685,
        -0.24227762,  1.34527892,  1.29664876,  0.04562547,  0.24804915,
        -0.03095845,  0.13903933]])

In [18]:
# import PCA 
from sklearn.decomposition import PCA

# Instanciate PCA 
# We can even specify how many components we want within the class
pca = PCA()

# Fit pca to our model 
PC = pca.fit_transform(X)

# Visualize first five rows
PC[:5]

array([[-1.36599007e+00,  2.20431502e+00, -2.05903150e+00,
        -1.16220271e+00,  2.37413054e-01,  1.33954353e+00,
        -4.18127117e-01, -6.87021970e-02, -4.69843106e-02,
         4.71705695e-02,  1.93671066e-02, -8.49525743e-02],
       [-6.67855781e-01,  4.01397809e-01, -2.36826890e+00,
        -8.50343601e-02, -1.83516687e-01,  9.80963288e-01,
        -1.26453934e+00,  1.50020574e-01,  1.21662625e-01,
        -6.06677954e-02, -1.91702596e-02, -3.95320117e-04],
       [-1.11767216e+00, -1.61123051e+00, -6.47566265e-01,
        -6.44571069e-01, -1.20522100e+00,  1.03324171e+00,
        -1.17625380e+00, -1.67726777e+00, -3.77962963e-01,
        -8.41750876e-04, -5.94761343e-02, -1.31261588e-01],
       [-1.65177762e+00,  1.18327592e+00, -1.99051010e-01,
        -5.49962030e-01, -7.09636813e-01,  1.86796005e+00,
         1.93643453e+00,  2.40894755e-01,  7.88515526e-02,
        -4.63414792e-02, -6.89257248e-02, -5.67580406e-02],
       [ 1.54864938e-01, -1.92006754e+00, -9.0550390

In [19]:
# Here are our eigen vectors
pca.components_

array([[-3.86204183e-03,  1.11069260e-03,  3.57438934e-03,
        -2.09851718e-01,  5.49310383e-02,  6.40326813e-02,
        -1.37398362e-02, -1.56474549e-02,  4.88100186e-01,
         4.82490072e-01,  4.89880143e-01,  4.87197174e-01],
       [ 1.52251378e-03, -6.59554732e-04, -7.77790625e-03,
         7.55430801e-03,  4.40223533e-02, -4.80696841e-02,
        -7.05485332e-01, -7.05371919e-01, -7.34578256e-03,
        -8.02641429e-03, -1.17180602e-02, -1.07808601e-02],
       [ 3.61768289e-01,  3.39048126e-01,  7.88701658e-01,
         3.43167192e-01, -7.79170085e-02, -4.37509689e-02,
        -3.87049596e-03, -4.65918961e-03,  4.40296948e-02,
         4.23055917e-02,  3.61603588e-02,  3.60305492e-02],
       [ 1.51287893e-01,  2.15878046e-01,  4.81639931e-03,
        -1.12318844e-01,  7.88975383e-01,  5.33160702e-01,
         6.94913313e-03,  7.50748754e-03, -6.10660530e-02,
        -6.06361655e-02, -4.55526298e-02, -3.92674217e-02],
       [ 5.62735416e-01, -8.09415332e-01,  8.1803148

### Variance expliquée

L'objet `pca` comprend maintenant un certain nombre d'objets et de fonctions. Le premier objet est le tableau des variances expliquées (`explained_variance_` - proportionnelles aux valeurs propres) par chaque dimension, et le ratio (proportion) de variance expliquée par dimension (`explained_variance_ratio_`).

In [20]:
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

[4.08010685 1.95049612 1.04243661 0.99573373 0.99481532 0.99335021
 0.98143488 0.82656458 0.0559689  0.05124435 0.01829051 0.0095708 ]
[0.34000854 0.16254117 0.08686962 0.08297772 0.08290119 0.0827791
 0.08178615 0.06888031 0.00466407 0.00427036 0.00152421 0.00079757]


Bien évidemment, il est possible (et préférable) de faire un tableau récapitulatif, avec les variances expliquées, les proportions de variance expliquée simples et cumulées. Voici un petit code permettant de faire ceci.

In [21]:
eig = pd.DataFrame(
    {
        "Dimension" : ["Dim" + str(x + 1) for x in range(12)], 
        "Variance expliquée" : pca.explained_variance_,
        "% variance expliquée" : np.round(pca.explained_variance_ratio_ * 100, decimals=2),
        "% cum. var. expliquée" : np.round(np.cumsum(pca.explained_variance_ratio_ * 100), decimals=2)
    }
)
eig

Unnamed: 0,Dimension,Variance expliquée,% variance expliquée,% cum. var. expliquée
0,Dim1,4.080107,34.0,34.0
1,Dim2,1.950496,16.25,50.25
2,Dim3,1.042437,8.69,58.94
3,Dim4,0.995734,8.3,67.24
4,Dim5,0.994815,8.29,75.53
5,Dim6,0.99335,8.28,83.81
6,Dim7,0.981435,8.18,91.99
7,Dim8,0.826565,6.89,98.87
8,Dim9,0.055969,0.47,99.34
9,Dim10,0.051244,0.43,99.77


## Model I : Classification

In [22]:
 # Separate target variable Y from features X
## Choose the columns you want to have as your features

features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST', 
                 'DEST_CITY_NAME','CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 
                 'DISTANCE', 
                ]

target = ['DEP_DELAY']

print("Splitting dataset into X and y...")
X = df.loc[:, features_list] # We add feature_list into our loc 
y = df.loc[:, target] # We set a target variable for the molel 
print("...Done.")

Splitting dataset into X and y...
...Done.


In [23]:
# Divide dataset Train set & Test set 
## First we import train_test_split


print("Splitting dataset into train set and test set...")
## Then we use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                          random_state=0,
                                                          stratify=y
                                                   )

print("...Done.")   

Splitting dataset into train set and test set...
...Done.


In [24]:
y_train.value_counts()*100/len(y_train)


DEP_DELAY
0            65.838813
1            34.161187
Name: count, dtype: float64

In [25]:
X_train.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE
188991,1,30,3,EV,4098,12266,IAH,11267,DAY,"Dayton, OH",1110,157.0,929.0
511443,1,22,2,WN,16,10693,BNA,11433,DTW,"Detroit, MI",855,90.0,456.0
387326,1,8,2,WN,1766,14893,SMF,14057,PDX,"Portland, OR",395,90.0,479.0
4039418,6,2,7,AA,579,13158,MAF,11298,DFW,"Dallas/Fort Worth, TX",1096,86.0,309.0
7536622,10,26,6,MQ,3841,13303,MIA,10599,BHM,"Birmingham, AL",946,119.0,661.0


In [26]:
# Encoding categorical features and standardizing numeric features
print("#### X_train BEFORE preprocessing ####")
print(X_train.head())
print()

print("Encoding categorical features and standardizing numerical features...")
## First let's import libraries
## StandardScaler to scale data (i.e apply Z-score)
## OneHotEncoder to encode categorical variables


numeric_features = ["MONTH","DAY_OF_MONTH","DAY_OF_WEEK","CRS_DEP_TIME","CRS_ELAPSED_TIME","DISTANCE"] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()


categorical_features = ["ORIGIN","DEST"]
categorical_transformer = OneHotEncoder(drop='first', handle_unknown = 'ignore')   # drop='first', handle_unknown = 'ignore'

# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)
print("...Done.")
print("#### X_train AFTER preprocessing ####")
print(X_train[0:5,:]) # print first 5 rows (not using iloc since now X_train became a numpy array)
print()

#### X_train BEFORE preprocessing ####
         MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER   
188991       1            30            3                EV  \
511443       1            22            2                WN   
387326       1             8            2                WN   
4039418      6             2            7                AA   
7536622     10            26            6                MQ   

         OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  DEST_AIRPORT_ID DEST   
188991                4098              12266    IAH            11267  DAY  \
511443                  16              10693    BNA            11433  DTW   
387326                1766              14893    SMF            14057  PDX   
4039418                579              13158    MAF            11298  DFW   
7536622               3841              13303    MIA            10599  BHM   

                DEST_CITY_NAME  CRS_DEP_TIME  CRS_ELAPSED_TIME  DISTANCE  
188991              Dayton, OH        

In [27]:
### Testing pipeline ###
print("--- Testing pipeline ---")

# Standardizing numeric features
print("Standardizing numerical features...")
print(X_test)
print()

X_test = feature_encoder.transform(X_test)

print("...Done.")
print(X_test[:5]) # print first 5 rows (not using iloc since now X_test became a numpy array)
print()

--- Testing pipeline ---
Standardizing numerical features...
         MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER   
2470697      4             4            4                AA  \
1094611      2            17            7                DL   
5604868      7            31            3                DL   
6411470      8            18            7                WN   
3939215      6             9            7                OH   
...        ...           ...          ...               ...   
8217725     11             6            3                AA   
2624193      4            15            1                DL   
4197121      6            10            1                DL   
2892144      4            21            7                AA   
1354214      3            29            5                MQ   

         OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  DEST_AIRPORT_ID DEST   
2470697               2755              14747    SEA            11298  DFW  \
1094611               1449

In [28]:
# # Train model
# print("Train model...")
# # This code creates a Logistic Regression classifier object and then fits it to the training data 
# # (X_train and Y_train). The print statement is simply a notification that the fitting process is complete.
# classifier = LogisticRegression(solver='liblinear', max_iter=1000) # 'liblinear' plus adapté sur la classif binaire
# classifier.fit(X_train, y_train.values.ravel())
# print("...Done.")

# Train models using different classifiers and print their performance score
sgdc = SGDClassifier(random_state=0)
dtc = DecisionTreeClassifier(random_state=0)
knc = KNeighborsClassifier(n_neighbors=2)

# Let's see what we get with the wisdom of the crowd
vc = VotingClassifier([('SGD', sgdc), 
                            ('Tree', dtc), 
                            ('KNN', knc)], voting = 'hard')

for model in (sgdc, knc, dtc,  vc):
    model.fit(X_train, y_train.values.ravel())
    print(model.__class__.__name__, model.score(X_test, y_test.values.ravel()))



SGDClassifier 0.6595696027954738
KNeighborsClassifier 0.654885469181545
DecisionTreeClassifier 0.6184735655510735
VotingClassifier 0.6661938466705493


**Bagging**

In [35]:
model = BaggingClassifier(base_estimator = KNeighborsClassifier)


In [30]:
# # Predictions on training set
# print("Predictions on training set...")
# y_train_pred = classifier.predict(X_train)
# print("...Done.")
# print(y_train_pred[:5]) # print first 5 rows (not using iloc since now y_train became a numpy array)
# print()

In [31]:
# # Predictions on test set
# print("Predictions on test set...")
# y_test_pred = classifier.predict(X_test)
# print("...Done.")
# print(y_test_pred[:5])
# print()

In [32]:
# # Print scores
# print("accuracy on training set : ", accuracy_score(y_train, y_train_pred))
# print("accuracy on test set : ", accuracy_score(y_test, y_test_pred))
# print()

# print("f1-score on training set : ", f1_score(y_train, y_train_pred))
# print("f1-score on test set : ", f1_score(y_test, y_test_pred))
# print()

In [33]:
# # You can also check more performance metrics to better understand what your model is doing
# print("Confusion matrix on train set : ")
# print(confusion_matrix(y_train, y_train_pred))
# print()
# print("Confusion matrix on test set : ")
# print(confusion_matrix(y_test, y_test_pred))
# print()

## Model II : GLM (Poisson Regression)

In [36]:
 # Separate target variable Y from features X
## Choose the columns you want to have as your features
features_list = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 
                 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_CITY_NAME_x', 'DEST_AIRPORT_ID', 'DEST', 
                 'DEST_CITY_NAME','CRS_DEP_TIME', 'DEP_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 
                 'DISTANCE', 'DISTANCE_GROUP', 'DISPLAY_AIRPORT_NAME', 'NAME'
                ]

target = ['DEP_DELAY_NEW']

print("Splitting dataset into X and y...")
X = df.loc[:, features_list] # We add feature_list into our loc 
y = df.loc[:, target] # We set a target variable for the molel 
print("...Done.")

Splitting dataset into X and y...


KeyError: "['ORIGIN_CITY_NAME_x', 'DISPLAY_AIRPORT_NAME', 'NAME'] not in index"

In [None]:
# Divide dataset Train set & Test set 
## First we import train_test_split


print("Splitting dataset into train set and test set...")
## Then we use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                          random_state=0
                                                   )

print("...Done.")   


Splitting dataset into train set and test set...
...Done.


In [None]:
# Encoding categorical features and standardizing numeric features
print("#### X_train BEFORE preprocessing ####")
print(X_train.head())
print()

print("Encoding categorical features and standardizing numerical features...")
## First let's import libraries
## StandardScaler to scale data (i.e apply Z-score)
## OneHotEncoder to encode categorical variables


numeric_features = ["MONTH","DAY_OF_MONTH","DAY_OF_WEEK","CRS_DEP_TIME","DEP_TIME","CRS_ELAPSED_TIME","ACTUAL_ELAPSED_TIME","DISTANCE"] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()


categorical_features = ["ORIGIN","DEST"]
categorical_transformer = OneHotEncoder(drop='first', handle_unknown = 'ignore')   # drop='first', handle_unknown = 'ignore'

# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)
print("...Done.")
print("#### X_train AFTER preprocessing ####")
print(X_train[0:5,:]) # print first 5 rows (not using iloc since now X_train became a numpy array)
print()

#### X_train BEFORE preprocessing ####
        MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER TAIL_NUM  \
166518      7            26            5                DL   N713TW   
364784      1             4            5                G4    314NV   
434001      5             3            5                AA   N771AN   
95693       9            26            4                DL   N345DN   
228223      4            12            5                DL   N106DU   

        OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN     ORIGIN_CITY_NAME_x  \
166518                520              12478    JFK           New York, NY   
364784                705              14679    SAN          San Diego, CA   
434001                102              12173    HNL           Honolulu, HI   
95693                2201              11433    DTW            Detroit, MI   
228223               2500              11298    DFW  Dallas/Fort Worth, TX   

        DEST_AIRPORT_ID DEST         DEST_CITY_NAME  CRS_DEP_TIME

In [None]:
### Testing pipeline ###
print("--- Testing pipeline ---")

# Standardizing numeric features
print("Standardizing numerical features...")
print(X_test)
print()

X_test = feature_encoder.transform(X_test)

print("...Done.")
print(X_test[:5]) # print first 5 rows (not using iloc since now X_test became a numpy array)
print()

In [None]:
# # Train model
# print("Train model...")
# # This code creates a Logistic Regression classifier object and then fits it to the training data 
# # (X_train and Y_train). The print statement is simply a notification that the fitting process is complete.
# regressor = LinearRegression() # This steps is the actual training !
# regressor.fit(X_train, y_train.values.ravel()) # .values.ravel()
# print("...Done.")

# Train models using different regressors and print their performance score
model_1 = SGDRegressor(random_state=0)
model_2 = DecisionTreeRegressor(random_state=0)
model_3 = KNeighborsRegressor(n_neighbors=2)

# Let's see what we get with the wisdom of the crowd
model_4 = VotingRegressor([('SGD', model_1), 
                            ('Tree', model_2), 
                            ('KNN', model_3)],
                            voting = 'mean')

for model in (model_1, model_2, model_3, model_4):
    model.fit(X_train, y_train.values.ravel())
    print(model.__class__.__name__, model.score(X_test, y_test.values.ravel()))

Train model...
...Done.


In [None]:
# # Predictions on training set
# print("Predictions on training set...")
# y_train_pred = regressor.predict(X_train)
# print("...Done.")
# print(y_train_pred[:5]) # print first 5 rows (not using iloc since now y_train became a numpy array)
# print()

Predictions on training set...
...Done.
[1.47846423 0.88514098 1.54095205 1.33061533 1.22666811]



In [None]:
# # Predictions on test set
# print("Predictions on test set...")
# y_test_pred = regressor.predict(X_test)
# print("...Done.")
# print(y_test_pred[:5])
# print()

--- Testing pipeline ---
Standardizing numerical features...
        MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER TAIL_NUM  \
153504      4            14            7                DL   N380DA   
544804      2            27            3                AA   N960NN   
233141      1             2            3                NK   N625NK   
84813       8             9            5                AS   N428AS   
54387       3            10            7                NK   N502NK   
...       ...           ...          ...               ...      ...   
485526      5             7            2                WN   N910WN   
494429      8            19            1                NK   N690NK   
759247      8             5            1                DL   N596NW   
438190      4            29            1                B6   N266JB   
615959      7             2            2                WN   N729SW   

        OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN  \
153504                913      

In [None]:
# # Performance assessment
# print("--- Assessing the performances of the model ---")

# # Print R^2 scores
# print("D² the deviance of GLM on training set : ", regressor.score(X_train, y_train.values.ravel()))
# print("D² the deviance of GLM on test set : ", regressor.score(X_test, y_test.values.ravel()))

--- Assessing the performances of the model ---
D² the deviance of GLM on training set :  0.04780535995495472
D² the deviance of GLM on test set :  0.048515045088213804
