# LIBRARIES IMPORTATION

In [2]:
# Data handling
import pandas as pd
import numpy as np


# Plotly plotting; figures are rendered through the "vscode" renderer
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import io
pio.renderers.default = "vscode"


# seaborn / matplotlib plotting plus notebook display and image helpers
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from PIL import Image
pal = sns.color_palette()  # palette object kept under a short name for reuse

# scikit-learn: preprocessing, pipelines, clustering, splitting, modelling, metrics
from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# NOTE(review): Pipeline is already imported a few lines above; this repeat is redundant
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

 # File reading and basic exploration 

In [28]:
# Read the flight-delay CSV into `dataset` (default comma separator) and
# lift pandas' cap on the number of columns shown when a frame is displayed.
pd.set_option("display.max_columns", None)
print("Loading dataset...")
dataset = pd.read_csv(
    filepath_or_buffer="./assets/flights_delays_from_jan2013_to_ jan2023.csv",
)
print("...Done.", end="\n\n")

Loading dataset...
...Done.



In [29]:
# First look at the raw frame: row count, preview, summary table and
# per-column missing-value rates.
print(f"Number of rows : {len(dataset)}", end="\n\n")

print("Display of dataset: ")
display(dataset.head())
print()

# include="all" makes describe() cover every column, not only numeric ones
print("Basics statistics: ")
data_desc = dataset.describe(include="all")
display(data_desc)
print()

# Null entries per column expressed as a percentage of the total row count
print("Percentage of missing values: ")
display(100 * dataset.isna().sum() / len(dataset))

Number of rows : 167110

Display of dataset: 


Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,1,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",14.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,648.0,0.0,647.0,1.0,0.0,0.0
1,2023,1,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",82.0,10.0,6.06,1.72,0.47,0.0,1.74,0.0,0.0,1800.0,1393.0,272.0,56.0,0.0,79.0
2,2023,1,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",60.0,8.0,3.54,1.59,2.71,0.0,0.16,2.0,4.0,484.0,176.0,184.0,113.0,0.0,11.0
3,2023,1,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",26.0,6.0,3.4,1.2,0.64,0.0,0.76,0.0,0.0,395.0,119.0,167.0,30.0,0.0,79.0
4,2023,1,9E,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",109.0,30.0,8.08,0.61,9.8,0.0,11.52,3.0,0.0,1591.0,402.0,22.0,333.0,0.0,834.0



Basics statistics: 


Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
count,167110.0,167110.0,167110,167110,167110,167110,166876.0,166669.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0,166876.0
unique,,,21,24,396,417,,,,,,,,,,,,,,,
top,,,OO,SkyWest Airlines Inc.,AUS,"Austin, TX: Austin - Bergstrom International",,,,,,,,,,,,,,,
freq,,,24534,24534,1512,1512,,,,,,,,,,,,,,,
mean,2017.9623,6.489019,,,,,365.4216,66.4275,20.537636,2.211416,19.683589,0.151449,23.761061,7.549756,0.8579,4170.705326,1395.480602,214.902365,929.705224,7.044206,1623.568596
std,2.944419,3.488475,,,,,997.753344,178.670761,49.076928,7.162642,62.784311,0.6931,71.894932,43.923477,3.742051,12240.947749,3995.171204,795.025358,3476.399194,40.384665,5018.119222
min,2013.0,1.0,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,3.0,,,,,51.0,6.0,2.15,0.0,1.02,0.0,1.23,0.0,0.0,333.0,110.0,0.0,35.0,0.0,65.0
50%,2018.0,7.0,,,,,102.0,17.0,6.42,0.37,4.0,0.0,5.0,1.0,0.0,1016.0,374.0,17.0,149.0,0.0,320.0
75%,2021.0,10.0,,,,,253.0,48.0,17.33,1.82,11.99,0.0,15.39,4.0,1.0,2879.0,1099.0,142.25,484.0,0.0,1067.0



Percentage of missing values: 


year                   0.000000
month                  0.000000
carrier                0.000000
carrier_name           0.000000
airport                0.000000
airport_name           0.000000
arr_flights            0.140028
arr_del15              0.263898
carrier_ct             0.140028
weather_ct             0.140028
nas_ct                 0.140028
security_ct            0.140028
late_aircraft_ct       0.140028
arr_cancelled          0.140028
arr_diverted           0.140028
arr_delay              0.140028
carrier_delay          0.140028
weather_delay          0.140028
nas_delay              0.140028
security_delay         0.140028
late_aircraft_delay    0.140028
dtype: float64

In [30]:
# Materialise the column labels once, then report them together with their count
list_columns = list(dataset.columns)
print("List of columns:")
print(list_columns)
print("Total: {} columns".format(len(list_columns)))


List of columns:
['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name', 'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
Total: 21 columns
