# LIBRARY IMPORTS

In [2]:
# --- Data handling ---
import pandas as pd
import numpy as np


# --- Plotly (interactive charts) ---
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import io
# Make "vscode" the default Plotly renderer.
# NOTE(review): presumably needs changing when the notebook runs outside VS Code — confirm.
pio.renderers.default = "vscode"


# --- Static plotting and rich display helpers ---
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from PIL import Image
# Keep the palette returned by seaborn's color_palette() (called with no arguments) for later reuse
pal = sns.color_palette()

# --- scikit-learn: preprocessing, pipelines, clustering, model and metric ---
from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# NOTE(review): Pipeline is already imported a few lines above; this second import is redundant.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # silence DeprecationWarning messages in cell outputs

 # File reading and basic exploration 

In [3]:
# Read the flight-delay CSV into `dataset` and lift pandas' limit on displayed columns
print("Loading dataset...")
csv_path = "./assets/flights_delays_from _jan2017_to_ jul2022.csv"
dataset = pd.read_csv(csv_path)  # read with the default separator; no sep=";" is passed
pd.set_option('display.max_columns', None)  # None -> show every column when a frame is displayed
print("...Done.", end="\n\n")  # same output as the message followed by an empty print()

Loading dataset...
...Done.



In [4]:
# Basic stats: row count, preview, summary statistics and missing-value rates
n_rows = dataset.shape[0]
print("Number of rows : {}".format(n_rows))
print()

# First rows of the frame, rendered through IPython's rich display
print("Display of dataset: ")
display(dataset.head())
print()

# include='all' also summarises the non-numeric columns (unique / top / freq rows)
print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

# Null entries per column, expressed as a percentage of the total row count
print("Percentage of missing values: ")
missing_counts = dataset.isnull().sum()
display(100 * missing_counts / n_rows)

Number of rows : 101315

Display of dataset: 


Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,7,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",33.0,2.0,0.92,1.0,0.08,0.0,0.0,0.0,0.0,129.0,98.0,23.0,8.0,0.0,0.0
1,2022,7,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",78.0,25.0,11.8,0.72,5.01,0.0,7.48,0.0,0.0,1664.0,887.0,52.0,224.0,0.0,501.0
2,2022,7,9E,Endeavor Air Inc.,ACK,"Nantucket, MA: Nantucket Memorial",124.0,19.0,5.84,1.0,6.76,0.0,5.4,5.0,4.0,1523.0,388.0,35.0,511.0,0.0,589.0
3,2022,7,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",67.0,10.0,1.32,1.0,2.4,1.0,4.28,0.0,1.0,657.0,103.0,82.0,93.0,25.0,354.0
4,2022,7,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",174.0,30.0,18.1,5.75,3.6,0.0,2.55,1.0,0.0,2462.0,1686.0,310.0,139.0,0.0,327.0



Basics statistics: 


Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
count,101315.0,101315.0,101315,101315,101315,101315,101157.0,100960.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0,101157.0
unique,,,19,20,384,384,,,,,,,,,,,,,,,
top,,,OO,SkyWest Airlines Inc.,BNA,"Nashville, TN: Nashville International",,,,,,,,,,,,,,,
freq,,,14919,14919,913,913,,,,,,,,,,,,,,,
mean,2019.516261,6.279633,,,,,336.033413,58.073217,18.457503,2.047227,17.234516,0.153487,20.067422,7.969137,0.773362,3817.083069,1319.266744,212.609785,838.294967,7.370395,1439.534031
std,1.558253,3.438409,,,,,942.283273,163.327,47.042014,6.878678,58.42791,0.734138,63.043181,50.943481,3.395803,11715.243158,3941.46165,810.274436,3390.448796,44.296998,4636.488256
min,2017.0,1.0,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2018.0,3.0,,,,,42.0,5.0,1.92,0.0,0.94,0.0,1.0,0.0,0.0,265.0,86.0,0.0,24.0,0.0,42.0
50%,2020.0,6.0,,,,,90.0,14.0,5.28,0.24,3.2,0.0,3.96,1.0,0.0,865.0,316.0,12.0,120.0,0.0,253.0
75%,2021.0,9.0,,,,,227.0,40.0,14.61,1.66,9.85,0.0,12.64,4.0,1.0,2520.0,990.0,132.0,401.0,0.0,916.0



Percentage of missing values: 


year                   0.000000
month                  0.000000
carrier                0.000000
carrier_name           0.000000
airport                0.000000
airport_name           0.000000
arr_flights            0.155949
arr_del15              0.350392
carrier_ct             0.155949
weather_ct             0.155949
nas_ct                 0.155949
security_ct            0.155949
late_aircraft_ct       0.155949
arr_cancelled          0.155949
arr_diverted           0.155949
arr_delay              0.155949
carrier_delay          0.155949
weather_delay          0.155949
nas_delay              0.155949
security_delay         0.155949
late_aircraft_delay    0.155949
dtype: float64

In [5]:
# Print the frame's structure: index range, column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101315 entries, 0 to 101314
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 101315 non-null  int64  
 1   month                101315 non-null  int64  
 2   carrier              101315 non-null  object 
 3   carrier_name         101315 non-null  object 
 4   airport              101315 non-null  object 
 5   airport_name         101315 non-null  object 
 6   arr_flights          101157 non-null  float64
 7   arr_del15            100960 non-null  float64
 8   carrier_ct           101157 non-null  float64
 9   weather_ct           101157 non-null  float64
 10  nas_ct               101157 non-null  float64
 11  security_ct          101157 non-null  float64
 12  late_aircraft_ct     101157 non-null  float64
 13  arr_cancelled        101157 non-null  float64
 14  arr_diverted         101157 non-null  float64
 15  arr_delay        

In [6]:
# Collect the column labels once, then print them together with their count
list_columns = list(dataset.columns)
n_columns = len(list_columns)
print('List of columns:')
print(list_columns)
print(f'Total: {n_columns} columns')


List of columns:
['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name', 'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
Total: 21 columns
