# LIBRARIES IMPORT

In [3]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "vscode"

from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

 # File reading and basic exploration 

In [4]:
# Import dataset
# The CSV has several columns whose values do not share a single type. With the
# default chunked parsing pandas infers dtypes per chunk and emits a DtypeWarning
# ("Columns (...) have mixed types"). low_memory=False makes pandas parse the file
# in one pass so each column gets one consistently inferred dtype, at the cost of
# higher peak memory while loading.
print("Loading dataset...")
dataset = pd.read_csv("./assets/flight_data_2022.csv", low_memory=False)
print("...Done.")
print()

Loading dataset...



Columns (11,13,78,85,86,93) have mixed types. Specify dtype option on import or set low_memory=False.



...Done.



In [5]:
# First look at the loaded frame: row count, sample rows, summary statistics
# and the share of missing values per column.
n_rows = dataset.shape[0]
print("Number of rows : {}".format(n_rows), end="\n\n")

# Sample of the first rows
print("Display of dataset: ")
first_rows = dataset.head()
display(first_rows)
print()

# Summary statistics over every column (numeric and non-numeric alike)
print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

# Per-column null count expressed as a percentage of the total row count
print("Percentage of missing values: ")
missing_counts = dataset.isnull().sum()
missing_pct = 100 * missing_counts / n_rows
display(missing_pct)

Number of rows : 563737

Display of dataset: 


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
1,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
2,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
3,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
4,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,



Basics statistics: 


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
count,563737.0,563737.0,563737.0,563737.0,563737.0,563737,563737,563737,563737.0,563737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,563737,0.0
unique,,,,,,31,10,14,,10,...,,,,,,,,,1,
top,,,,,,2022-01-03,AA,WN,,AA,...,,,,,,,,,N,
freq,,,,,,20166,149453,97436,,149453,...,,,,,,,,,563737,
mean,2022.0,1.0,1.0,15.963577,4.044444,,,,19831.807753,,...,,,,,,,,,,
std,0.0,0.0,0.0,8.982503,2.086369,,,,271.869441,,...,,,,,,,,,,
min,2022.0,1.0,1.0,1.0,1.0,,,,19393.0,,...,,,,,,,,,,
25%,2022.0,1.0,1.0,8.0,2.0,,,,19790.0,,...,,,,,,,,,,
50%,2022.0,1.0,1.0,16.0,4.0,,,,19805.0,,...,,,,,,,,,,
75%,2022.0,1.0,1.0,24.0,6.0,,,,19977.0,,...,,,,,,,,,,



Percentage of missing values: 


Year                  0.0
Quarter               0.0
Month                 0.0
DayofMonth            0.0
DayOfWeek             0.0
                    ...  
Div5LongestGTime    100.0
Div5WheelsOff       100.0
Div5TailNum         100.0
Duplicate             0.0
Unnamed: 119        100.0
Length: 120, dtype: float64

In [6]:
# Bare expression: the notebook renders the DataFrame itself as this cell's output.
dataset

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
1,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
2,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
3,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
4,2022,1,1,6,4,2022-01-06,DL,DL,19790,DL,...,,,,,,,,,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563732,2022,1,1,24,1,2022-01-24,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
563733,2022,1,1,24,1,2022-01-24,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
563734,2022,1,1,24,1,2022-01-24,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
563735,2022,1,1,24,1,2022-01-24,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
