In [42]:
import pandas as pd   
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

## Data Loading and Inspection

# Read the airline delay records from the CSV file into a DataFrame
delays_df = pd.read_csv(r'data/airlines_delay.csv')
# Preview the first five rows (five is the default for head())
delays_df.head()

Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313.0,1296.0,141.0,DL,ATL,HOU,1,0
1,6948.0,360.0,146.0,OO,COS,ORD,4,0
2,1247.0,1170.0,143.0,B6,BOS,CLT,3,0
3,31.0,1410.0,344.0,US,OGG,PHX,6,0
4,563.0,692.0,98.0,FL,BMI,ATL,4,0


In [43]:
# Preview the last five rows (five is the default for tail())
delays_df.tail()

Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
539377,6973.0,530.0,72.0,OO,GEG,SEA,5,1
539378,1264.0,560.0,115.0,WN,LAS,DEN,4,1
539379,5209.0,827.0,74.0,EV,CAE,ATL,2,1
539380,607.0,715.0,65.0,WN,BWI,BUF,4,1
539381,6377.0,770.0,55.0,OO,CPR,DEN,2,1


In [44]:
delays_df.info()  # summarise columns: non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539382 entries, 0 to 539381
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Flight       539382 non-null  float64
 1   Time         539382 non-null  float64
 2   Length       539382 non-null  float64
 3   Airline      539382 non-null  object 
 4   AirportFrom  539382 non-null  object 
 5   AirportTo    539382 non-null  object 
 6   DayOfWeek    539382 non-null  int64  
 7   Class        539382 non-null  int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 32.9+ MB


### Manipulate and Clean Data

In [45]:
# Store the Flight column as integers instead of floats.
# Note: astype(int) drops any fractional part; the previewed values are whole numbers.
flight_numbers = delays_df['Flight']
delays_df['Flight'] = flight_numbers.astype(int)
delays_df.head()

Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313,1296.0,141.0,DL,ATL,HOU,1,0
1,6948,360.0,146.0,OO,COS,ORD,4,0
2,1247,1170.0,143.0,B6,BOS,CLT,3,0
3,31,1410.0,344.0,US,OGG,PHX,6,0
4,563,692.0,98.0,FL,BMI,ATL,4,0


In [46]:
# Give the terse source columns more descriptive names.
# rename() returns a new frame, which is reassigned here instead of using
# inplace=True; labels missing from the frame are ignored by default, so
# re-running this cell is harmless.
delays_df = delays_df.rename(columns={
    'Flight': 'Flight ID',
    'Time': 'Time of departure',
    'Length': 'Length of Flight',
})

In [47]:
# Convert 'Time of departure' from a number of minutes (presumably minutes after
# midnight -- confirm against the dataset description) to an 'HH:MM' string.
#
# The numeric-dtype guard keeps this cell safe to re-run: once the column holds
# 'HH:MM' strings, converting it a second time would fail.
if pd.api.types.is_numeric_dtype(delays_df['Time of departure']):
    # Vectorised conversion: scale minutes to seconds and anchor them at the Unix
    # epoch so the .dt accessor can format the clock time. This is one call for
    # the whole column instead of one pd.to_timedelta call per row.
    departure_seconds = delays_df['Time of departure'] * 60

    # Only hours and minutes are kept, so values of 1440 minutes or more wrap
    # around to the next day's clock time.
    delays_df['Time of departure'] = (
        pd.to_datetime(departure_seconds, unit='s').dt.strftime('%H:%M')
    )
delays_df.head(5)

Unnamed: 0,Flight ID,Time of departure,Length of Flight,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313,21:36,141.0,DL,ATL,HOU,1,0
1,6948,06:00,146.0,OO,COS,ORD,4,0
2,1247,19:30,143.0,B6,BOS,CLT,3,0
3,31,23:30,344.0,US,OGG,PHX,6,0
4,563,11:32,98.0,FL,BMI,ATL,4,0


In [48]:
# Store 'Length of Flight' as integers instead of floats.
# Note: astype(int) drops any fractional part; the previewed values are whole numbers.
flight_lengths = delays_df['Length of Flight']
delays_df['Length of Flight'] = flight_lengths.astype(int)
delays_df.head()

Unnamed: 0,Flight ID,Time of departure,Length of Flight,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313,21:36,141,DL,ATL,HOU,1,0
1,6948,06:00,146,OO,COS,ORD,4,0
2,1247,19:30,143,B6,BOS,CLT,3,0
3,31,23:30,344,US,OGG,PHX,6,0
4,563,11:32,98,FL,BMI,ATL,4,0


In [49]:
# Replace the numeric DayOfWeek codes with weekday names (1 = Monday ... 7 = Sunday).
DAY_NAMES = {
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
    7: 'Sunday',
}

# Series.map turns every value that is not a key of the dict into NaN, so running
# this cell again on the already-converted names would blank the whole column.
# Only map while the column still holds the numeric codes.
if pd.api.types.is_numeric_dtype(delays_df['DayOfWeek']):
    delays_df['DayOfWeek'] = delays_df['DayOfWeek'].map(DAY_NAMES)
delays_df.head(5)

Unnamed: 0,Flight ID,Time of departure,Length of Flight,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313,21:36,141,DL,ATL,HOU,Monday,0
1,6948,06:00,146,OO,COS,ORD,Thursday,0
2,1247,19:30,143,B6,BOS,CLT,Wednesday,0
3,31,23:30,344,US,OGG,PHX,Saturday,0
4,563,11:32,98,FL,BMI,ATL,Thursday,0


### Analyze Data

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
delays_df.describe()

Unnamed: 0,Flight ID,Length of Flight,Class
count,539382.0,539382.0,539382.0
mean,2427.927988,132.202104,0.445443
std,2067.4317,70.117045,0.497015
min,1.0,0.0,0.0
25%,712.0,81.0,0.0
50%,1809.0,115.0,0.0
75%,3745.0,162.0,1.0
max,7814.0,655.0,1.0
