Flight Delay Distribution Prediction
==============

**Author:** *Nicolas Haase*

# 0 Environment Insights

In [1]:
from subprocess import check_output
from psutil import virtual_memory

# RAM
ram = virtual_memory()

total_ram_gb = ram.total / (1024 ** 3)
available_ram_gb = ram.available / (1024 ** 3)
used_ram_gb = ram.used / (1024 ** 3)

print(f"Total RAM: {total_ram_gb:.2f} GB")
print(f"Available RAM: {available_ram_gb:.2f} GB")
print(f"Used RAM: {used_ram_gb:.2f} GB")
print("RAM Usage Percentage:", ram.percent, "%")

# GPU
output = check_output(['system_profiler', 'SPDisplaysDataType'])

print(output.decode('utf-8'))

Total RAM: 16.00 GB
Available RAM: 5.64 GB
Used RAM: 8.76 GB
RAM Usage Percentage: 64.7 %
Graphics/Displays:

    Intel HD Graphics 630:

      Chipset Model: Intel HD Graphics 630
      Type: GPU
      Bus: Built-In
      VRAM (Dynamic, Max): 1536 MB
      Vendor: Intel
      Device ID: 0x591b
      Revision ID: 0x0004
      Automatic Graphics Switching: Supported
      gMux Version: 4.0.29 [3.2.8]
      Metal Support: Metal 3
      Displays:
        Color LCD:
          Display Type: Built-In Retina LCD
          Resolution: 2880 x 1800 Retina
          Framebuffer Depth: 24-Bit Color (ARGB8888)
          Main Display: Yes
          Mirror: Off
          Online: Yes
          Automatically Adjust Brightness: Yes
          Connection Type: Internal

    Radeon Pro 555:

      Chipset Model: Radeon Pro 555
      Type: GPU
      Bus: PCIe
      PCIe Lane Width: x8
      VRAM (Total): 2 GB
      Vendor: AMD (0x1002)
      Device ID: 0x67ef
      Revision ID: 0x00c7
      ROM Revision: 11

# 1 Setup

## 1.1 Used Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1.2 Data Import

In [3]:
# Load data into a DataFrame
df = pd.read_csv('../data/cleaned/flight_data_cleaned.csv')

## 1.3 Workspace Settings

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 2 Exploratory Data Analysis

## 2.1 General Data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13119923 entries, 0 to 13119922
Data columns (total 39 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Year                             int64  
 1   Month                            int64  
 2   DayofMonth                       int64  
 3   DayOfWeek                        int64  
 4   FlightDate                       object 
 5   Reporting_Airline                object 
 6   Tail_Number                      object 
 7   Flight_Number_Reporting_Airline  int64  
 8   OriginAirportID                  int64  
 9   Origin                           object 
 10  DestAirportID                    int64  
 11  Dest                             object 
 12  CRSDepTime                       int64  
 13  DepTime                          float64
 14  DepDelay                         float64
 15  DepDel15                         float64
 16  DepartureDelayGroups             float64
 17  DepTim

In [6]:
# Print the first ten rows to get an overview
df.head(10)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2021,8,5,4,2021-08-05,OH,N525AE,5574,10599,BHM,11057,CLT,1914,1906.0,-8.0,0.0,-1.0,1900-1959,17.0,1923.0,2128.0,9.0,2151,2137.0,-14.0,0.0,-1.0,2100-2159,97.0,91.0,65.0,1.0,350.0,2,,,,,
1,2021,8,6,5,2021-08-06,OH,N708PS,5574,10599,BHM,11057,CLT,1914,1948.0,34.0,1.0,2.0,1900-1959,14.0,2002.0,2159.0,23.0,2151,2222.0,31.0,1.0,2.0,2100-2159,97.0,94.0,57.0,1.0,350.0,2,0.0,0.0,0.0,0.0,31.0
2,2021,8,8,7,2021-08-08,OH,N712PS,5574,10599,BHM,11057,CLT,1914,1913.0,-1.0,0.0,-1.0,1900-1959,11.0,1924.0,2119.0,29.0,2151,2148.0,-3.0,0.0,-1.0,2100-2159,97.0,95.0,55.0,1.0,350.0,2,,,,,
3,2021,8,9,1,2021-08-09,OH,N503AE,5574,10599,BHM,11057,CLT,1914,1906.0,-8.0,0.0,-1.0,1900-1959,11.0,1917.0,2116.0,15.0,2151,2131.0,-20.0,0.0,-2.0,2100-2159,97.0,85.0,59.0,1.0,350.0,2,,,,,
4,2021,8,10,2,2021-08-10,OH,N513AE,5574,10599,BHM,11057,CLT,1914,2008.0,54.0,1.0,3.0,1900-1959,22.0,2030.0,2227.0,12.0,2151,2239.0,48.0,1.0,3.0,2100-2159,97.0,91.0,57.0,1.0,350.0,2,0.0,0.0,0.0,0.0,48.0
5,2021,8,11,3,2021-08-11,OH,N523AE,5574,10599,BHM,11057,CLT,1914,2004.0,50.0,1.0,3.0,1900-1959,9.0,2013.0,2213.0,32.0,2151,2245.0,54.0,1.0,3.0,2100-2159,97.0,101.0,60.0,1.0,350.0,2,0.0,0.0,29.0,0.0,25.0
6,2021,8,12,4,2021-08-12,OH,N719PS,5574,10599,BHM,11057,CLT,1914,2036.0,82.0,1.0,5.0,1900-1959,12.0,2048.0,2242.0,18.0,2151,2300.0,69.0,1.0,4.0,2100-2159,97.0,84.0,54.0,1.0,350.0,2,0.0,0.0,0.0,0.0,69.0
7,2021,8,13,5,2021-08-13,OH,N529EA,5574,10599,BHM,11057,CLT,1914,2032.0,78.0,1.0,5.0,1900-1959,26.0,2058.0,2256.0,24.0,2151,2320.0,89.0,1.0,5.0,2100-2159,97.0,108.0,58.0,1.0,350.0,2,0.0,0.0,70.0,0.0,19.0
8,2021,8,16,1,2021-08-16,OH,N537EA,5574,10599,BHM,11057,CLT,1914,2010.0,56.0,1.0,3.0,1900-1959,16.0,2026.0,2222.0,5.0,2151,2227.0,36.0,1.0,2.0,2100-2159,97.0,77.0,56.0,1.0,350.0,2,0.0,0.0,0.0,0.0,36.0
9,2021,8,18,3,2021-08-18,OH,N582NN,5574,10599,BHM,11057,CLT,1914,1918.0,4.0,0.0,0.0,1900-1959,17.0,1935.0,2135.0,11.0,2151,2146.0,-5.0,0.0,-1.0,2100-2159,97.0,88.0,60.0,1.0,350.0,2,,,,,


In [7]:
print(df.dtypes)

Year                                 int64
Month                                int64
DayofMonth                           int64
DayOfWeek                            int64
FlightDate                          object
Reporting_Airline                   object
Tail_Number                         object
Flight_Number_Reporting_Airline      int64
OriginAirportID                      int64
Origin                              object
DestAirportID                        int64
Dest                                object
CRSDepTime                           int64
DepTime                            float64
DepDelay                           float64
DepDel15                           float64
DepartureDelayGroups               float64
DepTimeBlk                          object
TaxiOut                            float64
WheelsOff                          float64
WheelsOn                           float64
TaxiIn                             float64
CRSArrTime                           int64
ArrTime    

In [8]:
# Print the columns
print(list(df))

['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate', 'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline', 'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk', 'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 'DistanceGroup', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']


In [9]:
df.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Flight_Number_Reporting_Airline,OriginAirportID,DestAirportID,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119920.0,13119923.0,13119920.0,13119920.0,2773032.0,2773032.0,2773032.0,2773032.0,2773032.0
mean,2022.087,6.549417,15.7365,3.976194,2425.201,12658.31,12658.81,1329.075,1334.131,12.61023,0.2125152,0.1790956,16.88747,1355.882,1457.809,7.946095,1487.969,1460.716,7.157917,0.2113604,-0.08960054,143.8493,138.3966,113.5631,1.0,822.7798,3.761607,26.31855,3.736387,11.63832,0.1497386,25.90805
std,0.703244,3.39837,8.761782,2.006024,1673.413,1528.991,1529.356,491.6193,507.2742,53.28615,0.4090874,2.301528,9.138675,509.4071,538.9261,6.607015,519.1987,544.2626,55.2752,0.4082734,2.445753,72.77083,72.65465,70.56383,0.0,598.624,2.343945,74.94007,30.93944,29.82068,3.703476,57.54176
min,2021.0,1.0,1.0,1.0,1.0,10135.0,10135.0,1.0,1.0,-96.0,0.0,-2.0,1.0,1.0,1.0,1.0,1.0,1.0,-119.0,0.0,-2.0,-85.0,14.0,8.0,1.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2022.0,4.0,8.0,2.0,1058.0,11292.0,11292.0,910.0,913.0,-5.0,0.0,-1.0,11.0,929.0,1043.0,4.0,1102.0,1045.0,-14.0,0.0,-1.0,90.0,85.0,62.0,1.0,391.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,2022.0,7.0,16.0,4.0,2124.0,12889.0,12889.0,1320.0,1324.0,-2.0,0.0,-1.0,15.0,1337.0,1458.0,6.0,1514.0,1501.0,-5.0,0.0,-1.0,127.0,122.0,96.0,1.0,672.0,3.0,6.0,0.0,0.0,0.0,0.0
75%,2023.0,9.0,23.0,6.0,3583.0,14027.0,14027.0,1735.0,1746.0,10.0,0.0,0.0,19.0,1800.0,1913.0,9.0,1923.0,1917.0,10.0,0.0,0.0,175.0,169.0,143.0,1.0,1056.0,5.0,25.0,0.0,15.0,0.0,30.0
max,2023.0,12.0,31.0,7.0,9887.0,16869.0,16869.0,2359.0,2400.0,4413.0,1.0,12.0,222.0,2400.0,2400.0,290.0,2400.0,2400.0,4405.0,1.0,12.0,697.0,764.0,727.0,1.0,5095.0,11.0,3957.0,2363.0,1740.0,1460.0,3581.0


In [10]:
print(df.isnull().sum())

Year                                      0
Month                                     0
DayofMonth                                0
DayOfWeek                                 0
FlightDate                                0
Reporting_Airline                         0
Tail_Number                               0
Flight_Number_Reporting_Airline           0
OriginAirportID                           0
Origin                                    0
DestAirportID                             0
Dest                                      0
CRSDepTime                                0
DepTime                                   0
DepDelay                                  0
DepDel15                                  0
DepartureDelayGroups                      0
DepTimeBlk                                0
TaxiOut                                   0
WheelsOff                                 0
WheelsOn                                  2
TaxiIn                                    2
CRSArrTime                      

In [11]:
print(df.shape)

(13119923, 39)


### Features

We see no missing values in our dataset besides the features CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay. However, those five categories only hold values if the flight is delayed.

**Feature overview and meaning:**
<table>
<tr>
<th>Feature name</th>
<th>Definition</th>
<th>Feature type</th>
</tr>
<tr>
<td>Year</td>
<td>Unique Identifier for each user</td>
<td>Int</td>
<tr>
</tr>
<tr>
<td>Month</td>
<td>Unique Identifier for each product</td>
<td>Object/String</td>
<tr>
</tr>
<tr>
<td>DayOfMonth</td>
<td>Sex of User</td>
<td>Object/String</td>
<tr>
<tr>
<td>DayOfWeek</td>
<td>Age of User in bins</td>
<td>Object/String</td>
<tr>
<tr>
<td>FlightDate</td>
<td>Occupation of User (masked)</td>
<td>Int</td>
<tr>
<tr>
<td>Reporting_Airline</td>
<td>City of User (A,B,C)</td>
<td>Object/String</td>
<tr>
<tr>
<td>Tail_Number</td>
<td>Number of years stay in current city</td>
<td>Object/String</td>
<tr>
<tr>
<td>Marital_Status</td>
<td>Marital Status of User</td>
<td>Int</td>
<tr>
<tr>
<td>Product_Category_1</td>
<td>Product Category (Masked)</td>
<td>Int</td>
<tr>
<tr>
<td>Product_Category_2</td>
<td>Additional Product Category (Masked)</td>
<td>Float</td>
<tr>
<tr>
<td>Product_Category_3</td>
<td>Additional Product Category (Masked)</td>
<td>Float</td>
<tr>
<tr>
<td>**Purchase**</td>
<td>Purchase Amount(target variable)</td>
<td>Int</td>
<tr>
</table>

In [12]:
# Get all unique values from the "Origin" column
num_unique_values = df['Origin'].nunique()
unique_values = sorted(df['Origin'].unique())

# Print the unique values
print("Number of unique values in the 'Origin' column:", num_unique_values)
print("Unique values in the 'Origin' column:")
for value in unique_values:
    print(value, end=", ")

Number of unique values in the 'Origin' column: 374
Unique values in the 'Origin' column:
ABE, ABI, ABQ, ABR, ABY, ACK, ACT, ACV, ACY, ADK, ADQ, AEX, AGS, AKN, ALB, ALO, ALS, ALW, AMA, ANC, APN, ASE, ATL, ATW, ATY, AUS, AVL, AVP, AZA, AZO, BDL, BET, BFF, BFL, BGM, BGR, BHM, BIH, BIL, BIS, BJI, BKG, BLI, BLV, BMI, BNA, BOI, BOS, BPT, BQK, BQN, BRD, BRO, BRW, BTM, BTR, BTV, BUF, BUR, BWI, BZN, CAE, CAK, CDB, CDC, CDV, CGI, CHA, CHO, CHS, CID, CIU, CKB, CLE, CLL, CLT, CMH, CMI, CMX, CNY, COD, COS, COU, CPR, CRP, CRW, CSG, CVG, CWA, CYS, DAB, DAL, DAY, DBQ, DCA, DDC, DEC, DEN, DFW, DHN, DIK, DLG, DLH, DRO, DRT, DSM, DTW, DVL, EAR, EAT, EAU, ECP, EGE, EKO, ELM, ELP, ERI, ESC, EUG, EVV, EWN, EWR, EYW, FAI, FAR, FAT, FAY, FCA, FLG, FLL, FLO, FNT, FOD, FSD, FSM, FWA, GCC, GCK, GEG, GFK, GGG, GJT, GNV, GPT, GRB, GRI, GRK, GRR, GSO, GSP, GST, GTF, GTR, GUC, GUM, HDN, HGR, HHH, HIB, HLN, HNL, HOB, HOU, HPN, HRL, HSV, HTS, HVN, HYA, HYS, IAD, IAG, IAH, ICT, IDA, ILG, ILM, IMT, IND, INL, ISP, ITH, 

## 2.2 Atlanta Airport Data

In [13]:
df_atl.info()

NameError: name 'df_atl' is not defined

In [None]:
print(df_atl.shape)

(1276177, 39)


In [None]:
# Print the first ten rows to get an overview
df_atl.head(10)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2021,8,21,6,2021-08-21,OH,N541EA,5622,10397,ATL,11278,DCA,1630,1628.0,-2.0,0.0,-1.0,1600-1659,25.0,1653.0,1814.0,3.0,1830,1817.0,-13.0,0.0,-1.0,1800-1859,120.0,109.0,81.0,1.0,547.0,3,,,,,
1,2021,8,28,6,2021-08-28,OH,N562NN,5622,10397,ATL,11278,DCA,1630,1624.0,-6.0,0.0,-1.0,1600-1659,18.0,1642.0,1756.0,3.0,1830,1759.0,-31.0,0.0,-2.0,1800-1859,120.0,95.0,74.0,1.0,547.0,3,,,,,
2,2021,8,21,6,2021-08-21,OH,N541EA,5622,11278,DCA,10397,ATL,1350,1348.0,-2.0,0.0,-1.0,1300-1359,30.0,1418.0,1535.0,7.0,1547,1542.0,-5.0,0.0,-1.0,1500-1559,117.0,114.0,77.0,1.0,547.0,3,,,,,
3,2021,8,28,6,2021-08-28,OH,N562NN,5622,11278,DCA,10397,ATL,1350,1344.0,-6.0,0.0,-1.0,1300-1359,17.0,1401.0,1525.0,8.0,1547,1533.0,-14.0,0.0,-1.0,1500-1559,117.0,109.0,84.0,1.0,547.0,3,,,,,
4,2021,8,21,6,2021-08-21,OO,N144SY,5274,12266,IAH,10397,ATL,950,947.0,-3.0,0.0,-1.0,0900-0959,17.0,1004.0,1244.0,10.0,1256,1254.0,-2.0,0.0,-1.0,1200-1259,126.0,127.0,100.0,1.0,689.0,3,,,,,
5,2021,8,21,6,2021-08-21,OO,N144SY,5651,10397,ATL,13930,ORD,1340,1335.0,-5.0,0.0,-1.0,1300-1359,23.0,1358.0,1434.0,7.0,1454,1441.0,-13.0,0.0,-1.0,1400-1459,134.0,126.0,96.0,1.0,606.0,3,,,,,
6,2021,8,20,5,2021-08-20,OO,N142SY,5336,13930,ORD,10397,ATL,1755,1748.0,-7.0,0.0,-1.0,1700-1759,17.0,1805.0,2033.0,4.0,2107,2037.0,-30.0,0.0,-2.0,2100-2159,132.0,109.0,88.0,1.0,606.0,3,,,,,
7,2021,8,26,4,2021-08-26,OO,N163SY,5284,10397,ATL,13930,ORD,645,647.0,2.0,0.0,0.0,0600-0659,11.0,658.0,734.0,12.0,759,746.0,-13.0,0.0,-1.0,0700-0759,134.0,119.0,96.0,1.0,606.0,3,,,,,
8,2021,8,28,6,2021-08-28,OO,N135SY,5284,10397,ATL,13930,ORD,645,744.0,59.0,1.0,3.0,0600-0659,14.0,758.0,825.0,13.0,759,838.0,39.0,1.0,2.0,0700-0759,134.0,114.0,87.0,1.0,606.0,3,39.0,0.0,0.0,0.0,0.0
9,2021,8,28,6,2021-08-28,OO,N204SY,5651,10397,ATL,13930,ORD,1340,1335.0,-5.0,0.0,-1.0,1300-1359,21.0,1356.0,1427.0,13.0,1454,1440.0,-14.0,0.0,-1.0,1400-1459,134.0,125.0,91.0,1.0,606.0,3,,,,,


In [None]:
print(df_atl.isnull().sum())

Year                                     0
Month                                    0
DayofMonth                               0
DayOfWeek                                0
FlightDate                               0
Reporting_Airline                        0
Tail_Number                              0
Flight_Number_Reporting_Airline          0
OriginAirportID                          0
Origin                                   0
DestAirportID                            0
Dest                                     0
CRSDepTime                               0
DepTime                                  0
DepDelay                                 0
DepDel15                                 0
DepartureDelayGroups                     0
DepTimeBlk                               0
TaxiOut                                  0
WheelsOff                                0
WheelsOn                                 0
TaxiIn                                   0
CRSArrTime                               0
ArrTime    

## 2.3 Top Five Airports Data

In [None]:
df_top_five.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7348076 entries, 0 to 13119922
Data columns (total 39 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Year                             int64  
 1   Month                            int64  
 2   DayofMonth                       int64  
 3   DayOfWeek                        int64  
 4   FlightDate                       object 
 5   Reporting_Airline                object 
 6   Tail_Number                      object 
 7   Flight_Number_Reporting_Airline  int64  
 8   OriginAirportID                  int64  
 9   Origin                           object 
 10  DestAirportID                    int64  
 11  Dest                             object 
 12  CRSDepTime                       int64  
 13  DepTime                          float64
 14  DepDelay                         float64
 15  DepDel15                         float64
 16  DepartureDelayGroups             float64
 17  DepTimeBlk  

In [None]:
print(df_top_five.shape)

(7348076, 39)


In [None]:
# Print the first ten rows to get an overview
df_top_five.head(10)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2021,8,5,4,2021-08-05,OH,N525AE,5574,10599,BHM,11057,CLT,1914,1906.0,-8.0,0.0,-1.0,1900-1959,17.0,1923.0,2128.0,9.0,2151,2137.0,-14.0,0.0,-1.0,2100-2159,97.0,91.0,65.0,1.0,350.0,2,,,,,
1,2021,8,6,5,2021-08-06,OH,N708PS,5574,10599,BHM,11057,CLT,1914,1948.0,34.0,1.0,2.0,1900-1959,14.0,2002.0,2159.0,23.0,2151,2222.0,31.0,1.0,2.0,2100-2159,97.0,94.0,57.0,1.0,350.0,2,0.0,0.0,0.0,0.0,31.0
2,2021,8,8,7,2021-08-08,OH,N712PS,5574,10599,BHM,11057,CLT,1914,1913.0,-1.0,0.0,-1.0,1900-1959,11.0,1924.0,2119.0,29.0,2151,2148.0,-3.0,0.0,-1.0,2100-2159,97.0,95.0,55.0,1.0,350.0,2,,,,,
3,2021,8,9,1,2021-08-09,OH,N503AE,5574,10599,BHM,11057,CLT,1914,1906.0,-8.0,0.0,-1.0,1900-1959,11.0,1917.0,2116.0,15.0,2151,2131.0,-20.0,0.0,-2.0,2100-2159,97.0,85.0,59.0,1.0,350.0,2,,,,,
4,2021,8,10,2,2021-08-10,OH,N513AE,5574,10599,BHM,11057,CLT,1914,2008.0,54.0,1.0,3.0,1900-1959,22.0,2030.0,2227.0,12.0,2151,2239.0,48.0,1.0,3.0,2100-2159,97.0,91.0,57.0,1.0,350.0,2,0.0,0.0,0.0,0.0,48.0


In [None]:
print(df_atl.isnull().sum())

Year                                     0
Month                                    0
DayofMonth                               0
DayOfWeek                                0
FlightDate                               0
Reporting_Airline                        0
Tail_Number                              0
Flight_Number_Reporting_Airline          0
OriginAirportID                          0
Origin                                   0
DestAirportID                            0
Dest                                     0
CRSDepTime                               0
DepTime                                  0
DepDelay                                 0
DepDel15                                 0
DepartureDelayGroups                     0
DepTimeBlk                               0
TaxiOut                                  0
WheelsOff                                0
WheelsOn                                 0
TaxiIn                                   0
CRSArrTime                               0
ArrTime    

In [None]:
print(df['Origin'].nunique())
# previous 374

374


In [None]:
print(df_top_five['Origin'].nunique())
# previous 319

319


In [None]:
print(df_top_five['Dest'].nunique())
# previous 319

319


# 3 Encoding

# 4 Feature Selection

## 4.1 Pearson Correclation Matrix

## 4.2 Feature Elimination

# 5 Sampling