# Condensing our dataframe to fit our needs
To make the data easier to analyze, we'll restrict it with the following constraints:
- Years: 2014-2021
- Top 10 High Risk Airports (ranked by delayed flight counts)
- Top 5 Airline Carriers

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats

In [90]:
# Read the refined delay records from disk, then print a schema summary
# (column names, non-null counts, dtypes) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="delays_transformed.csv")
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 317525 entries, 0 to 317524
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 317525 non-null  int64  
 1   month                317525 non-null  int64  
 2   carrier              317525 non-null  str    
 3   carrier_name         317525 non-null  str    
 4   airport              317525 non-null  str    
 5   airport_name         317525 non-null  str    
 6   arr_flights          317525 non-null  float64
 7   arr_del15            317525 non-null  float64
 8   carrier_ct           317525 non-null  float64
 9   weather_ct           317525 non-null  float64
 10  nas_ct               317525 non-null  float64
 11  security_ct          317525 non-null  float64
 12  late_aircraft_ct     317525 non-null  float64
 13  arr_cancelled        317525 non-null  float64
 14  arr_diverted         317525 non-null  float64
 15  arr_delay            317525 

In [91]:
# Restrict the analysis window to 2014 through 2021 (both endpoints included).
df = df.loc[df["year"].between(2014, 2021)]

# Sanity check: row counts per remaining year (shown as the cell output).
# To inspect a single year's rows instead, filter on it, e.g.
# df.loc[df["year"] == 2014]
df["year"].value_counts()


year
2019    20932
2021    19932
2020    18769
2018    17118
2014    13958
2015    13516
2017    12508
2016    12198
Name: count, dtype: int64

In [92]:
# Pick the airports to keep:
#   1. total the delayed flight counts (arr_del15) per year and airport,
#   2. order airports within each year from most to fewest delays,
#   3. keep the 6 highest-ranked airports of every year.
# The distinct airports across all years form the selection; on this data
# (2014-2021) that works out to 10 airports.
airports_per_year = 6

delays_by_year = (
    df.groupby(["year", "airport", "airport_name"], as_index=False)["arr_del15"]
    .sum()
    .sort_values(by=["year", "arr_del15"], ascending=[True, False])
    .groupby("year")
    .head(airports_per_year)
)

top_10 = delays_by_year["airport"].unique()

# Drop every row whose airport is not part of the selection.
df = df.loc[df["airport"].isin(top_10)]

# Row counts per retained airport (shown as the cell output).
df["airport"].value_counts()

airport
ATL    1152
LAX    1102
ORD    1081
LGA    1080
DFW    1066
EWR    1021
DEN    1021
CLT    1004
SFO     956
MCO     929
Name: count, dtype: int64