# Condensing our dataframe to fit our needs
To simplify our data to make it easier to analyze, we'll be limiting our data with a few different constraints:
- Years: 2014-2021
- Top 10 High Risk Airports
- Top 5 Airline Carriers

In [191]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats

In [192]:
# loading the refined data
df = pd.read_csv("./csv/delays_transformed.csv")
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 317525 entries, 0 to 317524
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   year_month             317525 non-null  str    
 1   year                   317525 non-null  int64  
 2   month                  317525 non-null  int64  
 3   carrier                317525 non-null  str    
 4   carrier_name           317525 non-null  str    
 5   airport_code           317525 non-null  str    
 6   city                   317525 non-null  str    
 7   state                  317525 non-null  str    
 8   airport_full_name      317525 non-null  str    
 9   airport_name_cleansed  317525 non-null  str    
 10  arr_flights            317525 non-null  float64
 11  arr_del15              317525 non-null  float64
 12  carrier_ct             317525 non-null  float64
 13  weather_ct             317525 non-null  float64
 14  nas_ct                 317525 non-null  float64

In [193]:
# limits our dataframe to years 2014-2021
df = df[(df['year'] <= 2021) & (df['year'] >= 2014)]
# verifies the values of what years are included
df['year'].value_counts()
# change the number if you want to check for a specific year's data
# print(df.loc[df['year'] == 2014])


year
2019    20932
2021    19932
2020    18769
2018    17118
2014    13958
2015    13516
2017    12508
2016    12198
Name: count, dtype: int64

In [194]:
# sorts by year, airport, and the sums of delayed flight counts, sorts by year and descending delayed flight counts, and groups by year, getting the top 6 airports per year within 2014-2021 and compiling into a list of unique airports, which comes out to the top 10 most delayed airports
delays_by_year = (df
    .groupby(["year", "airport_code"])["arr_del15"].sum()
    .reset_index()
    .sort_values(["year", "arr_del15"], ascending=[True, False])
    .groupby("year")
    .head(6)
)

top_10 = delays_by_year['airport_code'].unique()

df = df[df["airport_code"].isin(top_10)]

df['airport_code'].value_counts()

airport_code
ATL    1152
LAX    1102
ORD    1081
LGA    1080
DFW    1066
EWR    1021
DEN    1021
CLT    1004
SFO     956
MCO     929
Name: count, dtype: int64

In [195]:
# grouping the carrier flights together by the sum of their delayed flight counts, and grabbing the top 5 carriers with the most delays
top_carriers = (df
    .groupby(['carrier'])["arr_del15"].sum()
    .reset_index()
    .head(5)
)

df = df[df["carrier"].isin(top_carriers['carrier'])]

df['carrier'].value_counts()


carrier
AA    960
DL    960
B6    917
AS    768
9E    257
Name: count, dtype: int64

In [196]:
df.info()

<class 'pandas.DataFrame'>
Index: 3862 entries, 8491 to 136475
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year_month             3862 non-null   str    
 1   year                   3862 non-null   int64  
 2   month                  3862 non-null   int64  
 3   carrier                3862 non-null   str    
 4   carrier_name           3862 non-null   str    
 5   airport_code           3862 non-null   str    
 6   city                   3862 non-null   str    
 7   state                  3862 non-null   str    
 8   airport_full_name      3862 non-null   str    
 9   airport_name_cleansed  3862 non-null   str    
 10  arr_flights            3862 non-null   float64
 11  arr_del15              3862 non-null   float64
 12  carrier_ct             3862 non-null   float64
 13  weather_ct             3862 non-null   float64
 14  nas_ct                 3862 non-null   float64
 15  security_ct    

# We reduced the rows of the csv file by almost 90%. The new csv file will be saved under "delays_reduced.csv"

NOTE: I renamed airport to airport_code and airport_name to airport_full_name for clarity, as well as added city, state, and the airport_name_cleansed (which is just the airport itself).

In [197]:
df.to_csv('./csv/delays_reduced.csv', index=False, encoding='utf-8')