In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

* Set pandas display options

In [27]:
# Raise the column limit so wide frames are shown in full; cap printed rows at 50.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 50)

In [4]:
# Raw-file URL of airlines.csv in a GitHub gist; the path contains a revision hash.
dataseturl = 'https://gist.githubusercontent.com/pfessas/5fc3d85d35201482c6cda4c63a837b48/raw/7f5e2b81f4a654aed7e89dd323558f72950c3ae2/airlines.csv'

In [5]:
# Read the CSV at `dataseturl` into the source frame; `df` is later derived from it.
raw = pd.read_csv(dataseturl)

In [29]:
# Peek at the first two rows to see the column layout.
raw.head(2)

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,8,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",79.0,12.0,5.34,0.59,2.88,0.0,3.19,0.0,0.0,909.0,384.0,70.0,250.0,0.0,205.0
1,2022,8,9E,Endeavor Air Inc.,ACK,"Nantucket, MA: Nantucket Memorial",124.0,11.0,4.19,0.0,4.49,0.0,2.32,1.0,1.0,675.0,176.0,0.0,231.0,0.0,268.0


In [30]:
# (number of rows, number of columns)
raw.shape

(33557, 21)

* Examine missingness

In [37]:
# Count of missing values in each column.
raw.isnull().sum()

year                    0
month                   0
carrier                 0
carrier_name            0
airport                 0
airport_name            0
arr_flights            27
arr_del15              37
carrier_ct             27
weather_ct             27
nas_ct                 27
security_ct            27
late_aircraft_ct       27
arr_cancelled          27
arr_diverted           27
arr_delay              27
carrier_delay          27
weather_delay          27
nas_delay              27
security_delay         27
late_aircraft_delay    27
dtype: int64

* Check all rows with NAs

In [46]:
# Boolean mask: True for every row with a missing value in at least one column.
na_mask = raw.isna().any(axis=1)
# Count the flagged rows with the vectorized Series.sum() instead of the builtin
# sum(), which would iterate over the mask one element at a time in Python.
int(na_mask.sum())

37

In [48]:
# First five of the rows flagged by na_mask (rows with at least one NA).
raw[na_mask].head(5)

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
3252,2022,7,YV,Mesa Airlines Inc.,HSV,"Huntsville, AL: Huntsville International-Carl ...",,,,,,,,,,,,,,,
7985,2022,4,OO,SkyWest Airlines Inc.,LNK,"Lincoln, NE: Lincoln Airport",,,,,,,,,,,,,,,
8412,2022,4,YV,Mesa Airlines Inc.,MLB,"Melbourne, FL: Melbourne International",1.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8439,2022,4,YV,Mesa Airlines Inc.,SHV,"Shreveport, LA: Shreveport Regional",1.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8479,2022,4,YX,Republic Airline,GRB,"Green Bay, WI: Green Bay Austin Straubel Inter...",,,,,,,,,,,,,,,


* Let's drop all NAs for now
* Alternatively we could have used **fillna()** to fill the NAs

In [49]:
# Keep only fully populated rows (the NA check above flagged 37 rows);
# dropna() returns a new frame, so `raw` stays as loaded.
df = raw.dropna()

In [50]:
# Total number of missing cells over all columns; 0 confirms no NAs remain.
df.isnull().sum().sum()

0

* Check datatypes

In [74]:
# Column dtypes before the casts below.
# NOTE(review): the recorded output already lists arr_flights as int32 although the
# cast cell comes later — the notebook appears to have been run out of order;
# restart the kernel and run all cells to refresh the outputs.
df.dtypes

year                     int64
month                    int64
carrier                 object
carrier_name            object
airport                 object
airport_name            object
arr_flights              int32
arr_del15              float64
carrier_ct             float64
weather_ct             float64
nas_ct                 float64
security_ct            float64
late_aircraft_ct       float64
arr_cancelled          float64
arr_diverted           float64
arr_delay              float64
carrier_delay          float64
weather_delay          float64
nas_delay              float64
security_delay         float64
late_aircraft_delay    float64
dtype: object

![](https://miro.medium.com/max/720/1*r-Qgom-7vHyGHYxO7K6Mxw.png)

* Let's cast certain columns to integers

In [69]:
# Column -> dtype mapping for everything that should be stored as int32:
# every '*_delay' column first, followed by year, month and arr_flights.
delay_cols = [name for name in df.columns if name.endswith('_delay')]
int_cols = dict.fromkeys(delay_cols + ['year', 'month', 'arr_flights'], 'int32')
int_cols

{'arr_delay': 'int32',
 'carrier_delay': 'int32',
 'weather_delay': 'int32',
 'nas_delay': 'int32',
 'security_delay': 'int32',
 'late_aircraft_delay': 'int32',
 'year': 'int32',
 'month': 'int32',
 'arr_flights': 'int32'}

In [88]:
# Apply the int32 mapping; astype returns a new frame, which replaces df.
# The cast relies on the NA rows having been dropped: NaN cannot be stored in int32.
df = df.astype(int_cols)

* Let's cast certain columns to category - pandas' efficient way of storing categorical data

In [91]:
# Columns from 'carrier' through 'airport_name' (label slices include both ends).
category_cols = list(df.loc[:, 'carrier':'airport_name'].columns)
print(*category_cols, sep='\n')
# Convert them all with one astype call that returns a new frame, instead of
# assigning into df column by column inside a Python loop.
df = df.astype(dict.fromkeys(category_cols, 'category'))

carrier
carrier_name
airport
airport_name


* Notice the 'int32' and 'category' dtypes - these are the columns we just changed

In [95]:
# Re-check the dtypes now that the int32 and category casts have been applied.
df.dtypes

year                      int32
month                     int32
carrier                category
carrier_name           category
airport                category
airport_name           category
arr_flights               int32
arr_del15               float64
carrier_ct              float64
weather_ct              float64
nas_ct                  float64
security_ct             float64
late_aircraft_ct        float64
arr_cancelled           float64
arr_diverted            float64
arr_delay                 int32
carrier_delay             int32
weather_delay             int32
nas_delay                 int32
security_delay            int32
late_aircraft_delay       int32
dtype: object

* In terms of memory usage :

In [103]:
# Deep memory footprint (object/string contents included) of both frames, in bytes.
# Note that df also has the NA rows removed, so it is slightly shorter than raw.
raw_bytes = raw.memory_usage(deep=True).sum()
df_bytes = df.memory_usage(deep=True).sum()
print(f'We managed to decrease memory usage by {(1 - (df_bytes/raw_bytes))*100:.2f}%')

We managed to decrease memory usage by 72.89%


* Let's see the numeric summary

In [106]:
# Summary statistics for the numeric columns only, transposed so that each
# column of df becomes one row of the summary.
df.describe(include='number').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,33520.0,2021.405459,0.490988,2021.0,2021.0,2021.0,2022.0,2022.0
month,33520.0,5.771152,3.192163,1.0,3.0,6.0,8.0,12.0
arr_flights,33520.0,312.983771,867.966963,1.0,42.0,89.0,213.0,18388.0
arr_del15,33520.0,58.517691,162.376932,0.0,6.0,15.0,41.0,3479.0
carrier_ct,33520.0,22.503472,57.559943,0.0,2.11,6.08,17.58,1147.0
weather_ct,33520.0,2.385413,8.296448,0.0,0.0,0.44,1.96,226.0
nas_ct,33520.0,14.045357,46.093798,0.0,0.56,2.9,8.69,1391.74
security_ct,33520.0,0.230956,1.032661,0.0,0.0,0.0,0.0,58.69
late_aircraft_ct,33520.0,19.352544,63.051497,0.0,1.0,3.75,11.7525,1531.81
arr_cancelled,33520.0,6.981623,30.501279,0.0,0.0,1.0,4.0,1565.0


### Next Steps

In [107]:
# First five rows of the cleaned, re-typed frame.
df.head() 

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,8,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",79,12.0,5.34,0.59,2.88,0.0,3.19,0.0,0.0,909,384,70,250,0,205
1,2022,8,9E,Endeavor Air Inc.,ACK,"Nantucket, MA: Nantucket Memorial",124,11.0,4.19,0.0,4.49,0.0,2.32,1.0,1.0,675,176,0,231,0,268
2,2022,8,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62,4.0,3.18,0.0,0.51,0.0,0.31,0.0,0.0,193,158,0,21,0,14
3,2022,8,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",162,15.0,5.36,1.31,4.52,0.0,3.81,1.0,0.0,1110,758,49,149,0,154
4,2022,8,9E,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",123,18.0,3.84,1.31,4.53,0.0,8.32,10.0,0.0,984,172,41,313,0,458


* 02 - Next steps
* Organize links by section
* Showcase selecting columns - x2 methods - query, loc
* Showcase filtering rows - x2 methods - use regex
* Instead of dropping NA, use median to fill - x2 single column, dictionary of columns - below dropping
* Create new column year-month-01 - make sure it's a date index
* Showcase assign, apply, insert, cut
* What day is the first day of the month
* Create quarter column - Split into quarters - datetime
* Extract State in a new column
* Create a new column with the normalized state name - use [this](https://worldpopulationreview.com/states/state-abbreviations)
* Rename columns with 'ct'; use 'count' instead of 'ct' - use a list comprehension for that
* Rename columns with 'arr'; use .rename function for that - dictionary - 'arrival'
* How many observations per airport have we got - what is the percentage of the total rows - use value_counts
* How many observations per airport, per year and month have we got - use grouping - use transform - "having"
* Output dataset into pkl


* 03 - Pandas Aggregations
* Import dataset
* Which airport tends to have the greatest delays? Please quantify
* Wide to long and vice versa - pivot


* 04 - Hands - On
* Can we handle the first 3 bullets using a custom function?
* Provide a "misery index" for airports. That is, sort the airports in descending order of the probability that a flight departing from that airport has a delay. Take care of outliers: some airports may have a preposterously low number of flights
* Then, go around your data again, but this time the criterion will be the average and median delay you may expect to have with an airline. Again we do not care about outliers.
* Provide a "misery index" for airlines. That is, do the same thing you did for the airports, but this time we are interested in the airlines that make life difficult for passengers. Sort the airlines in descending order of probability that a flight operated by the airline has a delay. This time we do not care about outliers.
* We are interested in the temporal distribution of delays, so create a plot that shows the number of flights and the number of delayed flights per month of year.
* Provide a table that shows, for each possible origin and destination, which airline has the best performance, in terms of mean departure delay. With this table at hand, show how you can determine the best airline for a particular pair of origin and destination airports.

In [123]:
# Scratch work for the "Next steps" list.
# Median of every column that contains NAs, keyed by column name, so it can be
# passed to fillna() as the alternative to dropping rows. Previously this value
# was computed but thrown away (neither assigned nor the cell's last expression).
na_cols = raw.columns[raw.isnull().any(axis=0)]
na_medians = raw[na_cols].median().to_dict()
# State abbreviation/name lookup table for the planned state columns.
# NOTE(review): column layout of this CSV is not visible here — inspect the preview.
state_abbr = pd.read_csv('https://worldpopulationreview.com/static/states/abbr-name.csv')
state_abbr.head()

In [140]:
# Number of rows per carrier code, most frequent first.
df.carrier.value_counts()

OO    4637
MQ    2836
G4    2520
DL    2484
9E    2157
AA    2099
WN    2081
UA    2019
YV    1949
F9    1938
OH    1867
YX    1681
AS    1540
B6    1253
NK    1058
QX     976
HA     425
Name: carrier, dtype: int64