This will be something of an initial look at making predictions on flight delays.

We can use a basic linear regression initially

In [1]:
import numpy as np
import pandas as pd
import pgaccess as pg

from sklearn.linear_model import LinearRegression

The columns available are:
* Date
* Carrier
* Tail Number
* Origin
* Destination
* Scheduled Departure Time
* Scheduled Arrival Time
* Number of Flights ???
* Distance

I'll start with the average delay on a route, the average delay for a carrier in that month, and plane speed for a given tail number.  
The important features to take from this list will be the scheduled departure time, scheduled elapsed time, and distance. I don't think both arrival and departure time will be needed.

Lets load the pre-prepared data

In [2]:
# Average delay on a route
routeDelay = pd.read_csv('../data/delays_per_airport_pair.csv', index_col=0)
routeDelay.fillna(0, inplace=True)
# Monthly carrier delay
carrierDelay = pd.read_csv('../data/monthly_carrier_delay.csv')
carrierDelay.set_index(['op_unique_carrier', 'month'], inplace=True)
# Plane Speed
planeSpeed = pd.read_csv('../data/plane_speed.csv')
planeSpeed.set_index(['tail_num'], inplace=True)

routeDelay.shape, carrierDelay.shape, planeSpeed.shape

((372, 372), (320, 4), (6481, 5))

Then we can get the flight data that's been provided

In [3]:
flightData = pg.get_test_data(100000)
flightData

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,target
0,2019-12-20,AA,AA_CODESHARE,AA,5121,OH,N503AE,5121,14100,PHL,...,13931,ORF,"Norfolk, VA",1045,1154,N,69.0,1.0,212.0,26.0
1,2019-12-20,AA,AA_CODESHARE,AA,5248,OH,N598NN,5248,15249,TLH,...,11057,CLT,"Charlotte, NC",1955,2129,N,94.0,1.0,386.0,8.0
2,2019-12-20,AA,AA_CODESHARE,AA,5288,OH,N528EG,5288,10257,ALB,...,11278,DCA,"Washington, DC",1658,1827,N,89.0,1.0,318.0,-21.0
3,2019-12-20,AA,AA_CODESHARE,AA,4755,PT,N648AE,4755,10135,ABE,...,14100,PHL,"Philadelphia, PA",1308,1354,N,46.0,1.0,55.0,59.0
4,2019-12-20,AA,AA_CODESHARE,AA,4871,PT,N650AE,4871,14100,PHL,...,10581,BGR,"Bangor, ME",1035,1215,N,100.0,1.0,473.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97668,2019-12-20,UA,UA_CODESHARE,UA,3445,YX,N656RW,3445,11618,EWR,...,14122,PIT,"Pittsburgh, PA",1620,1753,N,93.0,1.0,319.0,285.0
97669,2019-12-20,UA,UA_CODESHARE,UA,4040,EV,N16559,4040,11618,EWR,...,10431,AVL,"Asheville, NC",759,1014,N,135.0,1.0,583.0,-5.0
97670,2019-12-20,UA,UA_CODESHARE,UA,4243,EV,N608UX,4243,11042,CLE,...,13930,ORD,"Chicago, IL",610,645,N,95.0,1.0,315.0,17.0
97671,2019-12-20,UA,UA_CODESHARE,UA,4446,EV,N612UX,4446,12451,JAX,...,12266,IAH,"Houston, TX",1900,2039,N,159.0,1.0,817.0,-7.0


Add the columns which we want to include directly

In [9]:
modelData = flightData[['crs_dep_time', 'crs_elapsed_time', 'distance', 'target']].copy()
modelData

Unnamed: 0,crs_dep_time,crs_elapsed_time,distance,target
0,1045,69.0,212.0,26.0
1,1955,94.0,386.0,8.0
2,1658,89.0,318.0,-21.0
3,1308,46.0,55.0,59.0
4,1035,100.0,473.0,6.0
...,...,...,...,...
97668,1620,93.0,319.0,285.0
97669,759,135.0,583.0,-5.0
97670,610,95.0,315.0,17.0
97671,1900,159.0,817.0,-7.0


Get the expected delay for the city pair

In [56]:
def get_airport_flight_delay(r):
    if r.origin_airport_id not in routeDelay.index:
        return 0
    startCityDelays = routeDelay.loc[r.origin_airport_id]
    destCity = str(r.dest_airport_id)
    if destCity not in startCityDelays:
        return 0
    return startCityDelays[destCity]
# There must be a way to vectorize this
modelData['route_delay'] = flightData.apply(
    get_airport_flight_delay,
    axis=1
)
modelData

Unnamed: 0,crs_dep_time,crs_elapsed_time,distance,target,route_delay
0,1045,69.0,212.0,26.0,10.852325
1,1955,94.0,386.0,8.0,5.392123
2,1658,89.0,318.0,-21.0,6.400454
3,1308,46.0,55.0,59.0,3.212973
4,1035,100.0,473.0,6.0,1.538594
...,...,...,...,...,...
97668,1620,93.0,319.0,285.0,10.676174
97669,759,135.0,583.0,-5.0,13.597015
97670,610,95.0,315.0,17.0,7.279741
97671,1900,159.0,817.0,-7.0,10.969767


Get the carrier's normal delay for this month

In [46]:
y = modelData.target
X = modelData.drop(columns='target')

X.isna().any(axis=1)

crs_dep_time          0
crs_elapsed_time      0
distance              0
route_delay         891
dtype: int64

In [37]:
carrierDelay

Unnamed: 0,op_unique_carrier,month,mean_delay,flight_count
0,9E,1,3.846229,37344
1,9E,2,4.716917,35695
2,9E,3,2.283191,43656
3,9E,4,6.712606,41954
4,9E,5,1.303258,42195
...,...,...,...,...
315,ZW,8,9.679506,17354
316,ZW,9,4.777711,17117
317,ZW,10,-3.330630,17883
318,ZW,11,4.054967,17166
