In [None]:
import pandas as pd
import numpy as np
import moving_average as ma
from datetime import timedelta
from datetime import datetime

In [None]:
# Load the training and test airport extracts and stack them into one frame
df_airport_train = pd.read_csv(
    '../Taxi time - eleven Data Challenge/0. Airport data/training_set_airport_data.csv'
)
df_airport_test = pd.read_csv(
    '../Taxi time - eleven Data Challenge/0. Airport data/test_set_airport_data.csv'
)
# ignore_index=True gives the combined frame a fresh 0..n-1 index
df_airport = pd.concat([df_airport_train, df_airport_test], ignore_index=True)

In [None]:
# Parse the timestamp columns into proper datetimes
datetime_columns = ['Flight Datetime', 'AOBT', 'ATOT']
df_airport[datetime_columns] = df_airport[datetime_columns].apply(pd.to_datetime)
# Taxi-out duration in seconds (ATOT - AOBT).
# total_seconds() is used instead of .dt.seconds: .dt.seconds is only the
# seconds *component* of the timedelta, so it silently drops whole days and
# turns negative durations into large positive values. The result is a float.
df_airport['TO'] = (df_airport['ATOT'] - df_airport['AOBT']).dt.total_seconds()

In [None]:
# Drop inconsistent records: keep only rows where AOBT is not later than ATOT
df_airport = df_airport.loc[df_airport['AOBT'] <= df_airport['ATOT']]

## 2-month moving average (baseline)

In [None]:
# Not to run: results were exported as 2M_MA_2015-2019.csv
# Inputs for the 2-month moving average

# Lower bound of each flight's look-back window: 60 days before the flight
df_airport['date-60'] = (
    df_airport['Flight Datetime'] - timedelta(days=60)
)
# Separate copy of the frame used for the baseline computation
df = df_airport.copy()
# Optional filter on taxi-out times above one hour (disabled):
#df=df[df['TO']<=3600]

def get_2_month_average(row, df, start_date=datetime(2015, 1, 1, 6)):
    """Attach the trailing-window mean of 'TO' to a single flight row.

    The window contains every row of ``df`` whose 'Flight Datetime' is
    strictly before the flight's own 'Flight Datetime' and not earlier than
    the flight's 'date-60' value (clamped to ``start_date``).

    Parameters
    ----------
    row : pandas.Series
        One flight, with at least 'Flight Datetime' and 'date-60' entries.
    df : pandas.DataFrame
        Frame with 'Flight Datetime' and 'TO' columns to average over.
    start_date : datetime, optional
        Earliest allowed lower bound for the window. Defaults to
        2015-01-01 06:00, the value previously hard-coded here.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with 'date-60' clamped to ``start_date`` and two
        extra entries: '2M Average' (mean of 'TO' in the window, NaN when the
        window is empty) and 'window_size' (number of rows in the window).
    """
    # Work on a copy so the caller's row (and any frame it came from) is
    # never modified as a side effect.
    row = row.copy()

    # Clamp the lower bound of the window to the configured start date
    if row['date-60'] < start_date:
        row['date-60'] = start_date

    # Strictly earlier flights only, so a flight never contributes to its own average
    in_window = (
        (df['Flight Datetime'] < row['Flight Datetime'])
        & (df['Flight Datetime'] >= row['date-60'])
    )
    window_durations = df.loc[in_window, 'TO']

    row['2M Average'] = window_durations.mean()
    row['window_size'] = window_durations.shape[0]
    return row

# Compute the trailing average for every flight; each row re-filters the whole
# frame, so this is slow on the full dataset.
df_2ma = df.apply(lambda x: get_2_month_average(x, df), axis=1)
# Back-fill missing values from the next available row (e.g. the NaN average
# of a flight with an empty look-back window). DataFrame.bfill() replaces
# fillna(method='bfill'), which is deprecated in recent pandas releases.
# Note that this back-fills every column, not only '2M Average'.
df_2ma = df_2ma.bfill()

### Test set score 

In [None]:
# Load the pre-computed 2-month moving-average predictions
df_2ma = pd.read_csv('2M_MA_2015-2019.csv')
# Parse the flight timestamps so the .dt accessor can be used
df_2ma['Flight Datetime'] = pd.to_datetime(df_2ma['Flight Datetime'])
# Keep only the test period (flights in 2019)
df_2ma_test = df_2ma.loc[lambda frame: frame['Flight Datetime'].dt.year == 2019]

In [None]:
# Score the 2-month moving average on the test data (2019)
y_true, y_pred = df_2ma_test['TO'], df_2ma_test['2M Average']
scores = ma.get_scores(
    y_true,
    y_pred,
    metrics=['rmse', 'mae', 'r2', 'first_quartile_error', 'third_quartile_error'],
)
# One "name:value" line per metric, rounded to two decimals
for metric_name, metric_value in scores.items():
    print("{}:{:0.2f}".format(metric_name, metric_value))

### All data (including outliers)

In [None]:
# Score the 2-month moving average on the entire dataset (2015-2019)
df_2ma = pd.read_csv('2M_MA_2015-2019.csv')
y_true, y_pred = df_2ma['TO'], df_2ma['2M Average']
scores = ma.get_scores(
    y_true,
    y_pred,
    metrics=['rmse', 'mae', 'r2', 'first_quartile_error', 'third_quartile_error'],
)
# One "name:value" line per metric, rounded to two decimals
for metric_name, metric_value in scores.items():
    print("{}:{:0.2f}".format(metric_name, metric_value))

## Simple moving average (Grouped by runway)

### Window size = 20

In [None]:
# Fixed-window simple moving average, computed separately for each runway
df_sma = (
    df_airport
    .groupby('Runway')
    .apply(lambda runway_df: ma.simple_moving_average(runway_df, window_size=20))
)

In [None]:
# Score the simple moving average obtained with window_size=20
y_true, y_pred = df_sma['TO'], df_sma['SMA']
scores = ma.get_scores(
    y_true,
    y_pred,
    metrics=['rmse', 'mae', 'r2', 'first_quartile_error', 'third_quartile_error'],
)
# Metric name on one line, its value (two decimals) on the next
for metric_name, metric_value in scores.items():
    print("{}:\n{:0.2f}".format(metric_name, metric_value))

### Best window size

In [None]:
# Search window sizes 1..100 and keep the one selected by ma.get_best_window_size
# for the 'rmse' metric
best_window_size = ma.get_best_window_size(
    df_airport,
    np.arange(1, 101),
    metric='rmse',
)
print('Best window size: {}'.format(best_window_size))

# Recompute the per-runway simple moving average with the selected window
df_sma = (
    df_airport
    .groupby('Runway')
    .apply(lambda runway_df: ma.simple_moving_average(runway_df, window_size=best_window_size))
)

# Score the resulting predictions
y_true, y_pred = df_sma['TO'], df_sma['SMA']
scores = ma.get_scores(
    y_true,
    y_pred,
    metrics=['rmse', 'mae', 'r2', 'first_quartile_error', 'third_quartile_error'],
)
# Metric name on one line, its value (two decimals) on the next
for metric_name, metric_value in scores.items():
    print("{}:\n{:0.2f}".format(metric_name, metric_value))

## Exponential moving average (Grouped by runway)

### Alpha = 0.1

In [None]:
# Exponential moving average per runway for a given alpha value
df_ema = (
    df_airport
    .groupby('Runway')
    .apply(lambda runway_df: ma.exponential_moving_average(runway_df, alpha=0.1))
)

# Score the resulting predictions
y_true, y_pred = df_ema['TO'], df_ema['EMA']
scores = ma.get_scores(
    y_true,
    y_pred,
    metrics=['rmse', 'mae', 'r2', 'first_quartile_error', 'third_quartile_error'],
)
# Metric name on one line, its value (two decimals) on the next
for metric_name, metric_value in scores.items():
    print("{}:\n{:0.2f}".format(metric_name, metric_value))