In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import merging
import preprocess
import scores
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb 
import re

In [None]:
# Load the preprocessed 2015-2019 dataset from the parent directory.
# A forward slash is used on purpose: '..\d' in a plain string literal is an
# invalid escape sequence (it triggers a warning on recent Python versions) and
# a backslash separator only resolves on Windows, whereas '/' works everywhere.
df = pd.read_csv('../df_preprocessed_2015-2019.csv')

In [None]:
# read_csv was called without parse_dates, so convert the timestamp columns
# to datetime here, all three in one column-wise pass.
datetime_cols = ['Flight Datetime', 'AOBT', 'ATOT']
df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime)

In [None]:
# Runway by traffic: run preprocess.get_runway_traffic on each runway's rows,
# then replace the group-derived index with a fresh 0..n-1 index.
df = (
    df.groupby('Runway')
      .apply(preprocess.get_runway_traffic)
      .reset_index(drop=True)
)

In [None]:
# Display the column labels of df to check what is available at this stage.
df.columns

In [None]:
# Remove the raw timestamp columns from the frame
# (presumably because the model cannot consume datetime dtypes - TODO confirm).
df = df.drop(columns=['Flight Datetime', 'AOBT', 'ATOT'])

In [None]:
# Removing special characters from variable names: keep only ASCII letters,
# digits and underscores (so e.g. 'Unnamed: 0' becomes 'Unnamed0').
df = df.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Drop the leftover 'Unnamed0' column (presumably the index written when the
# CSV was saved - TODO confirm). errors='ignore' keeps this cell re-runnable
# and tolerant of a CSV exported without that column, instead of raising
# KeyError the second time round.
df = df.drop(columns=['Unnamed0'], errors='ignore')

In [None]:
# Cast the nominal and calendar columns to pandas' categorical dtype in one
# astype call (presumably so LightGBM treats them as categorical features).
categorical_cols = [
    'AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
    'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType',
    'aobt_month', 'aobt_day', 'aobt_hour',
]
df = df.astype({name: 'category' for name in categorical_cols})

## Time Series Train-Test split

In [None]:
# Temporal hold-out: rows from 2019 form the test set, every other year trains.
is_2019 = df['aobt_year'] == 2019
train, test = df[~is_2019], df[is_2019]

# 'TO' is the regression target; all remaining columns are the predictors.
X_train, y_train = train.drop(columns='TO'), train['TO']
X_test, y_test = test.drop(columns='TO'), test['TO']

### Model: All variables

In [None]:
# Fit a 75-tree LightGBM regressor on the current X_train, then predict on
# both partitions so train and test scores can be compared.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = (reg.predict(part) for part in (X_train, X_test))

In [None]:
# Report RMSE, R^2 and MAE for the training fit, then for the 2019 hold-out.
train_scores = scores.get_scores(y_train, y_pred_train, ['rmse', 'r2', 'mae'])
print('Train scores : {}'.format(train_scores))
test_scores = scores.get_scores(y_test, y_pred_test, ['rmse', 'r2', 'mae'])
print('Test scores : {}'.format(test_scores))

In [None]:
# Plot the top 20 features of the fitted model, ranked by gain.
lgb.plot_importance(
    booster=reg,
    importance_type='gain',
    max_num_features=20,
)

### Single feature models

In [None]:
# Fit one single-predictor model per column to gauge how much each feature
# explains on its own.
# The target 'TO' is excluded from the candidates: iterating over every column
# would also fit a model that predicts the target from itself, which is pure
# leakage and reports a meaninglessly perfect score.
# Loop-specific names are used so this cell does not overwrite the notebook-level
# X_train / X_test / reg / y_pred_* produced by the all-variables model.
for col in train.columns.drop('TO'):
    X_train_single = train[[col]]
    X_test_single = test[[col]]
    single_reg = lgb.LGBMRegressor(n_estimators=75)
    single_reg.fit(X_train_single, y_train)
    single_pred_train = single_reg.predict(X_train_single)
    single_pred_test = single_reg.predict(X_test_single)
    print("Feature: {}".format(col))
    print('Train scores : {}'.format(scores.get_scores(y_train, single_pred_train, ['rmse', 'r2', 'mae'])))
    print('Test scores : {}\n'.format(scores.get_scores(y_test, single_pred_test, ['rmse', 'r2', 'mae'])))

### Feature subset

In [None]:
# Display the column labels of train; the feature subset below is picked from these.
train.columns

In [None]:
# Hand-picked predictor subset used for the reduced model.
features = [
    'traffic', 'Q',
    'TO1', 'TO2', 'TO3',
    'TORunway1', 'TORunway2', 'TORunway3',
    'aobt_year', 'aobt_month', 'aobt_day', 'aobt_hour',
    'windSpeed', 'precipAccumulation', 'Lengthft',
]

# Restrict both partitions to that subset; the targets y_train / y_test are unchanged.
X_train, X_test = train[features], test[features]

In [None]:
# Refit the 75-tree LightGBM regressor on the current X_train and predict on
# both partitions.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = (reg.predict(part) for part in (X_train, X_test))

In [None]:
# Report RMSE, R^2 and MAE for the training fit, then for the 2019 hold-out.
train_scores = scores.get_scores(y_train, y_pred_train, ['rmse', 'r2', 'mae'])
print('Train scores : {}'.format(train_scores))
test_scores = scores.get_scores(y_test, y_pred_test, ['rmse', 'r2', 'mae'])
print('Test scores : {}'.format(test_scores))

In [None]:
# Plot the features of the fitted model ranked by gain (capped at 20 entries).
lgb.plot_importance(
    booster=reg,
    importance_type='gain',
    max_num_features=20,
)

#### Error Analysis

In [None]:
# Let pandas render up to 60 columns and 60 rows before truncating the output.
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 60)

In [None]:
# Absolute prediction error per test row; the result keeps y_test's index.
errors = (y_test - y_pred_test).abs()

In [None]:
# Index labels of the 20 test rows with the largest absolute error.
indices = errors.sort_values(ascending=False).head(20).index

# errors carries y_test's index *labels*, so the rows must be fetched by label
# with .loc. Positional .iloc only returned the right rows while df happened to
# have a clean 0..n-1 index, and would silently pick wrong rows otherwise.
# .copy() makes bigerrors an independent frame so adding a column below does not
# trigger SettingWithCopyWarning.
bigerrors = df.loc[indices].copy()

# Attach the model's prediction next to the true value, aligned by label.
bigerrors['predicted'] = pd.Series(y_pred_test, index=y_test.index).loc[indices]

In [None]:
# NOTE(review): this repeats the assignment at the end of the previous cell and
# looks redundant - consider deleting one of the two.
# assign() builds a new frame rather than writing into a selection of df (which
# raises SettingWithCopyWarning), and .loc makes the label-based lookup explicit.
bigerrors = bigerrors.assign(
    predicted=pd.Series(y_pred_test, index=y_test.index).loc[indices]
)

In [None]:
# Show the true and predicted target next to the lag and traffic columns for
# the worst-predicted rows.
error_view_cols = [
    'TO', 'predicted',
    'TO1', 'TO2', 'TO3',
    'traffic', 'runway_traffic',
    'TORunway1', 'TORunway2', 'TORunway3',
]
bigerrors[error_view_cols]