In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import merging
import preprocess
import scores
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb 
import re

In [None]:
# Load the raw airport training set; the path is relative to the notebook's working directory.
df_airport = pd.read_csv('Taxi time - eleven Data Challenge/0. Airport data/training_set_airport_data.csv')

In [None]:
# Run the raw table through the project helpers in a fixed order; each helper
# receives the frame returned by the previous one.
# NOTE(review): the cell rebinds df_airport from itself, so re-running it without
# re-running the load cell applies the merges twice.
df_airport = (
    df_airport
    .pipe(preprocess.calc_TO_time)
    .pipe(merging.merge_distance)
    .pipe(merging.merge_traffic)
    .pipe(merging.get_weather_data)
    .pipe(merging.merge_tech)
)

In [None]:
# Peek at the first three rows of the merged table.
df_airport.head(3)

In [None]:
# NOTE(review): presumably adds the taxi times of the 5 preceding flights (the
# TO1..TO5 columns used further down) — helper lives in preprocess.py, confirm there.
df=preprocess.get_previous_taxi_times(df_airport,5)

In [None]:
# NOTE(review): presumably adds the 'MA_30' moving-average column referenced in
# later cells — confirm in preprocess.py.
df=preprocess.get_ma(df,30)

In [None]:
# Proxy for delay: signed number of seconds between AOBT and the scheduled
# 'Flight Datetime'.
# `.dt.seconds` returns only the seconds *component* (0..86399) of a timedelta,
# so a negative delta (AOBT before schedule) or a delta longer than one day was
# silently wrapped. `.dt.total_seconds()` keeps both sign and full magnitude.
df['delay']=(df['AOBT']-df['Flight Datetime']).dt.total_seconds()

In [None]:
# Runway by traffic
# Applies preprocess.get_runway_traffic to each runway's rows separately, then
# discards the group index in favour of a fresh 0..n-1 index.
# NOTE(review): presumably adds the 'runway_traffic' column shown in the error
# analysis — confirm in preprocess.py.
df=df.groupby('Runway').apply(preprocess.get_runway_traffic).reset_index(drop=True)

In [None]:
# Scheduled flights for the day/hour
# Count rows per calendar day and per clock hour of 'Flight Datetime'. Each
# count table is keyed by the start of its bin.
datecounts = (
    df.set_index('Flight Datetime')
      .groupby(pd.Grouper(freq='D'))
      .size()
      .reset_index(name='Scheduled Flights day')
)
hourcounts = (
    df.set_index('Flight Datetime')
      .groupby(pd.Grouper(freq='H'))
      .size()
      .reset_index(name='Scheduled Flights hour')
)
# merge_asof needs the key sorted; its default backward match attaches to every
# flight the count of the latest bin start that is <= its 'Flight Datetime'.
df = (
    df.sort_values('Flight Datetime')
      .reset_index(drop=True)
      .pipe(pd.merge_asof, datecounts, on='Flight Datetime')
      .pipe(pd.merge_asof, hourcounts, on='Flight Datetime')
)

In [None]:
# Pairwise correlation between the target TO, the hourly scheduled-flight count and MA_30.
df[['TO','Scheduled Flights hour','MA_30',]].corr()

In [None]:
# Calendar features derived from AOBT. Note that 'aobt_day' holds the weekday
# (0 = Monday), not the day of the month. The three timestamp columns are
# dropped once the features have been extracted.
df = df.assign(
    aobt_year=df['AOBT'].dt.year,
    aobt_month=df['AOBT'].dt.month,
    aobt_day=df['AOBT'].dt.weekday,
    aobt_hour=df['AOBT'].dt.hour,
).drop(columns=['Flight Datetime', 'AOBT', 'ATOT'])

In [None]:
# Removing special characters from variable names: every run of characters
# outside [A-Za-z0-9_] is deleted (e.g. 'Scheduled Flights day' becomes
# 'ScheduledFlightsday'), after which the 'Unnamed0' column is dropped.
# `re` is already imported in the first cell.
df = (
    df.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
      .drop(columns=['Unnamed0'])
)

In [None]:
# Label-like columns (including month, weekday and hour) are cast to pandas'
# category dtype in a single astype call.
cols = [
    'AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
    'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType',
    'aobt_month', 'aobt_day', 'aobt_hour',
]
df = df.astype({name: 'category' for name in cols})

## Taxi time analysis

In [None]:
# Wide, flat box plot of the target TO to make the tail of outliers visible.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(x=df['TO'], ax=ax)

In [None]:
# Tukey fences: values further than 1.5 * IQR outside the quartiles count as outliers.
q1, q3 = np.percentile(df['TO'], (25, 75))
iqr = q3 - q1
lb, ub = q1 - 1.5 * iqr, q3 + 1.5 * iqr
is_outlier = (df['TO'] < lb) | (df['TO'] > ub)
print("Percentage of outliers: {} ".format(100 * int(is_outlier.sum()) / df.shape[0]))

In [None]:
# Shape of the subset of rows whose TO exceeds 3600.
df[(df['TO']>3600)].shape

## Feature analysis: Traffic

In [None]:
# Pearson correlation between 'traffic' and TO (off-diagonal entry of the 2x2 matrix).
print('Correlation with taxi times:{}'.format(np.corrcoef(df['traffic'],df['TO'])[0][1]))

In [None]:
# Mean TO for each traffic level.
# x and y are passed by keyword: from seaborn 0.12 on, every argument after
# `data` is keyword-only, so the former positional call raises a TypeError.
sns.barplot(x=df['traffic'],y=df['TO'])

In [None]:
# Number of rows per traffic level.
# The Series is passed as x= explicitly: from seaborn 0.12 on, the first
# positional argument is `data`, not x.
sns.countplot(x=df['traffic'])

## Feature analysis: Previous taxi times

In [None]:
# Interactive scatter of the previous taxi time (TO1) against the current TO.
import plotly.express as px
fig = px.scatter(x=df['TO1'],y=df['TO'])
fig.show()

In [None]:
# Pearson correlation matrix between TO and the five TO1..TO5 columns.
df[['TO','TO1','TO2', 'TO3', 'TO4', 'TO5']].corr(method='pearson')

## Feature analysis: Distance

In [None]:
# Pearson correlation between 'distance' and TO (off-diagonal entry of the 2x2 matrix).
print('Correlation with taxi times:{}'.format(np.corrcoef(df['distance'],df['TO'])[0][1]))

In [None]:
# Distribution of 'distance' in 10 bins.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot — migrate once the installed seaborn version is confirmed.
sns.distplot(df['distance'],bins=10)

In [None]:
# Work on a copy so df stays untouched: cut 'distance' into 10 equal-width bins
# and show the mean TO per bin.
new_df = df.assign(cat_distance=pd.cut(df['distance'], 10))
new_df.groupby('cat_distance')['TO'].mean()

In [None]:
# Horizontal bars: mean TO (x) for every distance bin (y).
# x and y are passed by keyword: from seaborn 0.12 on, every argument after
# `data` is keyword-only, so the former positional call raises a TypeError.
sns.barplot(x=new_df['TO'],y=new_df['cat_distance'],orient='h')

## Feature analysis: Delay

In [None]:
# Pearson correlation between 'delay' and TO (off-diagonal entry of the 2x2 matrix).
print('Correlation with taxi times:{}'.format(np.corrcoef(df['delay'],df['TO'])[0][1]))

In [None]:
# Distribution of 'delay' in 10 bins.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot — migrate once the installed seaborn version is confirmed.
sns.distplot(df['delay'],bins=10)

In [None]:
# Work on a copy so df stays untouched: cut 'delay' into 10 equal-width bins
# and show the mean TO per bin.
new_df = df.assign(cat_delay=pd.cut(df['delay'], 10))
new_df.groupby('cat_delay')['TO'].mean()

In [None]:
# Number of rows falling into each delay bin, drawn as horizontal bars.
sns.countplot(y=new_df['cat_delay'],orient='h')

## Feature analysis: Weather

In [None]:
# Weather columns that are included in the correlation table of this section.
weather_numeric_cols=['precipIntensity', 'precipProbability', 'temperature',
       'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed',
       'windGust', 'windBearing', 'cloudCover', 'uvIndex', 'visibility',
       'precipAccumulation', 'ozone']

In [None]:
# Mean TO (seaborn's default bar estimator) for every weather 'summary' value.
sns.barplot(y=df['summary'],x=df['TO'])

In [None]:
# Number of rows per weather 'summary' value.
sns.countplot(y=df['summary'])

In [None]:
# Correlation matrix of the weather columns together with TO, rounded to two decimals.
df[weather_numeric_cols+['TO']].corr().round(decimals=2)

## Feature analysis: AC characteristics

In [None]:
# Aircraft-characteristic columns, split by dtype: the ones cast to 'category'
# in the earlier cell versus the integer/float ones.
ac_chars=['Manufacturer', 'Model',
       'Engines', 'Wingspanft', 'Lengthft', 'WakeCategory', 'Final',
       'AircraftModel_y', 'OldAircraftLength', 'OldAircraftSpan',
       'OldNoEngines', 'OldMovementType']
cat_cols=df[ac_chars].select_dtypes('category').columns
numeric_cols=df[ac_chars].select_dtypes(['int64','float','int32']).columns

In [None]:
# Show which aircraft columns were picked up as numeric.
numeric_cols

In [None]:
# Left: mean TO per manufacturer; right: number of rows per manufacturer.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 15))
sns.barplot(data=df, x='TO', y='Manufacturer', ax=ax_mean)
sns.countplot(data=df, y='Manufacturer', ax=ax_count)

In [None]:
# Left: mean TO per aircraft model; right: number of rows per model.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 15))
sns.barplot(data=df, x='TO', y='Model', ax=ax_mean)
sns.countplot(data=df, y='Model', ax=ax_count)

In [None]:
# Left: mean TO per 'Engines' value (horizontal bars); right: number of rows per value.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 5))
sns.barplot(data=df, x='TO', y='Engines', orient='h', ax=ax_mean)
sns.countplot(data=df, y='Engines', ax=ax_count)

In [None]:
# Left: mean TO per wake category (horizontal bars); right: number of rows per category.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 5))
sns.barplot(data=df, x='TO', y='WakeCategory', orient='h', ax=ax_mean)
sns.countplot(data=df, y='WakeCategory', ax=ax_count)

In [None]:
# Left: mean TO per 'Final' value (horizontal bars); right: number of rows per value.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 30))
sns.barplot(data=df, x='TO', y='Final', orient='h', ax=ax_mean)
sns.countplot(data=df, y='Final', ax=ax_count)

In [None]:
# Left: mean TO per 'AircraftModel_y' value (horizontal bars); right: number of rows per value.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 30))
sns.barplot(data=df, x='TO', y='AircraftModel_y', orient='h', ax=ax_mean)
sns.countplot(data=df, y='AircraftModel_y', ax=ax_count)

In [None]:
# Left: mean TO per 'OldMovementType' value (horizontal bars); right: number of rows per value.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 2))
sns.barplot(data=df, x='TO', y='OldMovementType', orient='h', ax=ax_mean)
sns.countplot(data=df, y='OldMovementType', ax=ax_count)

In [None]:
# Correlation matrix of the numeric aircraft columns together with TO.
df[list(numeric_cols)+['TO']].corr()

## Feature analysis: Stand/Runway

In [None]:
# Left: mean TO per stand (horizontal bars); right: number of rows per stand.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 30))
sns.barplot(data=df, x='TO', y='Stand', orient='h', ax=ax_mean)
sns.countplot(data=df, y='Stand', ax=ax_count)

In [None]:
# Left: mean TO per runway (horizontal bars); right: number of rows per runway.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 5))
sns.barplot(data=df, x='TO', y='Runway', orient='h', ax=ax_mean)
sns.countplot(data=df, y='Runway', ax=ax_count)

## Feature analysis: Time attributes

In [None]:
# Left: mean TO per AOBT year (horizontal bars); right: number of rows per year.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 5))
sns.barplot(data=df, x='TO', y='aobt_year', orient='h', ax=ax_mean)
sns.countplot(data=df, y='aobt_year', ax=ax_count)

In [None]:
# Left: mean TO per AOBT month (horizontal bars); right: number of rows per month.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 10))
sns.barplot(data=df, x='TO', y='aobt_month', orient='h', ax=ax_mean)
sns.countplot(data=df, y='aobt_month', ax=ax_count)

In [None]:
# Left: mean TO per AOBT hour of day (horizontal bars); right: number of rows per hour.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 10))
sns.barplot(data=df, x='TO', y='aobt_hour', orient='h', ax=ax_mean)
sns.countplot(data=df, y='aobt_hour', ax=ax_count)

In [None]:
# 'aobt_day' is the weekday of AOBT. Left: mean TO per weekday (horizontal
# bars); right: number of rows per weekday.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(25, 5))
sns.barplot(data=df, x='TO', y='aobt_day', orient='h', ax=ax_mean)
sns.countplot(data=df, y='aobt_day', ax=ax_count)

## Normal Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# Random 75/25 split with a fixed seed. Rows of all years are shuffled
# together here, in contrast to the year-based split used later.
y = df['TO']
X = df.drop(columns=['TO'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# LightGBM regressor with 1000 trees, fitted on the random split; predictions
# are kept for both partitions so train and test scores can be compared.
# (`lgb` is already imported in the first cell.)
reg = lgb.LGBMRegressor(n_estimators=1000).fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Report rmse, r2 and mae (as computed by the project's scores.get_scores) for
# the training and the test partition.
print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
print('Test scores : {}'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

In [None]:
# Top 20 features of the fitted model, using plot_importance's default importance type.
lgb.plot_importance(reg,max_num_features=20)

## Time Series Train-Test split

### All variables

In [None]:
# Time-based split: 2018 is held out as the test set, every other year is
# used for training. 'TO' is the target.
is_2018 = df['aobt_year'] == 2018
train, test = df[~is_2018], df[is_2018]

X_train, X_test = train.drop(columns='TO'), test.drop(columns='TO')
y_train, y_test = train['TO'], test['TO']

In [None]:
# LightGBM regressor with 75 trees on all variables; predictions are kept for
# both partitions so train and test scores can be compared.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Report rmse, r2 and mae (as computed by the project's scores.get_scores) for
# the training and the test partition.
print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
print('Test scores : {}'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

In [None]:
# Top 20 features ranked by gain instead of plot_importance's default importance type.
lgb.plot_importance(reg,max_num_features=20,importance_type='gain')

### Single feature models

In [None]:
# Fit one 75-tree LightGBM model per column to see how much each feature
# explains on its own. Relies on train/test/y_train/y_test from the year-based
# split above.
# The target 'TO' is excluded: iterating over all of df.columns also fitted a
# model that predicts TO from TO itself, which is meaningless and only adds a
# misleading, near-perfect entry to the printed comparison.
for col in df.columns.drop('TO'):
    X_train=train[[col]]
    X_test=test[[col]]
    reg=lgb.LGBMRegressor(n_estimators=75)
    reg.fit(X_train,y_train)
    y_pred_train = reg.predict(X_train)
    y_pred_test = reg.predict(X_test)
    print("Feature: {}".format(col))
    print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
    print('Test scores : {}\n'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

### Feature subset

In [None]:
# List the available column names before hand-picking a feature subset.
train.columns

In [None]:
# Hand-picked feature subset. 'ScheduledFlightsday' is the 'Scheduled Flights day'
# column after the earlier regex rename removed the spaces.
features=[ 'distance', 'traffic',
       'summary', 'precipIntensity', 'precipProbability', 'temperature',
       'apparentTemperature', 'dewPoint', 'humidity',  'windSpeed',
       'windGust', 'windBearing',  'uvIndex', 'visibility',
        'Engines',
       'Wingspanft', 'Lengthft', 
       'TO1', 'TO2', 'TO3', 'TO4', 'TO5', 'delay',
       'aobt_year', 'aobt_month', 'aobt_day', 'aobt_hour','ScheduledFlightsday','MA_30']
# Restrict both partitions of the year-based split to that subset.
X_train=train[features]
X_test=test[features]

In [None]:
# LightGBM regressor with 75 trees on the feature subset; predictions are kept
# for both partitions so train and test scores can be compared.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Report rmse, r2 and mae (as computed by the project's scores.get_scores) for
# the training and the test partition.
print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
print('Test scores : {}'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

In [None]:
# Top 20 features ranked by gain instead of plot_importance's default importance type.
lgb.plot_importance(reg,max_num_features=20,importance_type='gain')

#### Error analysis

In [None]:
# Let pandas render up to 60 columns and 60 rows so the wide error table is not truncated.
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 60)

In [None]:
# Test-set predictions in ascending order, to inspect the range the model outputs.
np.sort(y_pred_test)

In [None]:
# Absolute prediction error per test row; the result is a Series that carries y_test's index.
errors = (y_pred_test - y_test).abs()

In [None]:
# The 20 test rows with the largest absolute error, with their prediction attached.
# `indices` holds index *labels* (inherited from y_test), so rows are looked up
# with label-based .loc. Positional .iloc returned the right rows only as long
# as df kept a pristine 0..n-1 index.
# .copy() makes bigerrors an independent frame, so adding the 'predicted'
# column neither raises SettingWithCopyWarning nor risks writing to a view.
indices=errors.sort_values(ascending=False)[:20].index
bigerrors=df.loc[indices].copy()
bigerrors['predicted']=pd.Series(y_pred_test,index=y_test.index).loc[indices]

In [None]:
# Aligns the test predictions with y_test's index and stores the ones for the
# selected rows. NOTE(review): this repeats the last statement of the previous
# cell and looks redundant — confirm and remove.
bigerrors['predicted']=pd.Series(y_pred_test,index=y_test.index)[indices]

In [None]:
# Share of the worst-predicted rows whose actual TO exceeds 5400.
len(bigerrors[bigerrors['TO'] > 5400]) / len(bigerrors)

In [None]:
# Actual vs. predicted TO for the worst rows, next to the TO1..TO5 and traffic columns.
bigerrors[['TO','predicted','TO1','TO2','TO3','TO4','TO5','traffic','runway_traffic']]

## Removing outliers

#### TO>5400

In [None]:
# Start again from the merged table and keep only rows with TO <= threshold;
# report how many rows exceed it, in absolute numbers and as a percentage.
threshold=5400
n_removed = df_airport[df_airport['TO']>threshold].shape[0]
df=df_airport[df_airport['TO']<=threshold]
print("Filtering out {} values ({:0.2f}%)".format(n_removed, 100*(n_removed/df_airport.shape[0])))

In [None]:
# Rebuild the engineered features on the outlier-filtered frame: the same
# steps as at the top of the notebook, plus get_ma with windows 60 and 100 and
# preprocess.get_no_takeoffs.
df=preprocess.get_previous_taxi_times(df,5)
for window in (30, 60, 100):
    df=preprocess.get_ma(df,window)
df=preprocess.get_no_takeoffs(df)

# Signed delay in seconds. `.dt.seconds` returns only the seconds component
# (0..86399), so negative deltas and deltas longer than a day were wrapped;
# `.dt.total_seconds()` keeps sign and magnitude.
df['delay']=(df['AOBT']-df['Flight Datetime']).dt.total_seconds()

df=df.groupby('Runway').apply(preprocess.get_runway_traffic).reset_index(drop=True)

# Scheduled flights per day / per hour, attached to every row through a
# backward as-of merge on the (sorted) 'Flight Datetime'.
datecounts=df.set_index('Flight Datetime').groupby(pd.Grouper(freq='D')).size().reset_index(name='Scheduled Flights day')
hourcounts=df.set_index('Flight Datetime').groupby(pd.Grouper(freq='H')).size().reset_index(name='Scheduled Flights hour')
df=df.sort_values('Flight Datetime').reset_index(drop=True)
df=pd.merge_asof(df,datecounts,on='Flight Datetime')
df=pd.merge_asof(df,hourcounts,on='Flight Datetime')

# Calendar features from AOBT ('aobt_day' is the weekday); the timestamp
# columns are dropped afterwards.
df['aobt_year']=df['AOBT'].dt.year
df['aobt_month']=df['AOBT'].dt.month
df['aobt_day']=df['AOBT'].dt.weekday
df['aobt_hour']=df['AOBT'].dt.hour
df.drop(['Flight Datetime', 'AOBT', 'ATOT'],axis=1,inplace=True)

# Strip characters outside [A-Za-z0-9_] from the column names, then drop 'Unnamed0'.
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df.drop(['Unnamed0'],axis=1,inplace=True)

# Cast the label-like columns to the category dtype.
cols=['AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
       'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType','aobt_month', 'aobt_day', 'aobt_hour']
for col in cols:
    df[col]=df[col].astype('category')

In [None]:
# Time-based split: 2018 is held out as the test set, every other year is
# used for training. 'TO' is the target.
is_2018 = df['aobt_year'] == 2018
train, test = df[~is_2018], df[is_2018]

X_train, X_test = train.drop(columns='TO'), test.drop(columns='TO')
y_train, y_test = train['TO'], test['TO']

In [None]:
# LightGBM regressor with 75 trees on the TO <= 5400 data; predictions are
# kept for both partitions so train and test scores can be compared.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Report rmse, r2 and mae (as computed by the project's scores.get_scores) for
# the training and the test partition.
print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
print('Test scores : {}'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

In [None]:
# Top 20 features ranked by gain instead of plot_importance's default importance type.
lgb.plot_importance(reg,max_num_features=20,importance_type='gain')

#### TO>3600

In [None]:
# Start again from the merged table and keep only rows with TO <= threshold;
# report how many rows exceed it, in absolute numbers and as a percentage.
threshold=3600
n_removed = df_airport[df_airport['TO']>threshold].shape[0]
df=df_airport[df_airport['TO']<=threshold]
print("Filtering out {} values ({:0.2f}%)".format(n_removed, 100*(n_removed/df_airport.shape[0])))

In [None]:
# Rebuild the engineered features on the outlier-filtered frame: the same
# steps as at the top of the notebook, plus get_ma with windows 60 and 100 and
# preprocess.get_no_takeoffs.
df=preprocess.get_previous_taxi_times(df,5)
for window in (30, 60, 100):
    df=preprocess.get_ma(df,window)
df=preprocess.get_no_takeoffs(df)

# Signed delay in seconds. `.dt.seconds` returns only the seconds component
# (0..86399), so negative deltas and deltas longer than a day were wrapped;
# `.dt.total_seconds()` keeps sign and magnitude.
df['delay']=(df['AOBT']-df['Flight Datetime']).dt.total_seconds()

df=df.groupby('Runway').apply(preprocess.get_runway_traffic).reset_index(drop=True)

# Scheduled flights per day / per hour, attached to every row through a
# backward as-of merge on the (sorted) 'Flight Datetime'.
datecounts=df.set_index('Flight Datetime').groupby(pd.Grouper(freq='D')).size().reset_index(name='Scheduled Flights day')
hourcounts=df.set_index('Flight Datetime').groupby(pd.Grouper(freq='H')).size().reset_index(name='Scheduled Flights hour')
df=df.sort_values('Flight Datetime').reset_index(drop=True)
df=pd.merge_asof(df,datecounts,on='Flight Datetime')
df=pd.merge_asof(df,hourcounts,on='Flight Datetime')

# Calendar features from AOBT ('aobt_day' is the weekday); the timestamp
# columns are dropped afterwards.
df['aobt_year']=df['AOBT'].dt.year
df['aobt_month']=df['AOBT'].dt.month
df['aobt_day']=df['AOBT'].dt.weekday
df['aobt_hour']=df['AOBT'].dt.hour
df.drop(['Flight Datetime', 'AOBT', 'ATOT'],axis=1,inplace=True)

# Strip characters outside [A-Za-z0-9_] from the column names, then drop 'Unnamed0'.
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df.drop(['Unnamed0'],axis=1,inplace=True)

# Cast the label-like columns to the category dtype.
cols=['AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
       'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType','aobt_month', 'aobt_day', 'aobt_hour']
for col in cols:
    df[col]=df[col].astype('category')

In [None]:
# Time-based split: 2018 is held out as the test set, every other year is
# used for training. 'TO' is the target.
is_2018 = df['aobt_year'] == 2018
train, test = df[~is_2018], df[is_2018]

X_train, X_test = train.drop(columns='TO'), test.drop(columns='TO')
y_train, y_test = train['TO'], test['TO']

In [None]:
# LightGBM regressor with 75 trees on the TO <= 3600 data; predictions are
# kept for both partitions so train and test scores can be compared.
reg = lgb.LGBMRegressor(n_estimators=75).fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

In [None]:
# Report rmse, r2 and mae (as computed by the project's scores.get_scores) for
# the training and the test partition.
print('Train scores : {}'.format(scores.get_scores(y_train,y_pred_train,['rmse','r2','mae'])))
print('Test scores : {}'.format(scores.get_scores(y_test,y_pred_test,['rmse','r2','mae'])))

In [None]:
# Top 20 features ranked by gain instead of plot_importance's default importance type.
lgb.plot_importance(reg,max_num_features=20,importance_type='gain')

## Isolation Forest for Outlier removal

In [None]:
# NOTE(review): preprocess.ohe presumably one-hot encodes the listed categorical
# columns so the frame is numeric for the IsolationForest fitted in this
# section — confirm in preprocess.py. df here is the TO<=3600 frame from the
# previous section.
df=preprocess.ohe(df,['AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
       'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType','aobt_month', 'aobt_day', 'aobt_hour'])

In [None]:
# Time-based split of the encoded frame: 2018 is held out as the test set,
# every other year is used for training. 'TO' is the target.
is_2018 = df['aobt_year'] == 2018
train, test = df[~is_2018], df[is_2018]
X_train, X_test = train.drop(columns='TO'), test.drop(columns='TO')
y_train, y_test = train['TO'], test['TO']

In [None]:
from sklearn.ensemble import IsolationForest

# Flag the most anomalous ~10% of the training rows (contamination=0.1) and
# drop them from the training set only; the test set is left untouched.
# random_state pins the forest so the same rows are removed on every run.
# NOTE(review): IsolationForest may reject NaN values depending on the
# scikit-learn version — confirm X_train has none, or impute first.
iso = IsolationForest(contamination=0.1, random_state=42)
yhat = iso.fit_predict(X_train)  # -1 marks an outlier, 1 an inlier
mask = yhat != -1
# X_train is a DataFrame: NumPy-style `X_train[mask, :]` raises, whereas a
# boolean array inside [] selects rows. y_train is filtered with the same mask
# so features and target stay aligned.
X_train, y_train = X_train[mask], y_train[mask]