In [1]:
# Import packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

RSEED = 42

In [None]:
# function to classify data as delyed or not
def del_ontime(dataf):
    if dataf["target"] == 0:
        return "ot"
    else:
        return "del"

# import of data
df = pd.read_csv("data/cleaned_data.csv")
df = df[df['target'] <= 480]
df["route"] = df["DEPSTN"] + df["ARRSTN"]
df["strat"] = df.apply(del_ontime, axis=1)

df.head(10)

Unnamed: 0,DEPSTN,ARRSTN,AC_TYPE,target,flight_duration,month,dayparts,hour_bucket,route,strat
0,CMN,TUN,32A,260.0,145.0,Jan,Morning,h10,CMNTUN,del
1,MXP,TUN,31B,20.0,110.0,Jan,Afternoon,h15,MXPTUN,del
2,TUN,IST,32A,0.0,155.0,Jan,Night,h04,TUNIST,ot
3,DJE,NTE,736,0.0,170.0,Jan,Afternoon,h14,DJENTE,ot
4,TUN,ALG,320,22.0,80.0,Jan,Afternoon,h14,TUNALG,del
5,TLS,TUN,736,53.0,115.0,Jan,Afternoon,h16,TLSTUN,del
6,TUN,BCN,32A,10.0,105.0,Jan,Early Morning,h07,TUNBCN,del
7,TUN,ORY,32A,15.0,140.0,Jan,Early Morning,h07,TUNORY,del
8,TUN,FCO,32A,16.0,80.0,Jan,Early Morning,h07,TUNFCO,del
9,TUN,NCE,31A,21.0,90.0,Jan,Early Morning,h07,TUNNCE,del


In [None]:
# definition of categorical and numerical values, which will be used for regression
cat_vars = ['hour_bucket', 'strat', "ARRSTN", "AC_TYPE", "month"]
num_vars = ['flight_duration']

df1 = df[num_vars].copy()
df2 = pd.get_dummies(df[cat_vars], dtype = int, drop_first=True)
df1 = pd.concat([df1, df2], axis=1)

# df1 is constructed as X

X = df1.copy()
y = df["target"]
X

Unnamed: 0,flight_duration,hour_bucket_h01,hour_bucket_h02,hour_bucket_h03,hour_bucket_h04,hour_bucket_h05,hour_bucket_h06,hour_bucket_h07,hour_bucket_h08,hour_bucket_h09,...,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,145.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,110.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,155.0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,170.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,80.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93674,50.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
93675,145.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
93676,45.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
93677,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=X["strat_ot"], random_state = RSEED)

# drop categorical parameter. It's just used for stratification/splitting, not for regression!
X_train = X_train.drop(["strat_ot"], axis=1)
X_test = X_test.drop(["strat_ot"], axis=1)
X_train

Unnamed: 0,flight_duration,hour_bucket_h01,hour_bucket_h02,hour_bucket_h03,hour_bucket_h04,hour_bucket_h05,hour_bucket_h06,hour_bucket_h07,hour_bucket_h08,hour_bucket_h09,...,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
60345,175.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
54503,150.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
62497,120.0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
16132,295.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83894,130.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65868,150.0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
53843,145.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48342,110.0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
65935,170.0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [None]:
# run SVR with rbf as kernel parameter

svr_reg = SVR(kernel='rbf')

In [6]:
svr_reg.fit(X_train, y_train)

In [None]:
# calculate RMSE
svr_y_pred = svr_reg.predict(X_test)
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, svr_y_pred)))

Root Mean Squared Error (RMSE): 79.58451529673299


In [14]:
y_test_pred_svr_rbf = pd.Series(svr_y_pred)
y_test_pred_svr_rbf.describe()

count    23114.000000
mean        18.251407
std          5.932474
min         -7.893984
25%         16.470754
50%         18.192458
75%         19.543056
max         37.267289
dtype: float64

In [None]:
# run SVR with polynomial as kernel parameter

svr_reg_poly = SVR(kernel='poly')

In [11]:
svr_reg_poly.fit(X_train, y_train)

In [None]:
# calculate RMSE
svr_y_pred_poly = svr_reg.predict(X_test)
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, svr_y_pred_poly)))

Root Mean Squared Error (RMSE): 79.58451529673299
