In [96]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply

from sklearn.preprocessing import StandardScaler, normalize, Imputer, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import BernoulliNB
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

In [97]:
# load data

df = pd.read_csv('data/otp.csv')

In [98]:
df.head()

Unnamed: 0,Route,Departing_Port,Arriving_Port,Airline,Month,Sectors_Scheduled,Sectors_Flown,Cancellations,Departures_On_Time,Arrivals_On_Time,Departures_Delayed,Arrivals_Delayed,Year,Month_Num
0,Adelaide-Brisbane,Adelaide,Brisbane,All Airlines,37987,155.0,155,0.0,123.0,120.0,32.0,35.0,2004,1
1,Adelaide-Canberra,Adelaide,Canberra,All Airlines,37987,75.0,75,0.0,72.0,72.0,3.0,3.0,2004,1
2,Adelaide-Gold Coast,Adelaide,Gold Coast,All Airlines,37987,40.0,40,0.0,36.0,35.0,4.0,5.0,2004,1
3,Adelaide-Melbourne,Adelaide,Melbourne,All Airlines,37987,550.0,548,2.0,478.0,487.0,70.0,61.0,2004,1
4,Adelaide-Perth,Adelaide,Perth,All Airlines,37987,191.0,191,0.0,169.0,168.0,22.0,23.0,2004,1


In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80615 entries, 0 to 80614
Data columns (total 14 columns):
Route                 80615 non-null object
Departing_Port        80615 non-null object
Arriving_Port         80615 non-null object
Airline               80615 non-null object
Month                 80615 non-null int64
Sectors_Scheduled     80615 non-null float64
Sectors_Flown         80615 non-null int64
Cancellations         80299 non-null float64
Departures_On_Time    80615 non-null float64
Arrivals_On_Time      80615 non-null float64
Departures_Delayed    80610 non-null float64
Arrivals_Delayed      80615 non-null float64
Year                  80615 non-null int64
Month_Num             80615 non-null int64
dtypes: float64(6), int64(4), object(4)
memory usage: 8.6+ MB


In [100]:
df.shape

(80615, 14)

In [101]:
df.columns

Index(['Route', 'Departing_Port', 'Arriving_Port', 'Airline', 'Month',
       'Sectors_Scheduled', 'Sectors_Flown', 'Cancellations',
       'Departures_On_Time', 'Arrivals_On_Time', 'Departures_Delayed',
       'Arrivals_Delayed', 'Year', 'Month_Num'],
      dtype='object')

In [102]:
df.isnull().sum()

Route                   0
Departing_Port          0
Arriving_Port           0
Airline                 0
Month                   0
Sectors_Scheduled       0
Sectors_Flown           0
Cancellations         316
Departures_On_Time      0
Arrivals_On_Time        0
Departures_Delayed      5
Arrivals_Delayed        0
Year                    0
Month_Num               0
dtype: int64

In [103]:
df.fillna(np.mean, inplace=True)

In [76]:
df.describe()

Unnamed: 0,Month,Sectors_Scheduled,Sectors_Flown,Departures_On_Time,Arrivals_On_Time,Arrivals_Delayed,Year,Month_Num
count,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0
mean,41138.184755,354.073783,348.574769,291.119963,286.801848,61.771656,2012.176841,6.418284
std,1605.069509,2293.829113,2258.781735,1891.814034,1861.989924,412.717325,4.401643,3.438358
min,37987.0,0.0,0.0,0.0,0.0,0.0,2004.0,1.0
25%,39845.0,38.0,37.0,31.0,31.0,6.0,2009.0,3.0
50%,41306.0,93.0,92.0,76.0,76.0,14.0,2013.0,6.0
75%,42522.0,186.0,183.0,152.0,149.0,33.0,2016.0,9.0
max,43647.0,50949.0,49908.0,43537.0,43291.0,12840.0,2019.0,12.0


In [78]:
dummies_df = pd.get_dummies(df, drop_first=True)

In [79]:
dummies_df.shape

(80615, 2160)

In [80]:
dummies_df.columns

Index(['Month', 'Sectors_Scheduled', 'Sectors_Flown', 'Departures_On_Time',
       'Arrivals_On_Time', 'Arrivals_Delayed', 'Year', 'Month_Num',
       'Route_Adelaide-Brisbane', 'Route_Adelaide-Canberra',
       ...
       'Departures_Delayed_697.0', 'Departures_Delayed_1810.0',
       'Departures_Delayed_448.0', 'Departures_Delayed_8562.0',
       'Departures_Delayed_2199.0', 'Departures_Delayed_844.0',
       'Departures_Delayed_2889.0', 'Departures_Delayed_638.0',
       'Departures_Delayed_633.0', 'Departures_Delayed_11933.0'],
      dtype='object', length=2160)

In [81]:
dummies_df.describe()

Unnamed: 0,Month,Sectors_Scheduled,Sectors_Flown,Departures_On_Time,Arrivals_On_Time,Arrivals_Delayed,Year,Month_Num,Route_Adelaide-Brisbane,Route_Adelaide-Canberra,...,Departures_Delayed_697.0,Departures_Delayed_1810.0,Departures_Delayed_448.0,Departures_Delayed_8562.0,Departures_Delayed_2199.0,Departures_Delayed_844.0,Departures_Delayed_2889.0,Departures_Delayed_638.0,Departures_Delayed_633.0,Departures_Delayed_11933.0
count,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,...,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0,80615.0
mean,41138.184755,354.073783,348.574769,291.119963,286.801848,61.771656,2012.176841,6.418284,0.010271,0.007604,...,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05
std,1605.069509,2293.829113,2258.781735,1891.814034,1861.989924,412.717325,4.401643,3.438358,0.100825,0.08687,...,0.003522,0.003522,0.003522,0.003522,0.003522,0.003522,0.003522,0.003522,0.003522,0.003522
min,37987.0,0.0,0.0,0.0,0.0,0.0,2004.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39845.0,38.0,37.0,31.0,31.0,6.0,2009.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41306.0,93.0,92.0,76.0,76.0,14.0,2013.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,42522.0,186.0,183.0,152.0,149.0,33.0,2016.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,43647.0,50949.0,49908.0,43537.0,43291.0,12840.0,2019.0,12.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [59]:
X = dummies_df.drop(['Cancellations', 'Departures_On_Time', 'Arrivals_On_Time',
            'Departures_Delayed', 'Arrivals_Delayed'], axis=1)
y = dummies_df[['Cancellations', 'Departures_On_Time', 'Arrivals_On_Time',
            'Departures_Delayed', 'Arrivals_Delayed']]

In [88]:
X.shape

(80615, 253)

In [89]:
y.shape

(80615, 5)

In [60]:
# split data into train, test sets

X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [63]:
# a = imputer.fit_transform(dummies_df)

In [90]:
classifier = MultiOutputRegressor(SVR())
classifier.fit(X,y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').