In [1]:
import datetime, warnings, scipy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [2]:
# Load the merged flights table; the first CSV column is used as the row index.
merged_flights = pd.read_csv(
    'Data/merged_flights.csv',
    index_col=0,
)
merged_flights.head()

Unnamed: 0,SCHEDULED_DATE,DAY_OF_WEEK,AIRLINE,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,...,SPEED,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,Enplanements,area
0,2015-01-01,4,AS,N407AS,ANC,SEA,00:05:00,23:54:00,21.0,00:15:00,...,423.805,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,2713843,west
1,2015-01-01,4,AS,N309AS,ANC,SEA,00:45:00,00:41:00,17.0,00:58:00,...,425.882,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,2713843,west
2,2015-01-01,4,DL,N3743H,ANC,SEA,00:45:00,00:31:00,25.0,00:56:00,...,413.714,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,2713843,west
3,2015-01-01,4,AS,N413AS,ANC,PDX,00:50:00,00:46:00,11.0,00:57:00,...,430.326,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,2713843,west
4,2015-01-01,4,US,N804AW,ANC,PHX,01:52:00,01:43:00,21.0,02:04:00,...,474.056,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,2713843,west


In [3]:
# Show the column labels of the loaded table.
merged_flights.columns

Index(['SCHEDULED_DATE', 'DAY_OF_WEEK', 'AIRLINE', 'TAIL_NUMBER',
       'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME',
       'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY', 'DEPARTURE_DELAY',
       'SPEED', 'IATA_CODE', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE',
       'LONGITUDE', 'Enplanements', 'area'],
      dtype='object')

In [None]:
#print("Number of airports: {}".format(len(merged_flights['ORIGIN_AIRPORT'].unique())))
#print("List of airports: {}".format(merged_flights['ORIGIN_AIRPORT'].unique()))

In [None]:
#merged_flights['DEPARTURE_DELAY']

#### Drop unnecessary (?) columns, repeated info

In [4]:
# Remove columns that are not used further in this notebook.
# The result is assigned back to `merged_flights`, so without errors='ignore'
# re-running this cell would raise KeyError (the columns are already gone).
# errors='ignore' makes the cell safe to execute more than once.
merged_flights = merged_flights.drop(
    columns=['TAIL_NUMBER', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'AIRPORT', 'COUNTRY', 'IATA_CODE'],
    errors='ignore',
)
merged_flights

Unnamed: 0,SCHEDULED_DATE,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DEPARTURE_DELAY,SPEED,CITY,STATE,LATITUDE,LONGITUDE,Enplanements,area
0,2015-01-01,4,AS,ANC,SEA,21.0,00:15:00,205.0,1448,04:30:00,-22.0,-11.0,423.805,Anchorage,AK,61.17432,-149.99619,2713843,west
1,2015-01-01,4,AS,ANC,SEA,17.0,00:58:00,204.0,1448,05:09:00,-14.0,-4.0,425.882,Anchorage,AK,61.17432,-149.99619,2713843,west
2,2015-01-01,4,DL,ANC,SEA,25.0,00:56:00,210.0,1448,05:15:00,-24.0,-14.0,413.714,Anchorage,AK,61.17432,-149.99619,2713843,west
3,2015-01-01,4,AS,ANC,PDX,11.0,00:57:00,215.0,1542,05:25:00,-18.0,-4.0,430.326,Anchorage,AK,61.17432,-149.99619,2713843,west
4,2015-01-01,4,US,ANC,PHX,21.0,02:04:00,323.0,2552,09:15:00,-10.0,-9.0,474.056,Anchorage,AK,61.17432,-149.99619,2713843,west
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,2015-06-25,4,B6,HYA,JFK,9.0,13:24:00,72.0,196,14:21:00,-12.0,6.0,163.333,Hyannis,MA,41.66934,-70.28036,31027,south
2818549,2015-06-26,5,B6,HYA,JFK,5.0,13:20:00,72.0,196,14:21:00,-18.0,6.0,163.333,Hyannis,MA,41.66934,-70.28036,31027,south
2818550,2015-06-27,6,B6,HYA,JFK,16.0,13:24:00,72.0,196,14:21:00,-7.0,-1.0,163.333,Hyannis,MA,41.66934,-70.28036,31027,south
2818551,2015-06-29,1,B6,HYA,JFK,6.0,13:11:00,72.0,196,14:21:00,-22.0,-4.0,163.333,Hyannis,MA,41.66934,-70.28036,31027,south


### Linear Regression - testing only on numerical features

In [6]:
# Build the modelling table by removing the date/time, coordinate and
# enplanement columns; everything else (including ARRIVAL_DELAY) is kept.
columns_to_exclude = [
    'SCHEDULED_DATE', 'DAY_OF_WEEK', 'WHEELS_OFF', 'SCHEDULED_ARRIVAL',
    'LONGITUDE', 'LATITUDE', 'Enplanements',
]
LR_train = merged_flights.drop(columns=columns_to_exclude)
LR_train

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,TAXI_OUT,SCHEDULED_TIME,DISTANCE,ARRIVAL_DELAY,DEPARTURE_DELAY,SPEED,CITY,STATE,area
0,AS,ANC,SEA,21.0,205.0,1448,-22.0,-11.0,423.805,Anchorage,AK,west
1,AS,ANC,SEA,17.0,204.0,1448,-14.0,-4.0,425.882,Anchorage,AK,west
2,DL,ANC,SEA,25.0,210.0,1448,-24.0,-14.0,413.714,Anchorage,AK,west
3,AS,ANC,PDX,11.0,215.0,1542,-18.0,-4.0,430.326,Anchorage,AK,west
4,US,ANC,PHX,21.0,323.0,2552,-10.0,-9.0,474.056,Anchorage,AK,west
...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,B6,HYA,JFK,9.0,72.0,196,-12.0,6.0,163.333,Hyannis,MA,south
2818549,B6,HYA,JFK,5.0,72.0,196,-18.0,6.0,163.333,Hyannis,MA,south
2818550,B6,HYA,JFK,16.0,72.0,196,-7.0,-1.0,163.333,Hyannis,MA,south
2818551,B6,HYA,JFK,6.0,72.0,196,-22.0,-4.0,163.333,Hyannis,MA,south


In [7]:
# Ordinary least squares using five numeric columns as predictors.
numeric_features = ["TAXI_OUT", 'SCHEDULED_TIME', 'DISTANCE', 'DEPARTURE_DELAY', 'SPEED']
LR_X = LR_train[numeric_features]
LR_Y = LR_train["ARRIVAL_DELAY"]

LR_model = LinearRegression(fit_intercept=True).fit(LR_X, LR_Y)

# Predictions are computed on the same rows the model was fitted on (train set).
LR_yfit = LR_model.predict(LR_X)

print(LR_yfit)


[  3.15103541   2.79455302   4.47660407 ...   1.20680703 -10.45168076
 -17.81469887]


In [8]:
# Training-set metrics for the linear model fitted above.
LR_train_mse = mean_squared_error(LR_Y, LR_yfit)
# r2_score is the coefficient of determination: 1 is a perfect prediction.
LR_train_r2 = r2_score(LR_Y, LR_yfit)

print("Mean squared error - Train: %.2f" % LR_train_mse)
print('Variance score - Train: %.2f' % LR_train_r2)

Mean squared error - Train: 949.13
Variance score - Train: 0.43


#### Polynomial

In [10]:
# Degree-3 polynomial expansion of LR_X followed by ordinary least squares.
PR_model = make_pipeline(
    PolynomialFeatures(3),
    LinearRegression(fit_intercept=True),
)

# Fit and predict on the same rows (train set).
PR_yfit = PR_model.fit(LR_X, LR_Y).predict(LR_X)

print(PR_yfit)
PR_train_mse = mean_squared_error(LR_Y, PR_yfit)
PR_train_r2 = r2_score(LR_Y, PR_yfit)
print("Mean squared error - Train: %.2f" % PR_train_mse)
print('Variance score - Train: %.2f' % PR_train_r2)

[ -2.50822911  -1.9837487   -1.31602452 ...  -2.67817313 -11.91491176
 -24.59792674]
Mean squared error - Train: 368.11
Variance score - Train: 0.78


### Linear Regression - One-Hot encoding

In [11]:
# Target column and the full feature table (every column except the target).
LR_Y = LR_train["ARRIVAL_DELAY"]
LR_X = LR_train.drop(columns=['ARRIVAL_DELAY'])

# Fresh, unfitted estimator for the one-hot encoded experiment.
LR_model = LinearRegression(fit_intercept=True)

In [12]:
# Summarise column dtypes to see which features are non-numeric.
LR_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2818553 entries, 0 to 2818552
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   AIRLINE              object 
 1   ORIGIN_AIRPORT       object 
 2   DESTINATION_AIRPORT  object 
 3   TAXI_OUT             float64
 4   SCHEDULED_TIME       float64
 5   DISTANCE             int64  
 6   DEPARTURE_DELAY      float64
 7   SPEED                float64
 8   CITY                 object 
 9   STATE                object 
 10  area                 object 
dtypes: float64(4), int64(1), object(6)
memory usage: 258.0+ MB


In [13]:
# Isolate the object-dtype columns: these are the one-hot encoding candidates.
object_columns = LR_X.select_dtypes(include=['object'])
df_categories = object_columns.copy()
df_categories.head()

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,CITY,STATE,area
0,AS,ANC,SEA,Anchorage,AK,west
1,AS,ANC,SEA,Anchorage,AK,west
2,DL,ANC,SEA,Anchorage,AK,west
3,AS,ANC,PDX,Anchorage,AK,west
4,US,ANC,PHX,Anchorage,AK,west


In [87]:
#df_categories = pd.get_dummies(df_categories, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
#print(df_categories.head())

We probably need to drop some of the categorical features (CITY, STATE, area?).

If we encode all of those features, we will end up with over a thousand columns.

In [38]:
# One-hot encode the six categorical columns of the full feature table.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CITY', 'STATE', 'area']
dummy_prefixes = ['Airline', 'Origin', 'Dest', 'city', 'state', 'area']

Y = LR_train["ARRIVAL_DELAY"]
X = pd.get_dummies(
    LR_train.drop(columns=['ARRIVAL_DELAY']),
    columns=categorical_columns,
    prefix=dummy_prefixes,
)
X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Airline_AA,Airline_AS,Airline_B6,Airline_DL,Airline_EV,...,state_VT,state_WA,state_WI,state_WV,state_WY,area_islands,area_midwest,area_northeast,area_south,area_west
0,21.0,205.0,1448,-11.0,423.805,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,17.0,204.0,1448,-4.0,425.882,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,25.0,210.0,1448,-14.0,413.714,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,11.0,215.0,1542,-4.0,430.326,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,21.0,323.0,2552,-9.0,474.056,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,9.0,72.0,196,6.0,163.333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2818549,5.0,72.0,196,6.0,163.333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2818550,16.0,72.0,196,-1.0,163.333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2818551,6.0,72.0,196,-4.0,163.333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


### Training model on encoded features

In [16]:
# Work on the leading rows only to keep the encoded experiment small.
# .loc slices by label and includes the end label, so the row labelled 1000 is kept.
leading_rows = LR_train.loc[:1000, :]
LR_X = leading_rows.drop(columns=['ARRIVAL_DELAY'])
LR_Y = leading_rows["ARRIVAL_DELAY"]

In [17]:
# Replace each categorical column of the slice with prefixed indicator columns.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CITY', 'STATE', 'area']
dummy_prefixes = ['Airline', 'Origin', 'Dest', 'city', 'state', 'area']
LR_X = pd.get_dummies(
    LR_X,
    columns=categorical_columns,
    prefix=dummy_prefixes,
)
LR_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Airline_AS,Airline_DL,Airline_UA,Airline_US,Origin_ANC,...,Dest_ORD,Dest_OTZ,Dest_PDX,Dest_PHX,Dest_SCC,Dest_SEA,Dest_SFO,city_Anchorage,state_AK,area_west
0,21.0,205.0,1448,-11.0,423.805,1,0,0,0,1,...,0,0,0,0,0,1,0,1,1,1
1,17.0,204.0,1448,-4.0,425.882,1,0,0,0,1,...,0,0,0,0,0,1,0,1,1,1
2,25.0,210.0,1448,-14.0,413.714,0,1,0,0,1,...,0,0,0,0,0,1,0,1,1,1
3,11.0,215.0,1542,-4.0,430.326,1,0,0,0,1,...,0,0,1,0,0,0,0,1,1,1
4,21.0,323.0,2552,-9.0,474.056,0,0,0,1,1,...,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,21.0,212.0,1448,-49.0,409.811,0,1,0,0,1,...,0,0,0,0,0,1,0,1,1,1
997,18.0,79.0,399,-7.0,303.038,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1
998,12.0,95.0,539,-1.0,340.421,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1
999,18.0,205.0,1448,-51.0,423.805,1,0,0,0,1,...,0,0,0,0,0,1,0,1,1,1


In [18]:
# Fit the linear model on the encoded slice and score it on those same rows.
LR_yfit = LR_model.fit(LR_X, LR_Y).predict(LR_X)

print(LR_yfit)
onehot_train_mse = mean_squared_error(LR_Y, LR_yfit)
onehot_train_r2 = r2_score(LR_Y, LR_yfit)
print("Mean squared error - Train: %.2f" % onehot_train_mse)
print('Variance score - Train: %.2f' % onehot_train_r2)

[ -1.51392868  -3.26502713 -10.40339111 ...  -5.10615488  -7.9794169
  -2.15524914]
Mean squared error - Train: 533.85
Variance score - Train: 0.16


### Polynomial Regression with one-hot encoding

In [19]:
# Degree-2 polynomial feature expansion chained with ordinary least squares.
PR_model = make_pipeline(PolynomialFeatures(2), LinearRegression(fit_intercept=True))

In [20]:
# Leading rows for the polynomial experiment (.loc includes the end label 10000).
poly_rows = LR_train.loc[:10000, :]
PR_Y = poly_rows["ARRIVAL_DELAY"]

# One-hot encode the categorical columns of the feature slice.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CITY', 'STATE', 'area']
dummy_prefixes = ['Airline', 'Origin', 'Dest', 'city', 'state', 'area']
PR_X = pd.get_dummies(
    poly_rows.drop(columns=['ARRIVAL_DELAY']),
    columns=categorical_columns,
    prefix=dummy_prefixes,
)
PR_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Airline_AA,Airline_AS,Airline_B6,Airline_DL,Airline_F9,...,Dest_SMF,Dest_SMX,Dest_STL,Dest_TPA,Dest_TUS,city_Anchorage,city_Los Angeles,state_AK,state_CA,area_west
0,21.0,205.0,1448,-11.0,423.805,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,17.0,204.0,1448,-4.0,425.882,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,25.0,210.0,1448,-14.0,413.714,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1
3,11.0,215.0,1542,-4.0,430.326,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,21.0,323.0,2552,-9.0,474.056,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,18.0,245.0,1744,51.0,427.102,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9997,19.0,77.0,308,13.0,240.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9998,15.0,250.0,1947,4.0,467.280,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1
9999,15.0,60.0,209,38.0,209.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [21]:
# Fit the polynomial pipeline and predict on the rows it was trained on.
PR_yfit = PR_model.fit(PR_X, PR_Y).predict(PR_X)
print(PR_yfit)

[ -0.69181675   0.67148357  -4.12965626 ...   1.61905437  29.41077381
 161.01772457]


In [22]:
# Training-set metrics for the polynomial model on encoded features.
poly_train_mse = mean_squared_error(PR_Y, PR_yfit)
poly_train_r2 = r2_score(PR_Y, PR_yfit)
print("Mean squared error - Train: %.2f" % poly_train_mse)
print('Variance score - Train: %.2f' % poly_train_r2)

Mean squared error - Train: 185.37
Variance score - Train: 0.83


### Regression Tree

In [23]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree limited to two levels of splits.
Tree_model = DecisionTreeRegressor(max_depth=2)

# Leading rows only (.loc includes the end label 10000); to train on the
# whole table, take the features and target from LR_train without slicing.
tree_rows = LR_train.loc[:10000, :]
Tree_Y = tree_rows["ARRIVAL_DELAY"]

categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CITY', 'STATE', 'area']
dummy_prefixes = ['Airline', 'Origin', 'Dest', 'city', 'state', 'area']
Tree_X = pd.get_dummies(
    tree_rows.drop(columns=['ARRIVAL_DELAY']),
    columns=categorical_columns,
    prefix=dummy_prefixes,
)
Tree_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Airline_AA,Airline_AS,Airline_B6,Airline_DL,Airline_F9,...,Dest_SMF,Dest_SMX,Dest_STL,Dest_TPA,Dest_TUS,city_Anchorage,city_Los Angeles,state_AK,state_CA,area_west
0,21.0,205.0,1448,-11.0,423.805,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,17.0,204.0,1448,-4.0,425.882,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,25.0,210.0,1448,-14.0,413.714,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1
3,11.0,215.0,1542,-4.0,430.326,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,21.0,323.0,2552,-9.0,474.056,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,18.0,245.0,1744,51.0,427.102,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9997,19.0,77.0,308,13.0,240.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9998,15.0,250.0,1947,4.0,467.280,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1
9999,15.0,60.0,209,38.0,209.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [24]:
# Fit the tree and predict on the rows it was trained on.
Tree_yfit = Tree_model.fit(Tree_X, Tree_Y).predict(Tree_X)

In [25]:
# Show the tree's predictions for the training rows.
print(Tree_yfit)

[ -5.39444116  -5.39444116  -5.39444116 ...  -5.39444116  26.82688928
 206.43902439]


In [26]:
# Training-set metrics for the regression tree.
tree_train_mse = mean_squared_error(Tree_Y, Tree_yfit)
tree_train_r2 = r2_score(Tree_Y, Tree_yfit)
print("Mean squared error - Train: %.2f" % tree_train_mse)
print('Variance score - Train: %.2f' % tree_train_r2)

Mean squared error - Train: 374.27
Variance score - Train: 0.65


## Neural Networks

In [27]:
import pandas
from keras.models import Sequential
from keras.layers import Dense # fully connected layers
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [28]:
# Leading rows for the neural-network experiment (.loc includes the end label 10000).
nn_rows = LR_train.loc[:10000, :]
NN_Y = nn_rows["ARRIVAL_DELAY"]

# One-hot encode the categorical columns of the feature slice.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CITY', 'STATE', 'area']
dummy_prefixes = ['Airline', 'Origin', 'Dest', 'city', 'state', 'area']
NN_X = pd.get_dummies(
    nn_rows.drop(columns=['ARRIVAL_DELAY']),
    columns=categorical_columns,
    prefix=dummy_prefixes,
)
NN_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Airline_AA,Airline_AS,Airline_B6,Airline_DL,Airline_F9,...,Dest_SMF,Dest_SMX,Dest_STL,Dest_TPA,Dest_TUS,city_Anchorage,city_Los Angeles,state_AK,state_CA,area_west
0,21.0,205.0,1448,-11.0,423.805,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,17.0,204.0,1448,-4.0,425.882,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,25.0,210.0,1448,-14.0,413.714,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1
3,11.0,215.0,1542,-4.0,430.326,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,21.0,323.0,2552,-9.0,474.056,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,18.0,245.0,1744,51.0,427.102,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9997,19.0,77.0,308,13.0,240.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9998,15.0,250.0,1947,4.0,467.280,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1
9999,15.0,60.0,209,38.0,209.000,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [29]:
# define base model
def baseline_model(input_dim=117):
    """Build and compile a one-hidden-layer regression network.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features fed to the first layer. Defaults to 117,
        the value previously hard-coded here; pass the column count of the
        feature matrix (e.g. ``NN_X.shape[1]``) when it differs.

    Returns
    -------
    Sequential
        A compiled model: Dense(13, relu) -> Dense(1), trained with
        mean-squared-error loss and the Adam optimizer.
    """
    # create model
    model = Sequential()

    # The input layer must match the number of input features (input_dim).
    # First argument is the number of neurons in the layer; the activation
    # function is given by the activation argument.
    model.add(Dense(13, input_dim=input_dim, kernel_initializer='normal', activation='relu'))

    # No activation function on the output layer: this is a regression problem,
    # so the numerical value is predicted directly without a transform.
    model.add(Dense(1, kernel_initializer='normal'))

    # Compile model: the loss evaluates a set of weights, and the optimizer
    # searches through different weights for the network.
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model

Adam optimizer - efficient stochastic gradient descent algorithm. This is a popular version of gradient descent because it automatically tunes itself and gives good results in a wide range of problems.

More info:
https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/

It used to be the case that Sigmoid and Tanh activation functions were preferred for all layers. These days, better performance is achieved using the ReLU activation function.

We could also try other activation functions here (?)

#### Fitting Keras Model

Training occurs over epochs and each epoch is split into batches.
* Epoch: One pass through all of the rows in the training dataset.
* Batch: One or more samples considered by the model within an epoch before weights are updated.

These configurations can be chosen experimentally by trial and error.

In [32]:
# Fit the Keras model on the encoded training rows.
# TODO: 15 epochs is only a smoke test; run with many more epochs.
NN_model = baseline_model()
NN_model.fit(NN_X, NN_Y, epochs=15, batch_size=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f811bbd0e20>

### Evaluation of Keras Model
K-fold cross-validation to evaluate the model (the cell below uses `n_splits=3`).

In [91]:
#!pip install scikeras

In [94]:
import scikeras
from scikeras.wrappers import KerasRegressor

# Wrap the Keras builder so scikit-learn's cross-validation utilities can use it.
estimator = KerasRegressor(model=baseline_model, epochs=10, batch_size=5, verbose=0)
kfold = KFold(n_splits=3)

# Without a `scoring` argument cross_val_score uses the estimator's score(),
# which for a regressor is R^2, not MSE. Request it explicitly and label the
# printed value accordingly (1.0 is a perfect fit).
results = cross_val_score(estimator, NN_X, NN_Y, cv=kfold, scoring="r2")
print("Results: %.2f (%.2f) R^2" % (results.mean(), results.std()))

Results: 0.75 (0.03) MSE


In [33]:
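# Note: the cross-validation value printed above is not an MSE. No `scoring` is passed to
# cross_val_score, so it uses the estimator's own score(), which for scikeras' KerasRegressor
# is R^2 (1.0 is a perfect fit); the printed numbers are its mean and std over the folds.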

### Predicting on train set

In [34]:
# Predict arrival delay for the training rows with the fitted network
# (these are regression outputs, not probabilities, so no rounding is applied).
predictions = NN_model.predict(NN_X)
print(predictions)

[[-10.565712 ]
 [ -7.4390817]
 [-13.32297  ]
 ...
 [ -2.1978047]
 [ 32.39311  ]
 [176.2954   ]]


In [35]:
# Training-set metrics for the neural network.
nn_train_mse = mean_squared_error(NN_Y, predictions)
nn_train_r2 = r2_score(NN_Y, predictions)
print("Mean squared error - Train: %.2f" % nn_train_mse)
print('Variance score - Train: %.2f' % nn_train_r2)

Mean squared error - Train: 207.70
Variance score - Train: 0.81
