In [72]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [23]:
# Reading the dataset
df = pd.read_csv("../data/aggregated.csv")

# Let's have a look
df.head(3)

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,Unnamed: 13
0,2.0,6.0,2017-02-25,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",1000.0,0.0,156.0,937.0,
1,2.0,7.0,2017-02-26,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",739.0,0.0,153.0,937.0,
2,2.0,1.0,2017-02-27,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",1028.0,0.0,158.0,937.0,


### I will drop some of the columns:
- `FL_DATE`: Because I have the month and the day of the week in separate columns.
- `ORIGIN_CITY_NAME`: Because the airport code will be enough.
- `DEST_CITY_NAME`: Because the airport code will be enough.
- `Unnamed: 13`: Because it is null for every row.

In [24]:
# Getting rid of the necessary columns
df.drop(['FL_DATE',
        'ORIGIN_CITY_NAME',
        'DEST_CITY_NAME',
        'Unnamed: 13'], axis=1, inplace=True)

# We are left with this
df.head(3)

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,B6,28.0,MCO,EWR,1000.0,0.0,156.0,937.0
1,2.0,7.0,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
2,2.0,1.0,B6,28.0,MCO,EWR,1028.0,0.0,158.0,937.0


In [25]:
# Let's have a look at our null values
df.isna().sum()[df.isna().sum()!=0]

ARR_DEL15           71020
CRS_ELAPSED_TIME       10
dtype: int64

### I will drop rows where `ARR_DEL15` is null since that is what we are predicting.
### I will drop rows where `CRS_ELAPSED_TIME` is null since there are only 10 such rows.

In [28]:
# Getting rid of thos null values
df.dropna(inplace=True)

# Our data still looks mostly the same, but it is cleaned up now
df.head(3)

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,B6,28.0,MCO,EWR,1000.0,0.0,156.0,937.0
1,2.0,7.0,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
2,2.0,1.0,B6,28.0,MCO,EWR,1028.0,0.0,158.0,937.0


### I will use `LabelEncoder` to turn `ORIGIN`,  `DEST`, and `UNIQUE_CARRIER` to numbers.

In [37]:
# Here's a LabelEncoder fit to ORIGIN
le = LabelEncoder().fit(df['ORIGIN'])
# Here I transform the column to those numbers
df['ORIGIN'] = le.transform(df['ORIGIN'])

# Here's a LabelEncoder fit to DEST
le = LabelEncoder().fit(df['DEST'])
# Here I transform the column to those numbers
df['DEST'] = le.transform(df['DEST'])

# Here's a LabelEncoder fit to UNIQUE_CARRIER
le = LabelEncoder().fit(df['UNIQUE_CARRIER'])
# Here I transform the column to those numbers
df['UNIQUE_CARRIER'] = le.transform(df['UNIQUE_CARRIER'])

In [38]:
# Here is the data
df.head(3)

Unnamed: 0,MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,2,28.0,186,98,1000.0,0.0,156.0,937.0
1,2.0,7.0,2,28.0,186,98,739.0,0.0,153.0,937.0
2,2.0,1.0,2,28.0,186,98,1028.0,0.0,158.0,937.0


In [53]:
# We are ready to do a train and test data split
# Let's first separate our input and output variables

# Input variables
X = df.drop(['ARR_DEL15'], axis=1)

# Output variables
y = df['ARR_DEL15']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    stratify=df['ARR_DEL15'], random_state = 42)

In [68]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(32, input_length=9, output_dim=32),
    
    tf.keras.layers.GlobalAveragePooling1D(),
    
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(32, activation = 'relu'),
    
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(df['ARR_DEL15'].nunique(), activation='softmax')
])

In [69]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 9, 32)             1024      
_________________________________________________________________
global_average_pooling1d_2 ( (None, 32)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 66        
Total params: 2,146
Trainable params: 2,146
Non-trainable params: 0
____________________________________________________

In [70]:
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [73]:
%%time

#converting to numpy arrays prior to fitting into the model
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

#No need to run a loop, keras does it for us unlike in case of pytorch.
#Though pytorch offers more flexibility in logic

#change verbose, to display training state differently
history = model.fit(x_train, y_train, epochs=30, validation_data=(X_test, y_test), verbose=1)

Epoch 1/30


InvalidArgumentError:  indices[0,3] = 5518 is not in [0, 32)
	 [[node sequential_5/embedding_2/embedding_lookup (defined at <timed exec>:11) ]] [Op:__inference_train_function_1217]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_5/embedding_2/embedding_lookup:
 sequential_5/embedding_2/embedding_lookup/968 (defined at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/contextlib.py:113)

Function call stack:
train_function
