In [1]:
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importing Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

# tf version check
print(tf.__version__)

2.1.0


In [12]:
# cleaning the data
def clean_data(data, prefix_dummies=False):
    """Select the modelling columns, drop incomplete rows and one-hot encode.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw on-time table. It must contain every column listed in
        ``cols_to_keep``. The input frame is not modified.
    prefix_dummies : bool, default False
        When False (the original behaviour) every indicator column is named
        after the bare category value. ``ORIGIN`` and ``DEST`` are both
        encoded this way, so a value present in both columns produces two
        indicator columns with the same name. Pass True to get unambiguous
        ``<column>_<value>`` names instead.

    Returns
    -------
    pandas.DataFrame
        ``DEP_DEL15`` (as int), ``DISTANCE`` and one indicator column per
        category value of the remaining kept columns.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    # DAY_OF_MONTH, CANCELLED and DIVERTED are currently excluded.
    cols_to_keep = [
        "DAY_OF_WEEK",
        "OP_CARRIER",
        "ORIGIN",
        "DEST",
        "DEP_DEL15",
        "DEP_TIME_BLK",
        "DISTANCE",
    ]
    day_names = {
        1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday',
        5: 'Friday', 6: 'Saturday', 7: 'Sunday',
    }

    # Keep only the needed columns and the rows without missing values.
    # The explicit .copy() makes the result an independent frame, so the
    # column assignments below never raise SettingWithCopyWarning.
    data = data[cols_to_keep].dropna().copy()

    # converting from floats to ints
    data["DEP_DEL15"] = data["DEP_DEL15"].astype(int)

    # Day numbers become names so get_dummies treats the column as categorical.
    data["DAY_OF_WEEK"] = data["DAY_OF_WEEK"].map(day_names)

    if prefix_dummies:
        # pandas default naming: "<column>_<value>"
        return pd.get_dummies(data)
    return pd.get_dummies(data, prefix='', prefix_sep='')

# load the raw on-time tables for January 2019 and January 2020
raw_jan19, raw_jan20 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/Jan_2019_ontime.csv", "../data/Jan_2020_ontime.csv")
)

# apply the same cleaning / encoding to both months
cleaned_jan19, cleaned_jan20 = clean_data(raw_jan19), clean_data(raw_jan20)

cleaned_jan19.head()


Unnamed: 0,DEP_DEL15,DISTANCE,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,9E,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
0,0,300.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,596.0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
2,0,229.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,223.0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
4,0,579.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [13]:
# 80/20 split of the January 2019 data: sample the training rows with a fixed
# seed, then keep every row that was not sampled as the test set
train_dataset = cleaned_jan19.sample(frac=0.8, random_state=0)
test_dataset = cleaned_jan19.drop(index=train_dataset.index)

In [14]:
# summary statistics of the training features (label column excluded),
# one row per feature
train_stats = (
    train_dataset
    .describe()
    .drop(columns="DEP_DEL15")
    .T
)
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DISTANCE,454104.0,802.248307,592.679774,31.0,365.0,641.0,1044.0,4983.0
Friday,454104.0,0.138587,0.345516,0.0,0.0,0.0,0.0,1.0
Monday,454104.0,0.135942,0.342728,0.0,0.0,0.0,0.0,1.0
Saturday,454104.0,0.106141,0.308018,0.0,0.0,0.0,0.0,1.0
Sunday,454104.0,0.125685,0.331494,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
1900-1959,454104.0,0.054452,0.226908,0.0,0.0,0.0,0.0,1.0
2000-2059,454104.0,0.044624,0.206477,0.0,0.0,0.0,0.0,1.0
2100-2159,454104.0,0.030055,0.170738,0.0,0.0,0.0,0.0,1.0
2200-2259,454104.0,0.022297,0.147647,0.0,0.0,0.0,0.0,1.0


In [15]:
# separating the target labels from the features: pop() removes DEP_DEL15
# from each frame in place and returns it as the label series
train_labels, test_labels = (
    split.pop("DEP_DEL15") for split in (train_dataset, test_dataset)
)

In [16]:
# normalizing the data
def norm(x, stats=None):
  """Standardise ``x`` column-wise using per-feature mean and std.

  Parameters
  ----------
  x : pandas.DataFrame
      Feature frame whose columns match the index of ``stats``.
  stats : pandas.DataFrame, optional
      Frame with one row per feature and ``'mean'`` / ``'std'`` columns.
      Defaults to the notebook-level ``train_stats``.

  Returns
  -------
  pandas.DataFrame
      ``(x - mean) / std``. Features whose std is zero or undefined are
      only centred (divided by 1) instead of producing NaN / inf.
  """
  if stats is None:
    stats = train_stats
  std = stats['std']
  # A feature that is constant in the reference data has std == 0; dividing
  # by it would yield NaN (0/0) or inf and poison the training loss.
  safe_std = std.where(std > 0, 1.0)
  return (x - stats['mean']) / safe_std
# both splits are scaled with the statistics of the training set
normed_train_data, normed_test_data = (
  norm(split) for split in (train_dataset, test_dataset)
)

In [17]:
# building the model
def build_model(n_features=None, hidden_units=64, learning_rate=0.001):
  """Build and compile a two-hidden-layer dense network with a single output.

  Parameters
  ----------
  n_features : int, optional
      Width of the input layer. Defaults to the number of columns of the
      notebook-level ``train_dataset``; that default is only correct once
      the label column has been removed from it.
  hidden_units : int, default 64
      Number of units in each of the two ReLU hidden layers.
  learning_rate : float, default 0.001
      Learning rate passed to the RMSprop optimizer.

  Returns
  -------
  keras.Sequential
      Model compiled with MSE loss and MAE / MSE metrics.
  """
  if n_features is None:
    n_features = len(train_dataset.keys())

  # NOTE(review): DEP_DEL15 looks like a 0/1 flag, while this head is a linear
  # regression trained with MSE, so predictions are not bounded to [0, 1].
  # Consider a sigmoid output with binary cross-entropy - confirm intent.
  model = keras.Sequential([
    layers.Dense(hidden_units, activation='relu', input_shape=[n_features]),
    layers.Dense(hidden_units, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(learning_rate)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

model = build_model()

In [18]:
# print the layer-by-layer overview of the model (output shapes and parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                47168     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 51,393
Trainable params: 51,393
Non-trainable params: 0
_________________________________________________________________


In [19]:
# smoke test: run the still-untrained model on the first ten training rows
example_batch = normed_train_data.iloc[:10]
example_result = model.predict(example_batch)
example_result

array([[-0.27063495],
       [ 0.13461733],
       [ 0.21080488],
       [ 0.08117649],
       [-0.00315377],
       [-0.7707465 ],
       [-0.64812535],
       [ 0.69005334],
       [ 1.5410504 ],
       [-0.4537267 ]], dtype=float32)

In [None]:
# EPOCHS is only an upper bound: training stops as soon as the validation
# loss has not improved for 10 consecutive epochs, so the cell finishes in
# bounded time instead of always running the full 1000 epochs.
EPOCHS = 1000

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(
  normed_train_data, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[early_stop, tfdocs.modeling.EpochDots()])