In [1]:
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importing TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Show which TensorFlow version this kernel has loaded
print(tf.__version__)

2.1.0


In [29]:
# cleaning the data
def clean_data(data):
    """Reduce a raw on-time performance frame to numeric model features.

    The input frame is never modified; every step builds a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns listed in ``cols_to_keep``.
        Any other columns are discarded.

    Returns
    -------
    pandas.DataFrame
        * rows with a missing value in any kept column are removed;
        * ``DEP_DEL15``, ``CANCELLED`` and ``DIVERTED`` are cast to ``int``;
        * ``DAY_OF_WEEK`` (1-7) is mapped to its English day name;
        * ``DAY_OF_WEEK``, ``OP_CARRIER``, ``ORIGIN``, ``DEST`` and
          ``DEP_TIME_BLK`` are replaced by 0/1 indicator columns.

        Day, carrier and time-block indicators are named after the bare value
        (``Friday``, ``1400-1459``). Airport indicators are named
        ``ORIGIN_<code>`` / ``DEST_<code>`` so every column name is unique.

    Raises
    ------
    KeyError
        If ``data`` lacks one of the required columns.
    """
    cols_to_keep = [
        "DAY_OF_MONTH",
        "DAY_OF_WEEK",
        "OP_CARRIER",
        "ORIGIN",
        "DEST",
        "DEP_DEL15",
        "DEP_TIME_BLK",
        "CANCELLED",
        "DIVERTED",
        "DISTANCE",
    ]
    # 0/1 flags that end up as floats in the raw frame
    flag_cols = ["DEP_DEL15", "CANCELLED", "DIVERTED"]
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
        7: "Sunday",
    }
    # Prefix per encoded column, in output order. ORIGIN and DEST share the
    # same airport codes, so leaving both un-prefixed would emit two columns
    # with the same name for every airport that is both an origin and a
    # destination.
    dummy_prefixes = {
        "DAY_OF_WEEK": "",
        "OP_CARRIER": "",
        "ORIGIN": "ORIGIN",
        "DEST": "DEST",
        "DEP_TIME_BLK": "",
    }
    dummy_separators = {
        col: "_" if prefix else "" for col, prefix in dummy_prefixes.items()
    }

    # astype() and assign() return new frames, so nothing is written into a
    # slice of the caller's frame (no SettingWithCopyWarning, no mutation).
    cleaned = (
        data[cols_to_keep]
        .dropna()
        .astype({col: int for col in flag_cols})
    )
    # Codes outside 1-7 become NaN and therefore get no indicator set.
    cleaned = cleaned.assign(DAY_OF_WEEK=cleaned["DAY_OF_WEEK"].map(day_names))

    # Pin the indicator dtype: newer pandas releases default to bool, which
    # describe() and other numeric-only operations would skip.
    return pd.get_dummies(
        cleaned,
        columns=list(dummy_prefixes),
        prefix=dummy_prefixes,
        prefix_sep=dummy_separators,
        dtype="uint8",
    )

# Load both January extracts, then run each through the same cleaning step
raw_jan19, raw_jan20 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/Jan_2019_ontime.csv", "../data/Jan_2020_ontime.csv")
)
cleaned_jan19, cleaned_jan20 = map(clean_data, (raw_jan19, raw_jan20))

# Preview the cleaned 2019 frame
cleaned_jan19.head()


Unnamed: 0,DAY_OF_MONTH,DEP_DEL15,CANCELLED,DIVERTED,DISTANCE,Friday,Monday,Saturday,Sunday,Thursday,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
0,1,0,0,0,300.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,596.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,229.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,223.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,579.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [35]:
# 80/20 split of the 2019 data: draw the training rows with a fixed seed,
# then keep every row that was not drawn as the test set
train_dataset = cleaned_jan19.sample(
    frac=0.8,
    random_state=0,
)
test_dataset = cleaned_jan19.drop(index=train_dataset.index)

In [36]:
# Summary statistics of the training features, one row per feature;
# the DEP_DEL15 column is left out of the table
train_stats = train_dataset.describe().drop(columns="DEP_DEL15").transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DAY_OF_MONTH,454104.0,15.788885,8.958844,1.0,8.0,16.0,24.0,31.0
CANCELLED,454104.0,0.000647,0.025436,0.0,0.0,0.0,0.0,1.0
DIVERTED,454104.0,0.002255,0.047433,0.0,0.0,0.0,0.0,1.0
DISTANCE,454104.0,802.248307,592.679774,31.0,365.0,641.0,1044.0,4983.0
Friday,454104.0,0.138587,0.345516,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
1900-1959,454104.0,0.054452,0.226908,0.0,0.0,0.0,0.0,1.0
2000-2059,454104.0,0.044624,0.206477,0.0,0.0,0.0,0.0,1.0
2100-2159,454104.0,0.030055,0.170738,0.0,0.0,0.0,0.0,1.0
2200-2259,454104.0,0.022297,0.147647,0.0,0.0,0.0,0.0,1.0


In [37]:
# Pull the DEP_DEL15 column out of each split to use as labels; pop() removes
# it from the frame in place, so only feature columns remain afterwards
train_labels, test_labels = (
    split.pop("DEP_DEL15") for split in (train_dataset, test_dataset)
)