# Simple Machine Learning Example

In [None]:
from pprint import pprint
from datetime import datetime, timedelta
from DataCleaner import DataCleaner
from read_json import loadData

import pandas as pd
import numpy as np

import keras

from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Load the raw records for one day and put them in a DataFrame.
# NOTE(review): loadData comes from the project-local read_json module; it
# presumably returns a list of dicts (one per record) -- confirm.
pathToBlobs = "data/random_one_day"
dicts = loadData(pathToBlobs)
frame = pd.DataFrame(dicts)
print(frame.shape)  # Shape of the raw data, before cleaning
# Clean the frame with the project-local DataCleaner. The later cells read the
# 'Delay', 'Line' and 'RecordedAtTime' columns from the cleaned result.
cleaner = DataCleaner()
frame = cleaner.clean_data(frame)

In [None]:
batch_size = 8 # Number of rows processed per weight update
num_classes = 4 # Number of delay categories: "Ahead of schedule", "Delayed", "On schedule", "Very Delayed"
epochs = 20 # Number of full passes through the dataset during training

In [None]:
# Shuffle all rows, keep only the three columns the model needs, and discard
# every row in which one of them is missing.
frame_ml = frame.sample(frac=1)[['Delay', 'Line', 'RecordedAtTime']].dropna()
frame_ml['Line'].unique()  # Show which bus lines are present

## Converting continuous delay to four categories

In [None]:
def delay_to_category(delay):
    """Map a numeric delay to one of four category labels.

    The thresholds are checked in this order:
      * below -1   -> "Ahead of schedule"
      * above 200  -> "Very Delayed"
      * above 100  -> "Delayed"
      * otherwise  -> "On schedule"

    NOTE(review): the unit of ``delay`` is presumably seconds -- confirm
    against the data source.
    """
    if delay < -1:
        label = "Ahead of schedule"
    elif delay > 200:
        label = "Very Delayed"
    elif delay > 100:
        label = "Delayed"
    else:
        label = "On schedule"
    return label

# Replace every numeric delay with its category label.
frame_ml['Delay'] = frame_ml['Delay'].apply(delay_to_category)
frame_ml.head(10).T  # Preview the first ten rows, transposed for readability

## Converting datetime

In [None]:
# Turn the recording timestamp into a single numeric feature: the time of day
# expressed as a fraction of a full day, in the range [0, 1).
#
# The timestamps are parsed from frame_ml's own column, so they belong to
# exactly the shuffled, NaN-filtered rows being trained on and do not depend
# on index alignment with the original, unshuffled `frame`.
recorded_at = pd.to_datetime(frame_ml['RecordedAtTime'])

# Only hours and minutes are used, so the feature has one-minute resolution.
seconds_since_midnight = recorded_at.dt.hour * 60 * 60 + recorded_at.dt.minute * 60
frame_ml['Seconds'] = seconds_since_midnight / (24.0 * 60.0 * 60.0)

# The raw timestamp is not a model input. `columns=` is used because passing
# the axis positionally (`drop(labels, 1)`) is rejected by pandas 2.x.
frame_ml = frame_ml.drop(columns=['RecordedAtTime'])

## Converting IDs to one-hot encoding

In [None]:
# Sorted list of every bus line that occurs in the data.
categories = sorted(frame_ml['Line'].unique())
# Store 'Line' as an ordered categorical with a fixed, sorted category order.
frame_ml['Line'] = pd.Categorical(frame_ml['Line'], categories=categories, ordered=True)
# One-hot encoding: the single 'Line' column is replaced by one indicator
# column for every possible line.
x_train = pd.get_dummies(frame_ml, columns=['Line'])

## Separating input and output, and converting Delay

In [None]:
# Sorted list of the delay categories that occur in the data. The position of
# a label in this list is the index of its column in the one-hot output.
categories_y = sorted(frame_ml['Delay'].unique())

# Remove 'Delay' from the input frame and turn it into a one-hot encoded
# target with one indicator column per category.
y_labels = x_train.pop('Delay')
y_labels = pd.get_dummies(
    pd.Categorical(y_labels, categories=categories_y, ordered=True)
)

In [None]:
x_train.head() # Input: the 'Seconds' feature plus one indicator column per bus line

In [None]:
y_labels.head() # Output: one indicator column per delay category

## Always convert from Pandas DataFrame to Numpy Array before using Keras and TensorFlow

In [None]:
# Keras works on plain numeric arrays. A float dtype is requested explicitly
# because pd.get_dummies can emit boolean indicator columns (the default in
# recent pandas versions); combined with the float 'Seconds' column these would
# otherwise yield an object-dtype array, which Keras cannot turn into a tensor.
x_train = np.array(x_train, dtype=float)
y_labels = np.array(y_labels, dtype=float)

## Model architecture

In [None]:
# Fully connected classifier: three ReLU hidden layers of decreasing width,
# followed by a softmax layer with one output per delay category.
model = Sequential([
    Dense(300, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(200, activation='relu'),
    Dense(100, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
model.summary()

## Defining loss function

In [None]:
# Train with plain stochastic gradient descent on a mean-squared-error loss and
# report accuracy while training.
# NOTE(review): for a softmax classifier with one-hot targets, categorical
# cross-entropy is the more common loss -- consider switching.
model.compile(
    optimizer='sgd',
    loss=keras.losses.mean_squared_error,
    metrics=['accuracy'],
)

## Training the model

In [None]:
# Fit the model, printing progress for every epoch.
# NOTE(review): validation_data is the training data itself, so the reported
# validation metrics do not measure generalisation to unseen rows.
model.fit(
    x=x_train,
    y=y_labels,
    validation_data=(x_train, y_labels),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)
# Loss and accuracy of the fitted model on the same data.
score = model.evaluate(x=x_train, y=y_labels, verbose=0)

## Use the trained model to predict the delay

In [None]:
def show_delay_prediction(prediction, categories):
    """Print the confidence for every category, marking the most likely one.

    prediction -- sequence of per-category scores, indexed like ``categories``
    categories -- sequence of category labels

    Each score is printed as a rounded percentage; the category with the
    highest score is prefixed with "* ", all others with "- ".
    """
    best = np.argmax(prediction)
    print("Confidence:")
    for idx, label in enumerate(categories):
        marker = "* " if idx == best else "- "
        percent = round(prediction[idx] * 100)
        print("    " + marker + str(label) + ": " + str(percent) + "%")

In [None]:
# Score the trained model and collect its class probabilities for every row.
# NOTE(review): x_train is the data the model was fitted on, so the values
# labelled "Test" below are measured on the training data.
score = model.evaluate(x=x_train, y=y_labels, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)
print(" ")
predictions = model.predict(x_train, verbose=1)

In [None]:
# Show the first few predictions together with the bus line and time of day
# they belong to.
#
# The bus line is read from frame_ml rather than frame: x_train and predictions
# follow the shuffled, NaN-filtered row order of frame_ml, so a positional
# lookup into the original frame would pair each prediction with an unrelated
# bus line.
for p in range(min(5, len(predictions))):  # Tolerate datasets with fewer than 5 rows
    bus_line = frame_ml['Line'].iloc[p]
    # Column 0 of x_train is 'Seconds', the time of day as a fraction of 24 hours.
    # float() hands timedelta a plain Python number instead of a numpy scalar.
    time_of_day = timedelta(seconds=float(x_train[p][0]) * (24.0 * 60.0 * 60.0))
    predicted_label = str(categories_y[np.argmax(predictions[p])]).lower()
    print("Bus line " + str(bus_line) + " is at " + str(time_of_day) + " predicted to be " + predicted_label)
    show_delay_prediction(predictions[p], categories_y)
    print(" ")