In [None]:
# LIBRARIES

import os 
# Current working directory of the kernel; used below as the base path of the saved ML pipeline.
dirpath = os.getcwd()

import pandas as pd
import plotly.graph_objects as go
import numpy as np
from datetime import datetime
from scipy import signal

In [None]:
# MACHINE LEARNING

from pyspark.ml.pipeline import PipelineModel

# Restore the fitted pipeline that was saved under the working directory.
ml_model_path = dirpath + "/ML_model"
model_ml = PipelineModel.load(ml_model_path)

# Read the held-out test set.
# NOTE(review): `spark` is not created anywhere in this notebook - presumably the
# SparkSession is injected by the kernel; confirm before running on a plain Python kernel.
ml_test_df = spark.read.load("ml_test_df.parquet")

# Score the test set with the restored pipeline and preview the resulting rows.
prediction = model_ml.transform(ml_test_df)
prediction.show()

In [None]:
# Evaluation of the machine learning algorithm

from pyspark.ml.evaluation import RegressionEvaluator

# Score the "prediction" column against the "total_cars_int" label with RMSE.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_cars_int",
    metricName="rmse",
)
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
# Second stage (index 1) of the loaded pipeline.
# NOTE(review): the name suggests this stage is a tree-based regressor - confirm against the training notebook.
treeModel = model_ml.stages[1]
# Printing the stage shows only its one-line summary representation.
print(treeModel)

In [None]:
# Same label/prediction columns as the RMSE evaluator, scored with R2 instead.
lr_evaluator = RegressionEvaluator(
    labelCol="total_cars_int",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(prediction)
print("R Squared (R2) on test data = %g" % r2)

In [None]:
# Collect the Spark predictions into pandas and order the rows chronologically.
prediction_pandas = prediction.toPandas()

# Assemble one datetime per row from its component columns. The Series is NOT
# sorted before assignment: column assignment aligns on the index, so sorting it
# first would have no effect. Ordering is done on the whole frame just below.
prediction_pandas["timestamp"] = pd.to_datetime(
    prediction_pandas[["year", "month", "day", "hour", "minute"]]
)

# Sort by time and renumber the rows so positional order equals time order.
prediction_pandas = (
    prediction_pandas
    .sort_values(by="timestamp", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Raw comparison of observed counts and the model's continuous output,
# drawn against row position (no x values are supplied).
fig = go.Figure(
    data=[
        go.Scatter(
            y=prediction_pandas["total_cars_int"],
            mode="lines",
            name="Test ",
        ),
        go.Scatter(
            y=prediction_pandas["prediction"],
            mode="lines",
            name="Prediction ",
        ),
    ]
)
fig.show()

In [None]:
# The raw predictions are continuous, but the target is discrete (cars are integers).
# Post-process them with a little signal processing: low-pass filter, then apply an
# offset and a scale factor, and round to whole cars.

fs = 1000  # Nominal sampling frequency used only to design the filter
fc = 10  # Cut-off frequency of the filter
w = fc / (fs / 2)  # Cut-off normalized by the Nyquist frequency (fs / 2)
b, a = signal.butter(5, w, 'low')  # 5th-order low-pass Butterworth coefficients

# Forward-backward filtering over the rows in their current order; this relies on
# prediction_pandas already being sorted by timestamp by the earlier cell.
output = signal.filtfilt(b, a, prediction_pandas.prediction)

# NOTE(review): offset and size_factor look like empirically chosen constants -
# TODO document how they were derived.
offset = 0.15
size_factor = 1.75

# Vectorized equivalent of rounding every element in a Python loop:
# np.rint rounds half to even, the same rule as the built-in round().
output_processed = np.rint(size_factor * (output - offset)).astype(int)

prediction_pandas["filtered_prediction"] = output
prediction_pandas["filtered_prediction_processed"] = output_processed

# Rebuild the timestamp and re-sort so this cell leaves the frame in chronological
# order on its own. The Series itself is not sorted first: assignment aligns on the
# index, so that would be a no-op.
prediction_pandas["timestamp"] = pd.to_datetime(
    prediction_pandas[["year", "month", "day", "hour", "minute"]]
)
prediction_pandas = (
    prediction_pandas
    .sort_values(by="timestamp", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Observed counts vs. the filtered, rounded ML predictions.
# The x axis is titled "Timestamp", so the timestamp column is passed explicitly as x;
# without it Plotly would plot against the row position instead of the actual time.
fig = go.Figure()

fig.add_trace(go.Scatter(x=prediction_pandas["timestamp"],
                    y=prediction_pandas["total_cars_int"],
                    mode='lines',
                    name='Test'))
fig.add_trace(go.Scatter(x=prediction_pandas["timestamp"],
                    y=prediction_pandas["filtered_prediction_processed"],
                    mode='lines',
                    name='Prediction',
                    line=dict(color='green')))

fig.update_layout(title='Machine learning algorithm evaluation',
                   xaxis_title='Timestamp',
                   yaxis_title='Number of cars')

fig.show()

In [None]:
# Measurements for evaluation of the Machine Learning algorithm

# Element-wise comparison of the true count with the rounded prediction
# (vectorized; replaces a row-by-row .iloc loop).
matches_ml = (
    prediction_pandas["total_cars_int"]
    == prediction_pandas["filtered_prediction_processed"]
)
count_accurate_result_ml = int(matches_ml.sum())

# Fraction of rows whose rounded prediction equals the true count exactly.
total_accuracy_ml = count_accurate_result_ml / len(prediction_pandas)

print("The accuracy in the predictions is: " + str(total_accuracy_ml))

# Correlation between the true counts and the rounded predictions (Series.corr default method).
correlation_ml = prediction_pandas['total_cars_int'].corr(prediction_pandas['filtered_prediction_processed'])

print("The correlation between the test and the predictions is: "+str(correlation_ml))


In [None]:
# DEEP LEARNING ALGORITHM

from keras.models import model_from_json
from numpy import loadtxt

# Load the model architecture from JSON. The context manager closes the file
# even if reading fails.
with open('DL_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
DL_model = model_from_json(loaded_model_json)

# Load the trained weights into the freshly built model.
DL_model.load_weights("DL_model_weights.h5")
print("Loaded model from disk")

# NOTE(review): the loss is binary cross-entropy although the following cells treat
# the output as a car count - confirm this matches the configuration used in training.
DL_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])


In [None]:
# Data for the calculation of the predictions
X_test_sc = loadtxt('X_test_sc.csv', delimiter=',')
X_test = loadtxt('X_test.csv', delimiter=',')
y_test = loadtxt('y_test.csv', delimiter=',')

# Predicting the Test set results
y_pred = DL_model.predict(X_test_sc)

# Flatten the predictions to one value per row. The original per-item int()
# conversion only worked for single-element rows, and converting such an array
# with int() is deprecated in recent NumPy releases.
y_pred_flat = np.asarray(y_pred).ravel()

# Collect true values and raw predictions side by side.
df_results = pd.DataFrame(y_test, columns = ['test'])
df_results["prediction"] = y_pred_flat

# Scale and round the predictions to whole cars (vectorized).
# NOTE(review): scale_factor looks like an empirically chosen constant - TODO document its origin.
scale_factor = 1.6
output_processed = np.round(scale_factor * y_pred_flat).astype(int)

df_results["prediction_processed"] = output_processed

# Rebuild the date/time feature columns from the unscaled test matrix; the two
# season columns are not needed for the timestamp and are dropped.
df_test = pd.DataFrame(X_test, columns = ['season0','season1','minute', 'hour','day','month','year']).astype(int)
df_test = df_test.drop(columns = ["season0","season1"])

# Both frames carry the default positional index, so the join pairs rows one-to-one.
df_test_expanded = df_test.join(df_results)

# Assemble one datetime per row from its component columns, as done for the ML frame.
df_test_expanded["timestamp"] = pd.to_datetime(
    df_test_expanded[["year", "month", "day", "hour", "minute"]]
)

# Order chronologically and renumber the rows.
df_test_expanded = df_test_expanded.sort_values(by='timestamp',ascending=True)
df_test_expanded = df_test_expanded.reset_index(drop=True)


In [None]:
# Raw comparison of the true values and the network's unprocessed output,
# drawn against row position (no x values are supplied).
fig = go.Figure(
    data=[
        go.Scatter(
            y=df_test_expanded["test"],
            mode="lines",
            name="Test",
        ),
        go.Scatter(
            y=df_test_expanded["prediction"],
            mode="lines",
            name="Prediction",
        ),
    ]
)

fig.show()

In [None]:
# True values vs. the scaled, rounded DL predictions.
# The x axis is titled "Timestamp", so the timestamp column is passed explicitly as x;
# without it Plotly would plot against the row position instead of the actual time.
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_test_expanded["timestamp"],
                    y=df_test_expanded["test"],
                    mode='lines',
                    name='Test'))
fig.add_trace(go.Scatter(x=df_test_expanded["timestamp"],
                    y=df_test_expanded["prediction_processed"],
                    mode='lines',
                    name='Prediction',
                    line=dict(color='orange')))

fig.update_layout(title='Deep learning algorithm evaluation',
                   xaxis_title='Timestamp',
                   yaxis_title='Number of cars')


fig.show()

In [None]:
# Measurements for evaluation of the Deep Learning algorithm

# Element-wise comparison of the true value with the rounded prediction
# (vectorized; replaces a row-by-row .iloc loop).
matches_dl = df_test_expanded["test"] == df_test_expanded["prediction_processed"]
count_accurate_result_dl = int(matches_dl.sum())

# Fraction of exact matches. This must use the DL counter: the earlier version
# divided the ML counter (count_accurate_result_ml) by the DL row count, which
# reported a meaningless accuracy.
total_accuracy_dl = count_accurate_result_dl / len(df_test_expanded)

print("The accuracy in the predictions is: " + str(total_accuracy_dl))

# Correlation between the true values and the rounded predictions (Series.corr default method).
correlation_dl = df_test_expanded['test'].corr(df_test_expanded['prediction_processed'])

print("The correlation between the test and the predictions is: "+str(correlation_dl))