In [None]:
# LIBRARIES

import os 
# Working directory of the notebook process; later cells use it as the base
# path for the persisted VectorAssembler and the saved ML pipeline model.
dirpath = os.getcwd()

from numpy import loadtxt

import pandas as pd

In [None]:
# MACHINE LEARNING MODEL

# Load the prepared dataset (this is the input data, not a fitted model).
# NOTE(review): `spark` is not defined in the visible cells; this relies on the
# kernel/environment providing a SparkSession under that name.
df_location_ml = spark.read.load("df_location_ml.parquet")

from pyspark.ml.feature import VectorAssembler

# Reuse the VectorAssembler that was persisted under the working directory.
vectorAssembler = VectorAssembler.load(dirpath + "/vector_assembler")

from pyspark.ml.regression import DecisionTreeRegressor

# Regression tree that predicts 'total_cars_int' from the assembled 'features' vector.
classifier_ml = DecisionTreeRegressor(labelCol='total_cars_int', featuresCol='features')

# Restrict the frame to the label column and the columns listed below.
df_location_ml_red = df_location_ml.select(['total_cars_int', 'minute', 'hour', 'day', 'month', 'season', 'year'])

# Fixed seed so the 70/30 split (and therefore the fitted model and the saved
# train/test sets) is reproducible across kernel restarts.
SPLIT_SEED = 42
splits = df_location_ml_red.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
ml_train_df = splits[0]
ml_test_df = splits[1]

In [None]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regressor so both are applied as one unit.
pipeline = Pipeline(stages=[vectorAssembler, classifier_ml])

# Fit the whole pipeline on the training split only.
model_ml = pipeline.fit(ml_train_df)

# Persist with overwrite semantics: the default save mode raises if the target
# path already exists, which would make this cell fail on every re-run.
model_ml.write().overwrite().save(dirpath + "/ML_model")

# Store the exact train/test splits next to the model so evaluation can reuse them.
ml_train_df.write.mode("overwrite").format("parquet").save("ml_train_df.parquet")
ml_test_df.write.mode("overwrite").format("parquet").save("ml_test_df.parquet")

In [None]:
# Check the importance of each feature in the model

# The ExtractFeatureImp helper below is taken from:
# Code from https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/

# The pipeline was built as [vectorAssembler, classifier_ml], so the last
# fitted stage is the trained decision-tree regression model.
treeModel = model_ml.stages[-1]
# Raw importance vector; entries are addressed by feature index.
print(treeModel.featureImportances)

def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """Build a table of feature attributes ranked by importance score.

    Parameters
    ----------
    featureImp
        Indexable collection of importance scores; ``featureImp[idx]`` is
        looked up for every attribute's ``idx`` value.
    dataset
        Object whose ``schema[featuresCol].metadata["ml_attr"]["attrs"]`` maps
        attribute-group names to lists of attribute dicts (each dict must
        contain an ``idx`` key).
    featuresCol
        Name of the feature-vector column inside ``dataset.schema``.

    Returns
    -------
    pandas.DataFrame
        One row per attribute, with the attribute fields plus a ``score``
        column, sorted by ``score`` in descending order.
    """
    attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]

    # Flatten every attribute group into a single list of attribute records.
    records = [attr for group_name in attr_groups for attr in attr_groups[group_name]]

    ranking = pd.DataFrame(records)

    def _score_at(position):
        # Look up the importance stored at this feature's vector index.
        return featureImp[position]

    ranking['score'] = ranking['idx'].apply(_score_at)
    return ranking.sort_values('score', ascending=False)

# Read the feature-attribute metadata from the output of the fitted pipeline
# rather than from the frame loaded from disk: the transformed frame is the one
# guaranteed to carry the assembled "features" column whose indices line up
# with treeModel.featureImportances. transform() is lazy, so only the schema
# is inspected here.
ExtractFeatureImp(treeModel.featureImportances, model_ml.transform(ml_train_df), "features")

In [None]:
# DEEP LEARNING MODEL

# Read the comma-separated training inputs and targets back in as numpy arrays.
# NOTE(review): the "_sc" suffix presumably means the features were scaled
# upstream; confirm against the notebook that produced these files.
X_train_sc, y_train = [
    loadtxt(csv_name, delimiter=',')
    for csv_name in ('X_train_sc.csv', 'y_train.csv')
]

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Initializing Neural Network
classifier = Sequential()

# Input layer + first hidden layer. input_dim must equal the number of columns
# in X_train_sc.
# NOTE(review): the previous comment mentioned both 6 and 7; the code uses 7.
# Confirm that X_train_sc really has 7 feature columns.
# Keras 2 argument names are used throughout (units / kernel_initializer /
# epochs); the Keras 1 spellings (output_dim / init / nb_epoch) are rejected
# by current Keras releases.
classifier.add(Dense(units=24, kernel_initializer='he_uniform', activation='relu', input_dim=7))

# Second hidden layer
classifier.add(Dense(units=12, kernel_initializer='he_uniform', activation='relu'))
# Third hidden layer
classifier.add(Dense(units=6, kernel_initializer='he_uniform', activation='tanh'))

# Output layer
# NOTE(review): a sigmoid output with binary_crossentropy assumes y_train lies
# in [0, 1]; confirm this matches how y_train was prepared.
classifier.add(Dense(units=1, kernel_initializer='he_uniform', activation='sigmoid'))

# Compiling Neural Network
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting our model
classifier.fit(X_train_sc, y_train, batch_size=10, epochs=10)

# Serialize the network architecture to JSON; the weights are stored
# separately below.
DL_model_json = classifier.to_json()
with open("DL_model.json", "w") as json_file:
    json_file.write(DL_model_json)
# serialize weights to HDF5
classifier.save_weights("DL_model_weights.h5")