# The model is trained in this Notebook


In [None]:
#Data Proccesing
import folium
import pandas as pd
#Model
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
from sklearn.utils import shuffle
from tensorflow.keras.layers import Input, Dense,add
from tensorflow.keras.models import Model
from datetime import timedelta
from tqdm import tqdm
Project_Path='Local Path'

First I load the dataset 'Legal_illegal', the weather file 'Final_Weather_Data', and 'Distance_Data', the file with the distances from the 19 points of interest. I split the dataset into an 80% train-set and a 20% test-set and save them in separate files, so that I always work with the same train-set and test-set and can tell whether each change helps.

In [None]:
# Auxiliary tables merged into the samples later on:
# Distance_Data is joined on 'Slot_id', Final_Weather_Data on 'Key'.
Distance_Data=pd.read_csv(Project_Path+ '/Data/Distance.csv',sep=',',index_col=0)
Final_Weather_Data=pd.read_csv(Project_Path+ '/Data/Final_Weather_Data.csv',low_memory=False,sep=',',index_col=0)
# Full scan dataset (presumably produced by the previous notebook -- confirm).
Legal_illegal=pd.read_csv(Project_Path+ '/Data/Scan_Data_Reg_2.3.csv',sep=',',index_col=0)

# Fixed 80/20 split (random_state=42), written to disk so every later cell
# works on the same train/test files.
# NOTE: re-running this cell overwrites Train.csv and Test.csv.
train_data,test_data = train_test_split(Legal_illegal,test_size=0.2,random_state=42)
train_data.to_csv(Project_Path+ '/Data/Train.csv')
test_data.to_csv(Project_Path+ '/Data/Test.csv')

# Reload from disk so the in-memory frames match what later cells will read.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
test_data=pd.read_csv(Project_Path+ '/Data/Test.csv',sep=',',index_col=0)

The cells below are functions used throughout the file during training

With this function I normalize the values to an interval of 0.1-0.9

In [None]:
def Normalization(Targets):
    """Linearly rescale targets from the [0, 1] range onto [0.1, 0.9].

    Works for scalars and for anything supporting element-wise arithmetic
    (NumPy arrays, pandas Series). The inverse is ``Inverse_Normalization``.
    """
    source_low, source_high = 0, 1
    target_low, target_high = 0.1, 0.9
    span = target_high - target_low
    return ((span * (Targets - source_low)) / (source_high - source_low)) + target_low

With this function I do the inverse normalization of the "Normalization" function above

In [None]:
def Inverse_Normalization(Targets):
    """Map values from the [0.1, 0.9] range back onto [0, 1].

    Subtracts the 0.1 offset and divides by the 0.8 width of the normalized
    interval; accepts scalars, arrays, Series and tensors alike.
    """
    shifted = Targets - 0.1
    return shifted / 0.8

With this function I calculate the actual MAE (on the original, de-normalized scale) from normalized predictions and targets

In [None]:
def MAE(y_true, y_pred):
    """Mean absolute error measured on the de-normalized scale.

    Both arguments are passed through ``Inverse_Normalization`` before the
    Keras ``mean_absolute_error`` is applied, so the reported value refers to
    the original target range rather than to the 0.1-0.9 interval.
    """
    return tf.keras.metrics.mean_absolute_error(
        Inverse_Normalization(y_true), Inverse_Normalization(y_pred)
    )

With this function I calculate the true MSE (on the original, de-normalized scale) from normalized predictions and targets

In [None]:
def MSE(y_true, y_pred):
    """Mean squared error measured on the de-normalized scale.

    Both arguments are passed through ``Inverse_Normalization`` before the
    Keras ``mean_squared_error`` is applied. Used as the training loss in
    ``Residual_NN``.
    """
    return tf.keras.metrics.mean_squared_error(
        Inverse_Normalization(y_true), Inverse_Normalization(y_pred)
    )

In [None]:
def MAPE(y_true, y_pred):
    """Mean absolute percentage error measured on the de-normalized scale.

    Both arguments are passed through ``Inverse_Normalization`` before the
    Keras ``mean_absolute_percentage_error`` is applied.
    """
    return tf.keras.metrics.mean_absolute_percentage_error(
        Inverse_Normalization(y_true), Inverse_Normalization(y_pred)
    )

This function accepts train-set and test-set. It applies standardization to them and returns them

In [None]:
def Scaller(Train,Test):
    """Standardize features with statistics learned from the training data only.

    A fresh ``StandardScaler`` is fitted on ``Train``; the same fitted scaler
    is then applied to ``Test`` so no statistics leak from it.

    Returns:
        Tuple ``(scaled_train, scaled_test)``.
    """
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(Train)
    return train_scaled, feature_scaler.transform(Test)

I declare Initializers for the hidden layer kernels and outputs

In [None]:
# Glorot (Xavier) uniform initializer; used for the kernel of the sigmoid output layer in Residual_NN.
Kernel_Sigmoid_Initializer = tf.keras.initializers.GlorotUniform()
# He uniform initializer; used for the kernels of the ReLU hidden layers in Residual_NN.
Kernel_Relu_Initializer =tf.keras.initializers.HeUniform()

This function builds the residual neural network. It accepts the number of features, which is used as the input size, and an initializer that is used for the bias of the output layer.

In [None]:
def Residual_NN(Input_Shape,Bias_Sigmoid_Initializer):
    """Build and compile the fully connected network with one skip connection.

    Layout: 512 -> 256 -> 128 -> 64 -> 128 ReLU layers, where the output of
    the first 128-unit layer is added to the output of the second 128-unit
    layer, followed by a 32-unit ReLU layer and a single sigmoid output unit.

    Args:
        Input_Shape: Number of input features.
        Bias_Sigmoid_Initializer: Initializer for the bias of the output unit.

    Returns:
        A Keras ``Model`` compiled with the 'adamax' optimizer, ``MSE`` as the
        loss and ``MAE`` / ``MAPE`` as metrics.
    """
    def relu_dense(units, tensor):
        # Hidden layers all share the ReLU activation and He-uniform kernels.
        return Dense(units, activation='relu',
                     kernel_initializer=Kernel_Relu_Initializer)(tensor)

    features_in = Input(shape=(Input_Shape,))
    hidden_512 = relu_dense(512, features_in)
    hidden_256 = relu_dense(256, hidden_512)
    skip_source = relu_dense(128, hidden_256)
    bottleneck = relu_dense(64, skip_source)
    expanded = relu_dense(128, bottleneck)
    # Skip connection: both operands are 128 units wide.
    merged = add([skip_source, expanded])
    hidden_32 = relu_dense(32, merged)
    prediction = Dense(1, activation='sigmoid',
                       kernel_initializer=Kernel_Sigmoid_Initializer,
                       bias_initializer=Bias_Sigmoid_Initializer)(hidden_32)

    network = Model(inputs=features_in, outputs=prediction)
    network.compile(optimizer='adamax', loss=MSE, metrics=[MAE,MAPE])
    return network

# Training process

First I must mention that the dataset I created in the previous notebook was not brought into its final training form there. The reason is that it had to keep some features that are not used in training but are needed later to apply the smoothing data-augmentation technique. For now I will not use this technique and will train the model without it. To do this I need to make some changes to the format of the dataset from the previous notebook so that it is ready for training

.

The following function prepares the dataset so that it is ready for training. First it deletes an unneeded feature. Then it changes the order of some columns and then adds the distances from the points of interest and the weather

In [None]:
def Prepare_Dataset(Dataset):
    """Bring a raw dataset into its final training layout (no smoothing).

    Drops 'Time_Int', moves 'Slot_Timeint' and 'Ilegality_Rate' to column
    positions 8 and 9 under the names 'Real_Time' and 'Real_Rate', then
    joins the weather table on 'Key' and the distance table on 'Slot_id'
    (both join columns are removed afterwards). The input frame itself is
    not modified.
    """
    slot_time = Dataset['Slot_Timeint']
    violation_rate = Dataset['Ilegality_Rate']
    frame = Dataset.drop(columns=['Time_Int', 'Slot_Timeint', 'Ilegality_Rate'])
    frame.insert(8, "Real_Time", slot_time, True)
    frame.insert(9, "Real_Rate", violation_rate, True)

    # Default (inner) merges: rows without a matching key are dropped.
    frame = pd.merge(frame, Final_Weather_Data, on='Key').drop(columns=['Key'])
    frame = pd.merge(frame, Distance_Data, on='Slot_id').drop(columns=['Slot_id'])
    return frame

The function below applies an exponential decay technique to the learning rate after epoch 40

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 40 epochs, then exponential decay.

    Args:
        epoch: Epoch index supplied by ``LearningRateScheduler``.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 40``; afterwards ``lr * exp(-0.25)``
        as a plain Python float.
    """
    if epoch < 40:
        return lr
    # Return a float rather than a tf.Tensor: newer Keras versions only accept
    # float outputs from the schedule function, and floats work on older ones too.
    return float(lr * np.exp(-0.25))

# Keras callback that applies scheduler(epoch, lr) during training; passed to model.fit below.
# NOTE: `callback` is rebound by later cells that redefine `scheduler`.
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

I run a kind of 4-fold cross-validation procedure (four repetitions of a random 80%/20% split of the training-set), using only the training-set, to find the best number of training epochs. At the same time I save the history of the MSE and MAE

In [None]:
# Repeated hold-out validation on the training set only: k iterations, each
# reshuffling the data and splitting it 80/20 into train/validation.
# NOTE(review): the splits are random and may overlap between iterations, so
# this is not a partition into k disjoint folds.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
TrainDF=train_data
k = 4
num_epochs = 60

# One list of per-epoch values is appended per iteration.
# all_mape_histories is declared but never filled (the MAPE lines below are commented out).
all_mape_histories = []
all_mae_histories = []
all_loss_histories = []
all_Tmae_histories = []
all_Tloss_histories = []

for i in tqdm(range(0,k)):
        print('processing fold #', i)
        TrainDF=shuffle(TrainDF) # Shuffles the training set
        Train,Val = train_test_split(TrainDF,test_size=0.2,random_state=42) #Splits it into 80% train / 20% validation

        Val=Prepare_Dataset(Val) #It calls the function to make the final changes to the dataset so that it is ready for training
        Train=Prepare_Dataset(Train) #It calls the function to make the final changes to the dataset so that it is ready for training

        #It is divided into target and features
        Final_Train_targets=Train['Real_Rate'] 
        Final_Train_data=Train.drop(['Real_Rate'], axis=1)

        Final_val_targets=Val['Real_Rate']
        Final_val_data=Val.drop(['Real_Rate'], axis=1)

        #Calls the function to do standardization (scaler is fitted on the training part only)
        Final_Train_data,Final_val_data=Scaller(Final_Train_data,Final_val_data)

        #It calls the function to do normalization (0.1-0.9) on the targets
        Final_Train_targets = Normalization(Final_Train_targets)
        Final_val_targets = Normalization(Final_val_targets)

        #Finds the mean of the training targets and uses it to initialize the output bias when the model is built
        Target=Final_Train_targets
        Bias_Initial_Out=Target.mean()
        Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)


        #Builds the Keras model (a fresh, compiled model for every iteration)
        model = Residual_NN(Input_Shape=Final_Train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)

        history = model.fit(Final_Train_data, Final_Train_targets,
                            validation_data=(Final_val_data, Final_val_targets),
                            epochs=num_epochs,callbacks=callback,batch_size=16, verbose=1)
        #Saves the results
        # History keys follow the metric function names ('MAE') and 'loss' (= MSE);
        # the 'val_' prefix marks values computed on the validation data.

        #mape_history = history.history['val_MAPE']
        mae_history = history.history['val_MAE']
        loss_history = history.history['val_loss']
        Tmae_history = history.history['MAE']
        Tloss_history = history.history['loss']

        #all_mape_histories.append(mape_history)
        all_mae_histories.append(mae_history)
        all_loss_histories.append(loss_history)
        all_Tmae_histories.append(Tmae_history)
        all_Tloss_histories.append(Tloss_history)
        

Here I compute the per-epoch averages of the results over the four iterations

In [None]:
def _mean_per_epoch(histories):
    """Average, for each of the num_epochs epochs, the values recorded by every iteration."""
    return [np.mean([run[epoch] for run in histories]) for epoch in range(num_epochs)]


# Validation curves
average_mae_history = _mean_per_epoch(all_mae_histories)
average_loss_history = _mean_per_epoch(all_loss_histories)
# Training curves
average_Tmae_history = _mean_per_epoch(all_Tmae_histories)
average_Tloss_history = _mean_per_epoch(all_Tloss_histories)

I save them to external text files so I can have them and print out whatever chart I need later

In [None]:
# Persist the averaged curves as text, one Python-style list per file.
# The values are np.float64 (they come from np.mean); under NumPy >= 2 their repr
# is 'np.float64(...)', so they are converted to built-in floats first to keep the
# files plain numeric lists regardless of the NumPy version.
_results_dir = Project_Path+ '/Results/No Smoothing/'
for _file_name, _curve in (
    ('ValMae.txt', average_mae_history),
    ('ValLoss.txt', average_loss_history),
    ('TrainMae.txt', average_Tmae_history),
    ('TrainLoss.txt', average_Tloss_history),
):
    with open(_results_dir + _file_name, "w") as file:
        file.write(str([float(value) for value in _curve]))

I load the list containing the validation MAE to find the appropriate number of training epochs

In [None]:
# Read back the averaged validation-MAE curve written by the save cell.
# NOTE(review): eval() executes whatever the file contains; only use it on files
# written by this notebook. Consider a safer parser if the files can be edited externally.
with open(Project_Path+ '/Results/No Smoothing/ValMae.txt', "r") as file:
    average_mae_history = eval(file.readline())

I am printing the result

In [None]:
def smooth_curve(points, factor=0.5):
    """Return an exponentially smoothed copy of ``points``.

    The first value is kept as is; every later value is
    ``previous_smoothed * factor + point * (1 - factor)``.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            smoothed.append(value)
            continue
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

# Smooth the averaged validation-MAE curve (the [0:] slice keeps every epoch).
smooth_mae_history = smooth_curve(average_mae_history[0:])

# The x axis is 1-based (epoch 1 .. len), unlike the 0-based list indices.
plt.plot(range(1, len(smooth_mae_history) + 1), smooth_mae_history)
plt.title('4-Fold Cross Validation')
plt.xlabel('Epochs')
plt.ylabel('Validation_Mae')
plt.show()

I find the best epoch, that is, the one with the lowest MAE

In [None]:
# Lowest smoothed validation MAE and its position in the list.
# NOTE: list.index() is 0-based, whereas the plot above numbers epochs from 1.
Min_Mae=min(smooth_mae_history)
Best_Epoch=smooth_mae_history.index(Min_Mae)
Best_Epoch

Now knowing the best training time I train the model and check it with the test-set

In [None]:
# Reload the persisted train/test split.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
test_data=pd.read_csv(Project_Path+ '/Data/Test.csv',sep=',',index_col=0)

#Calls the function to make the final changes to the dataset so that it is ready for training
TestDF=Prepare_Dataset(test_data)
TrainDF=Prepare_Dataset(train_data)

#It is divided into target and features
train_targets=TrainDF['Real_Rate']
train_data=TrainDF.drop(['Real_Rate'], axis=1)
test_targets=TestDF['Real_Rate']
test_data=TestDF.drop(['Real_Rate'], axis=1)

#Calls the function to normalize the targets to the 0.1-0.9 interval
test_targets = Normalization(test_targets)
train_targets = Normalization(train_targets)

#Finds the mean of the training targets and uses it to initialize the output bias when the model is built
Target=train_targets
Bias_Initial_Out=Target.mean()
Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

#Calls the function to do standardization (scaler fitted on the train features only)
train_data,test_data=Scaller(train_data,test_data)

I train for 44 epochs

In [None]:
# Fresh compiled model trained on the whole training set.
# epochs=44 is hard-coded (presumably chosen from the validation curve above -- confirm);
# `callback` is whichever LearningRateScheduler was defined most recently.
model = Residual_NN(Input_Shape=train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
model.fit(train_data,train_targets,callbacks=callback,
          epochs=44, batch_size=16, verbose=1)

test with test-set

In [None]:
# The three values unpack as loss (MSE) followed by the compiled metrics MAE and MAPE,
# matching the compile() call in Residual_NN.
test_mse_score, test_mae_score, test_mape_score = model.evaluate(test_data, test_targets)


# Smoothing

Here I do the above procedure by applying smoothing to both the train-set and the validation/test-set

.

The following function accepts a dataset and returns a smoothed dataset. Essentially, it creates three data samples from each original sample, i.e. it triples the dataset. I have described the process in detail in the report.

In [None]:
def Smouth(Legal_illegal):
    """Create three samples per scan, one for each of its three nearest time slots.

    Rows are read positionally: column 5 is the time of the check (as a
    fraction of a day) and column 6 is the violation rate. For every row the
    three slot centres closest to the check time are selected and three new
    rows are built from the first 11 original columns:

    * nearest slot: the actual rate, with a time distance of 0 (the check is
      treated as if it happened exactly at the slot centre);
    * second and third nearest slots: ``exp(-distance / 0.14583) * rate``
      with the real distance, where 0.14583 corresponds to 210 minutes on
      the fraction-of-a-day scale.

    Returns:
        A new DataFrame with columns Slot_id, Key, Date_Sin, Covid, Holidays,
        Capacity, Week_Day_Sin, Month_Sin, Time_Distance, Real_Time, Real_Rate.
    """
    day_seconds = timedelta(days=1).total_seconds()
    # Hourly slots starting at 06:00 ... 20:00 (seconds since midnight).
    slot_starts = [21600,25200,28800,32400,36000,39600,43200,46800,50400,54000,57600,61200,64800,68400,72000]
    # Represent every slot by its centre (start + 30 min), e.g. 7:00 -> 7:30,
    # expressed as a fraction of a day.
    slot_centres = [(start + 1800) / day_seconds for start in slot_starts]

    augmented_rows = []
    for record in Legal_illegal.values.tolist():
        rate = record[6]        # actual rate of parking violations
        check_time = record[5]  # real time of the check

        # Distance from the check time to every slot centre; the stable sort
        # keeps the earlier slot first when two distances are equal.
        gaps = [abs(centre - check_time) for centre in slot_centres]
        ranked = sorted(zip(slot_centres, gaps), key=lambda pair: pair[1])
        (slot_a, _), (slot_b, gap_b), (slot_c, gap_c) = ranked[:3]

        shared = record[:11]
        # Nearest slot: real rate, distance recorded as 0.
        augmented_rows.append(shared + [slot_a, rate, 0])
        # Second and third nearest slots: rate attenuated by exp(-distance / 210 min).
        for slot, gap in ((slot_b, gap_b), (slot_c, gap_c)):
            weight = np.exp(-gap / 0.14583)
            augmented_rows.append(shared + [slot, weight * rate, gap])

    columns = ['Slot_id','Key','Date_Sin','Slot_Timeint','Covid','Time_Int','Ilegality_Rate','Holidays','Capacity','Week_Day_Sin','Month_Sin','Real_Time','Real_Rate','Time_Distance']
    frame = pd.DataFrame(augmented_rows, columns=columns)

    # Remove the helper features and move Time_Distance to position 8.
    frame = frame.drop(columns=['Slot_Timeint', 'Time_Int', 'Ilegality_Rate'])
    time_distance = frame['Time_Distance']
    frame = frame.drop(columns=['Time_Distance'])
    frame.insert(8, "Time_Distance", time_distance, True)
    return frame


Above, we used one sample to create two more, giving them the same characteristics apart from the delinquency rate and the time slot. Each sample also carries a 'Key': the key is a datetime, and I use it afterwards to merge in the hourly weather values. Since the two new samples represent a different time, their 'Key' must be changed as well. The following function finds the correct keys.

In [None]:
def Get_Weather(Legal_illegal):
    """Rebuild each sample's weather key from its slot time and join the weather data.

    The hour is derived from 'Real_Time' (a fraction of a day), the date is
    the part before the space in the value at column position 1 (the old
    key). The new key '<date> <HH>:00' replaces 'Key', the weather table is
    merged on it and 'Key' is dropped from the result.
    """
    day_seconds = timedelta(days=1).total_seconds()
    # Fraction of a day -> seconds -> whole hours.
    hours = ((Legal_illegal['Real_Time'] * day_seconds) / 3600).astype(int).values.tolist()

    # Zero-pad single-digit hours (9 -> '09') to match the weather key format.
    padded_hours = [str(hour) if hour >= 10 else '0' + str(hour) for hour in hours]
    hour_part = pd.DataFrame(padded_hours, columns=["Hour"])["Hour"].map(str) + ':00'

    # Keep only the date part of the existing key.
    dates = []
    for record in Legal_illegal.values.tolist():
        day, _clock = record[1].split(' ')
        dates.append(day)

    # Replace the key by the date ...
    frame = Legal_illegal.drop(['Key'], axis=1)
    frame.insert(1, "Key", dates, True)

    # ... and then by '<date> <HH>:00'.
    weather_key = frame['Key'].map(str) + ' ' + hour_part
    frame = frame.drop(['Key'], axis=1)
    frame.insert(1, "Key", weather_key, True)

    # Join the weather values and discard the key.
    merged = pd.merge(frame, Final_Weather_Data, on='Key')
    return merged.drop(['Key'], axis=1)


The function below merges in the file with the distances from the 19 points of interest and deletes the slot id, because we don't use it in training

In [None]:
def Get_Slot_Distances(Legal_illegal):
    """Join the per-slot distance features and drop the slot id.

    'Slot_id' is cast to float and then to int before the merge so that it
    matches the key of ``Distance_Data``. The cast is applied to a copy made
    with ``DataFrame.assign``; the caller's frame is left untouched.

    Args:
        Legal_illegal: DataFrame containing a 'Slot_id' column.

    Returns:
        A new DataFrame with the columns of ``Distance_Data`` merged in
        (inner join on 'Slot_id') and 'Slot_id' removed.
    """
    slot_ids = Legal_illegal['Slot_id'].astype(float).astype(int)
    keyed = Legal_illegal.assign(Slot_id=slot_ids)
    merged = pd.merge(keyed, Distance_Data, on='Slot_id')
    return merged.drop(['Slot_id'], axis=1)

The function below combines the three functions above, which together constitute the smoothing process. Given the dataset I created in the previous notebook as input, it returns the smoothed dataset as output

In [None]:
def Apply_Smoothing(Legal_illegal):
    """Run the full smoothing pipeline on a raw dataset.

    Chains ``Smouth`` (sample tripling), ``Get_Weather`` (weather join) and
    ``Get_Slot_Distances`` (distance join) and returns the final frame.
    """
    return Get_Slot_Distances(Get_Weather(Smouth(Legal_illegal)))

I load the files I use as train/test-set

In [None]:
# Reload the persisted 80/20 split (raw, un-smoothed rows).
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
test_data=pd.read_csv(Project_Path+ '/Data/Test.csv',sep=',',index_col=0)

The function below applies an exponential decay technique to the learning rate after epoch 20

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 20 epochs, then exponential decay.

    Args:
        epoch: Epoch index supplied by ``LearningRateScheduler``.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 20``; afterwards ``lr * exp(-0.25)``
        as a plain Python float.
    """
    if epoch < 20:
        return lr
    # Return a float rather than a tf.Tensor: newer Keras versions only accept
    # float outputs from the schedule function, and floats work on older ones too.
    return float(lr * np.exp(-0.25))

# Rebinds `callback` to the schedule defined just above (decay starts at epoch 20).
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)


I do the same process as I did before, just in each iteration I apply smoothing to both the validation and the training set

In [None]:
# Repeated hold-out validation with smoothing applied to both the training and
# the validation part: k iterations, each reshuffling the training set and
# splitting it 80/20.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
TrainDF=train_data
k = 4
num_val_samples = len(train_data) // k  # not used by the loop below
num_epochs = 60
# One list of per-epoch values is appended per iteration.
all_mae_histories = []
all_loss_histories = []
all_Tmae_histories = []
all_Tloss_histories = []

for i in tqdm(range(0,k)):
    print('processing fold #', i)
    TrainDF=shuffle(TrainDF)
    Train,Val = train_test_split(TrainDF,test_size=0.2,random_state=42)

    Val=Apply_Smoothing(Val) #smoothing
    Train=Apply_Smoothing(Train) #smoothing

    # Split into targets and features
    Final_Train_targets=Train['Real_Rate']
    Final_Train_data=Train.drop(['Real_Rate'], axis=1)

    Final_val_targets=Val['Real_Rate']
    Final_val_data=Val.drop(['Real_Rate'], axis=1)

    # Standardize the features (scaler fitted on the training part only)
    Final_Train_data,Final_val_data=Scaller(Final_Train_data,Final_val_data)

    # Normalize the targets to the 0.1-0.9 interval
    Final_Train_targets = Normalization(Final_Train_targets)
    Final_val_targets = Normalization(Final_val_targets)

    # Output-bias initial value: mean of the normalized training targets.
    # The former `Target.append(Final_val_targets)` call was dropped: its return
    # value was discarded, so it never changed the mean, and Series.append no
    # longer exists in pandas >= 2.0 (it raised AttributeError there).
    Target=Final_Train_targets
    Bias_Initial_Out=Target.mean()
    Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

    # Build the Keras model (already compiled)
    model = Residual_NN(Input_Shape=Final_Train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
    # Train the model (verbose=1 prints a progress bar per epoch)
    history = model.fit(Final_Train_data, Final_Train_targets,
                        validation_data=(Final_val_data, Final_val_targets),
                        epochs=num_epochs,callbacks=callback,batch_size=16, verbose=1)
    # Keep the validation and training curves of this iteration
    mae_history = history.history['val_MAE']
    loss_history = history.history['val_loss']
    Tmae_history = history.history['MAE']
    Tloss_history = history.history['loss']
    all_mae_histories.append(mae_history)
    all_loss_histories.append(loss_history)
    all_Tmae_histories.append(Tmae_history)
    all_Tloss_histories.append(Tloss_history)

Here I compute the per-epoch averages of the results over the four iterations

In [None]:
def _mean_per_epoch(histories):
    """Average, for each of the num_epochs epochs, the values recorded by every iteration."""
    return [np.mean([run[epoch] for run in histories]) for epoch in range(num_epochs)]


# Validation curves
average_mae_history = _mean_per_epoch(all_mae_histories)
average_loss_history = _mean_per_epoch(all_loss_histories)
# Training curves
average_Tmae_history = _mean_per_epoch(all_Tmae_histories)
average_Tloss_history = _mean_per_epoch(all_Tloss_histories)

I save them to external text files so I can have them and print out whatever chart I need later

In [None]:
# Persist the averaged curves as text, one Python-style list per file.
# The values are np.float64 (they come from np.mean); under NumPy >= 2 their repr
# is 'np.float64(...)', so they are converted to built-in floats first to keep the
# files plain numeric lists regardless of the NumPy version.
_results_dir = Project_Path+ '/Results/Full Smoothing/'
for _file_name, _curve in (
    ('ValMae.txt', average_mae_history),
    ('ValLoss.txt', average_loss_history),
    ('TrainMae.txt', average_Tmae_history),
    ('TrainLoss.txt', average_Tloss_history),
):
    with open(_results_dir + _file_name, "w") as file:
        file.write(str([float(value) for value in _curve]))

I load the list containing the validation MAE to find the appropriate number of training epochs

In [None]:
# Read back the averaged validation-MAE curve of the full-smoothing experiment.
# NOTE(review): eval() executes whatever the file contains; only use it on files
# written by this notebook. Consider a safer parser if the files can be edited externally.
with open(Project_Path+ '/Results/Full Smoothing/ValMae.txt', "r") as file:
    average_mae_history = eval(file.readline())

I am printing the result

In [None]:
def smooth_curve(points, factor=0.6):
    """Return an exponentially smoothed copy of ``points``.

    The first value is kept as is; every later value is
    ``previous_smoothed * factor + point * (1 - factor)``.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            smoothed.append(value)
            continue
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

# Smooth the averaged validation-MAE curve (the [0:] slice keeps every epoch).
smooth_mae_history = smooth_curve(average_mae_history[0:])

# The x axis is 1-based (epoch 1 .. len), unlike the 0-based list indices.
plt.plot(range(1, len(smooth_mae_history) + 1), smooth_mae_history,label='Smoothed Train set on Validation set')
plt.title('Full Smoothing 4-Fold Cross Validation')
plt.xlabel('Epochs')
plt.ylabel('Validation Mae')
plt.legend()
plt.show()

I find the best epoch, that is, the one with the lowest MAE

In [None]:
# Lowest smoothed validation MAE and its position in the list.
# NOTE: list.index() is 0-based, whereas the plot above numbers epochs from 1.
Min_Mae=min(smooth_mae_history)
Best_Epoch=smooth_mae_history.index(Min_Mae)
Best_Epoch

Now knowing the best training time I train the model and check it with the test-set

In [None]:
# Reload the persisted split and smooth both parts.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
test_data=pd.read_csv(Project_Path+ '/Data/Test.csv',sep=',',index_col=0)
TestDF=Apply_Smoothing(test_data) #smoothing
TrainDF=Apply_Smoothing(train_data) #smoothing

# Split into targets and features
train_targets=TrainDF['Real_Rate']
train_data=TrainDF.drop(['Real_Rate'], axis=1)

test_targets=TestDF['Real_Rate']
test_data=TestDF.drop(['Real_Rate'], axis=1)

# Normalize the targets to the 0.1-0.9 interval
test_targets = Normalization(test_targets)
train_targets = Normalization(train_targets)

# Output-bias initial value: mean of the normalized TRAINING targets.
# Previously `Target` was the test targets and `Target.append(train_targets)`
# discarded its result, so the mean came from the test set alone (and
# Series.append raises AttributeError in pandas >= 2.0). Using the training
# targets keeps test information out of the model set-up.
Target=train_targets
Bias_Initial_Out=Target.mean()
Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

# Standardize the features (scaler fitted on the train features only)
train_data,test_data=Scaller(train_data,test_data)


I train for 24 epochs

In [None]:
# Get a fresh, compiled model.
model = Residual_NN(Input_Shape=train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
# Train it on the entire (smoothed) training set.
# epochs=24 is hard-coded (presumably chosen from the validation curve above -- confirm);
# `callback` is whichever LearningRateScheduler was defined most recently.
model.fit(train_data,train_targets,callbacks=callback,
          epochs=24, batch_size=16, verbose=1)


I check with the smoothed test-set

In [None]:
# The three values unpack as loss (MSE) followed by the compiled metrics MAE and MAPE,
# matching the compile() call in Residual_NN. The test set here is the smoothed one.
test_mse_score, test_mae_score, test_mape_score = model.evaluate(test_data, test_targets)


# Smoothing on the train-set, evaluation on a non-smoothed validation set

As I mentioned in the paper, I also did experiments with a smoothed train-set but a non-smoothed validation/test-set

.

The following function takes the dataset I created in the previous notebook and converts it into a format in which it can serve as a validation/test-set for a model trained on the smoothed train-set. It performs no data augmentation or smoothing. It is essentially the same as the first "Prepare_Dataset" function that formatted our data; it just adds the 'Time_Distance' attribute used in smoothing.

In [None]:
def Prepare_Test(Test):
    """Format a raw dataset as a non-smoothed validation/test set.

    Produces the same column layout as the smoothed training data: drops
    'Time_Int', adds a 'Time_Distance' column of zeros, moves 'Slot_Timeint'
    and 'Ilegality_Rate' to positions 9 and 10 as 'Real_Time' and
    'Real_Rate', then joins the weather table on 'Key' and the distance table
    on 'Slot_id' (both join columns are removed afterwards).

    The input frame is not modified: the helper column is added to the copy
    returned by ``drop`` instead of to the caller's DataFrame.

    Args:
        Test: Raw dataset with the columns written by the previous notebook.

    Returns:
        A new DataFrame ready to be split into features and 'Real_Rate'.
    """
    slot_time = Test['Slot_Timeint']
    violation_rate = Test['Ilegality_Rate']
    # drop() returns a new frame, so the assignments below never touch the input.
    Test = Test.drop(['Time_Int', 'Slot_Timeint', 'Ilegality_Rate'], axis=1)
    # Real (un-augmented) samples sit exactly on their own slot: distance 0.
    Test['Time_Distance'] = 0
    Test.insert(9, "Real_Time", slot_time, True)
    Test.insert(10, "Real_Rate", violation_rate, True)

    Test = pd.merge(Test, Final_Weather_Data, on='Key')
    Test = Test.drop(['Key'], axis=1)

    Test = pd.merge(Test, Distance_Data, on='Slot_id')
    Test = Test.drop(['Slot_id'], axis=1)
    return Test

The function below applies an exponential decay technique to the learning rate after epoch 20

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 20 epochs, then exponential decay.

    Args:
        epoch: Epoch index supplied by ``LearningRateScheduler``.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 20``; afterwards ``lr * exp(-0.25)``
        as a plain Python float.
    """
    if epoch < 20:
        return lr
    # Return a float rather than a tf.Tensor: newer Keras versions only accept
    # float outputs from the schedule function, and floats work on older ones too.
    return float(lr * np.exp(-0.25))

# Rebinds `callback` to the schedule defined just above (decay starts at epoch 20).
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

I do the same process as I did previously, simply in each iteration I apply smoothing only to the training set

In [None]:
# Repeated hold-out validation with smoothing applied to the training part only;
# the validation part is merely reformatted by Prepare_Test.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
TrainDF=train_data
k = 4
num_val_samples = len(train_data) // k  # not used by the loop below
num_epochs = 60
# One list of per-epoch values is appended per iteration.
all_mae_histories = []
all_loss_histories = []
all_Tmae_histories = []
all_Tloss_histories = []

for i in tqdm(range(0,k)):
    print('processing fold #', i)
    TrainDF=shuffle(TrainDF)
    Train,Val = train_test_split(TrainDF,test_size=0.2,random_state=42)

    Val=Prepare_Test(Val)          # no smoothing
    Train=Apply_Smoothing(Train)   # smoothing

    # Split into targets and features
    Final_Train_targets=Train['Real_Rate']
    Final_Train_data=Train.drop(['Real_Rate'], axis=1)

    Final_val_targets=Val['Real_Rate']
    Final_val_data=Val.drop(['Real_Rate'], axis=1)

    # Standardize the features (scaler fitted on the training part only)
    Final_Train_data,Final_val_data=Scaller(Final_Train_data,Final_val_data)

    # Normalize the targets to the 0.1-0.9 interval
    Final_Train_targets = Normalization(Final_Train_targets)
    Final_val_targets = Normalization(Final_val_targets)

    # Output-bias initial value: mean of the normalized training targets.
    # The former `Target.append(Final_val_targets)` call was dropped: its return
    # value was discarded, so it never changed the mean, and Series.append no
    # longer exists in pandas >= 2.0 (it raised AttributeError there).
    Target=Final_Train_targets
    Bias_Initial_Out=Target.mean()
    Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

    # Build the Keras model (already compiled)
    model = Residual_NN(Input_Shape=Final_Train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
    # Train the model (verbose=1 prints a progress bar per epoch)
    history = model.fit(Final_Train_data, Final_Train_targets,
                        validation_data=(Final_val_data, Final_val_targets),
                        epochs=num_epochs,callbacks=callback,batch_size=16, verbose=1)
    # Keep the validation and training curves of this iteration
    mae_history = history.history['val_MAE']
    loss_history = history.history['val_loss']
    Tmae_history = history.history['MAE']
    Tloss_history = history.history['loss']
    all_mae_histories.append(mae_history)
    all_loss_histories.append(loss_history)
    all_Tmae_histories.append(Tmae_history)
    all_Tloss_histories.append(Tloss_history)

Here I compute the per-epoch averages of the results over the four iterations

In [None]:
def _mean_per_epoch(histories):
    """Average, for each of the num_epochs epochs, the values recorded by every iteration."""
    return [np.mean([run[epoch] for run in histories]) for epoch in range(num_epochs)]


# Validation curves
average_mae_history = _mean_per_epoch(all_mae_histories)
average_loss_history = _mean_per_epoch(all_loss_histories)
# Training curves
average_Tmae_history = _mean_per_epoch(all_Tmae_histories)
average_Tloss_history = _mean_per_epoch(all_Tloss_histories)

I save them to external text files so I can have them and print out whatever chart I need later

In [None]:
# Persist the averaged curves as text, one Python-style list per file.
# The values are np.float64 (they come from np.mean); under NumPy >= 2 their repr
# is 'np.float64(...)', so they are converted to built-in floats first to keep the
# files plain numeric lists regardless of the NumPy version.
_results_dir = Project_Path+ '/Results/Smooth On Train Real Test/'
for _file_name, _curve in (
    ('ValMae.txt', average_mae_history),
    ('ValLoss.txt', average_loss_history),
    ('TrainMae.txt', average_Tmae_history),
    ('TrainLoss.txt', average_Tloss_history),
):
    with open(_results_dir + _file_name, "w") as file:
        file.write(str([float(value) for value in _curve]))

I load the saved validation curves of the three experiments (the cell below reads the ValLoss files) to compare them and to find the appropriate number of training epochs

In [None]:
# Load one validation curve per experiment:
#   average_mae_history  -> smoothed train set, real validation set
#   average_mae_history2 -> full smoothing
#   average_mae_history3 -> no smoothing
# NOTE(review): the files read here are ValLoss.txt (validation loss, i.e. MSE),
# although the variables and the plot label below say "mae" -- confirm which
# metric is intended.
# NOTE(review): eval() executes whatever the files contain; only use it on files
# written by this notebook.
with open(Project_Path+ '/Results/Smooth On Train Real Test/ValLoss.txt', "r") as file:
    average_mae_history = eval(file.readline())
with open(Project_Path+ '/Results/Full Smoothing/ValLoss.txt', "r") as file:
    average_mae_history2 = eval(file.readline())
with open(Project_Path+ '/Results/No Smoothing/ValLoss.txt', "r") as file:
    average_mae_history3 = eval(file.readline())    

I am printing the result

In [None]:
def smooth_curve(points, factor=0.4):
    """Return an exponentially smoothed copy of ``points``.

    The first value is kept as is; every later value is
    ``previous_smoothed * factor + point * (1 - factor)``.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            smoothed.append(value)
            continue
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

# Mind the numbering: the smoothed names do not follow the loaded names.
#   smooth_mae_history  <- average_mae_history3 (no smoothing)
#   smooth_mae_history2 <- average_mae_history2 (full smoothing)
#   smooth_mae_history3 <- average_mae_history  (smoothed train set, real validation set)
smooth_mae_history = smooth_curve(average_mae_history3[0:])
smooth_mae_history2 = smooth_curve(average_mae_history2[0:])
smooth_mae_history3 = smooth_curve(average_mae_history[0:])
fig = plt.figure()
# The x axis is 1-based (epoch 1 .. len), unlike the 0-based list indices.
plt.plot(range(1, len(smooth_mae_history) + 1), smooth_mae_history,label= 'Train Set on Validation Set' )
plt.plot(range(1, len(smooth_mae_history3) + 1), smooth_mae_history3,label= 'Smoothed Train Set on Validation Set')
plt.plot(range(1, len(smooth_mae_history2) + 1), smooth_mae_history2,label= 'Smoothed Train Set on Smoothed Validation Set')

plt.title('Influence of Smoothing')
plt.xlabel('Epochs')
plt.ylabel('Validation Mae')
plt.legend()
#fig.savefig(Project_Path+ '/Data/Validation_RMSE.pdf')
plt.show()

I find the best epoch, that is, the one with the lowest value on the validation curve

In [None]:
# Best epoch for THIS experiment (smoothed train set, real validation set).
# In the plotting cell that curve is `smooth_mae_history3`; `smooth_mae_history`
# holds the no-smoothing curve there, so using it would pick the epoch of the
# wrong experiment.
# NOTE: list.index() is 0-based, whereas the plot numbers epochs from 1.
Min_Mae=min(smooth_mae_history3)
Best_Epoch=smooth_mae_history3.index(Min_Mae)
Best_Epoch

Now knowing the best training time I train the model and check it with the test-set

In [None]:
# Reload the persisted split; only the training part is smoothed.
train_data=pd.read_csv(Project_Path+ '/Data/Train.csv',sep=',',index_col=0)
test_data=pd.read_csv(Project_Path+ '/Data/Test.csv',sep=',',index_col=0)
TestDF=Prepare_Test(test_data) #No smoothing
TrainDF=Apply_Smoothing(train_data) #Smoothing

# Split into targets and features
train_targets=TrainDF['Real_Rate']
train_data=TrainDF.drop(['Real_Rate'], axis=1)

test_targets=TestDF['Real_Rate']
test_data=TestDF.drop(['Real_Rate'], axis=1)

# Normalize the targets to the 0.1-0.9 interval
test_targets = Normalization(test_targets)
train_targets = Normalization(train_targets)

# Output-bias initial value: mean of the normalized TRAINING targets.
# Previously `Target` was the test targets and `Target.append(train_targets)`
# discarded its result, so the mean came from the test set alone (and
# Series.append raises AttributeError in pandas >= 2.0). Using the training
# targets keeps test information out of the model set-up.
Target=train_targets
Bias_Initial_Out=Target.mean()
Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

# Standardize the features (scaler fitted on the train features only)
train_data,test_data=Scaller(train_data,test_data)



I train for 26 epochs

In [None]:
# Get a fresh, compiled model.
model = Residual_NN(Input_Shape=train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
# Train it on the entire (smoothed) training set.
# epochs=26 is hard-coded (presumably chosen from the validation curve above -- confirm);
# `callback` is whichever LearningRateScheduler was defined most recently.
model.fit(train_data,train_targets,callbacks=callback,
          epochs=26, batch_size=16, verbose=1)

I check with the test-set

In [None]:
# The three values unpack as loss (MSE) followed by the compiled metrics MAE and MAPE,
# matching the compile() call in Residual_NN. The test set here is NOT smoothed.
test_mse_score, test_mae_score, test_mape_score = model.evaluate(test_data, test_targets)


I save the model 

In [None]:
#model.save(Project_Path+ '/DNN_Regressor')

I print the predictions

In [None]:
# Scatter plot of predictions against true values (both on the normalized
# 0.1-0.9 target scale), with the identity line for reference.
predicted_value=model.predict(test_data)
true_value=test_targets

# model.predict returns a 2-D (n, 1) array while the targets are 1-D. Flatten
# both so the extrema below are plain scalars; the built-in max()/min() on the
# 2-D array yield 1-element arrays, which can produce a ragged [p1, p2] list
# that matplotlib cannot convert.
predicted_flat = np.ravel(predicted_value)
true_flat = np.ravel(true_value)

plt.figure(figsize=(7,7))
plt.scatter(true_flat, predicted_flat, c='crimson')
plt.yscale('log')
plt.xscale('log')

# End points of the identity line: overall maximum and minimum of both series.
p1 = float(max(predicted_flat.max(), true_flat.max()))
p2 = float(min(predicted_flat.min(), true_flat.min()))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

# Final Model

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 22 epochs, then exponential decay.

    Args:
        epoch: Epoch index supplied by ``LearningRateScheduler``.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 22``; afterwards ``lr * exp(-0.25)``
        as a plain Python float.
    """
    if epoch < 22:
        return lr
    # Return a float rather than a tf.Tensor: newer Keras versions only accept
    # float outputs from the schedule function, and floats work on older ones too.
    return float(lr * np.exp(-0.25))

# Rebinds `callback` to the schedule defined just above (decay starts at epoch 22).
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Final model: use the complete dataset (no train/test split), smoothed.
Legal_illegal=pd.read_csv(Project_Path+ '/Data/Scan_Data_Reg_2.3.csv',sep=',',index_col=0)
TrainDF=Apply_Smoothing(Legal_illegal) #smoothing

# Split into targets and features
train_targets=TrainDF['Real_Rate']
train_data=TrainDF.drop(['Real_Rate'], axis=1)
# NOTE(review): target normalization is disabled here, unlike in the experiments
# above, while the loss/metrics in Residual_NN still apply Inverse_Normalization
# to both targets and predictions -- confirm this is intentional.
#train_targets = Normalization(train_targets)

# Output-bias initial value: mean of the (un-normalized) targets.
Target=train_targets
Bias_Initial_Out=Target.mean()
Bias_Initializer=tf.keras.initializers.Constant(Bias_Initial_Out)

# Fit the scaler on all features and store it next to the notebook
# (relative path, pickle protocol 2) so the same scaling can be re-applied later.
Standar_Scaller = StandardScaler()
train_data=Standar_Scaller.fit_transform(train_data)
with open('Standar_Scaller.pkl', 'wb') as f:
    pickle.dump(Standar_Scaller, f,  protocol=2)

In [None]:
# Get a fresh, compiled model.
model = Residual_NN(Input_Shape=train_data.shape[1],Bias_Sigmoid_Initializer=Bias_Initializer)
# Train it on the entirety of the (smoothed) data.
# epochs=24 is hard-coded; `callback` is whichever LearningRateScheduler was defined most recently.
model.fit(train_data,train_targets,callbacks=callback,
          epochs=24, batch_size=16, verbose=1)

In [None]:
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))