In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import layers
import os
import re
import matplotlib.pyplot as plt

In [None]:
# Display the installed TensorFlow version as this cell's output.
tf.__version__

## Show files in the project dir

In [None]:
# Folder that holds the generated position CSV files.
projectDir = 'C:/work/projects/position-data-generator'

# Names of all entries found in that folder.
fl = os.listdir(projectDir)
print(fl)

## Show the files - one daily UE trajectory plot per file

In [None]:
# The daily trajectory CSVs are looked up in projectDir.
# fullmatch() anchors the pattern at both ends, so look-alike names such as
# 'day_1_UE3_output.csv.bak' are not collected and later passed to read_csv.
files = [f for f in os.listdir(projectDir) if re.fullmatch(r'day_[0-9]+_UE3_output\.csv', f)]

def date_chars(s):
    """Return the day number embedded in a file name as an int.

    The digits are the characters of ``s`` located after the first "day_"
    and before the first "_UE".
    """
    marker = "day_"
    first = s.find(marker) + len(marker)
    last = s.find("_UE")
    return int(s[first:last])

# Sort the files numerically by day number; a plain string sort would put
# "day_10_..." before "day_2_...".
files = sorted(files, key=date_chars)

# One scatter plot of the recorded (x, y) positions per daily file.
for file in files:
    df = pd.read_csv(os.path.join(projectDir, file))
    x = df.x
    y = df.y

    plt.scatter(x, y)
    # Label every figure so the days can be told apart in the cell output.
    plt.title(file)
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

## create time series

In [None]:
# Collect the daily trajectory CSVs from projectDir.
# fullmatch() anchors the pattern at both ends, so look-alike names such as
# 'day_1_UE3_output.csv.bak' are not collected and later passed to read_csv.
files = [f for f in os.listdir(projectDir) if re.fullmatch(r'day_[0-9]+_UE3_output\.csv', f)]

# Sort files numerically by their day number.
files = sorted(files, key=date_chars)

def fillMinutesFromMidnight(firstHour, firstMinute, location, multiplier):
    """Build the padding for the minutes between midnight and the first sample.

    Args:
      firstHour (int) - hour of the first sample
      firstMinute (int) - minute of the first sample
      location - value repeated for every padded minute
      multiplier - accepted but not used

    Returns:
      pandas Series holding ``location`` once per minute, indexed
      0 .. firstHour * 60 + firstMinute - 1 (empty when that count is 0)
    """
    minutes_before_first = firstHour * 60 + firstMinute
    return pd.Series(location, index=range(minutes_before_first))

def fillMinutesTillMidnight(lastHour, lastMinute, location, multiplier):
    """Build the padding for the minutes after the last sample of the day.

    Args:
      lastHour (int) - hour of the last sample
      lastMinute (int) - minute of the last sample
      location - value repeated for every padded minute
      multiplier - accepted but not used

    Returns:
      pandas Series holding ``location`` once for each of the
      (24 * 60) - (lastHour * 60 + lastMinute) - 1 remaining minutes
    """
    minute_of_day = lastHour * 60 + lastMinute
    # The minute of the last sample itself is not part of the padding.
    minutes_remaining = (24 * 60) - minute_of_day - 1
    return pd.Series(location, index=range(minutes_remaining))

def fillTheGap(perCells,currentHour,nextHour,currentMinute,nextMin,currentLocation):
    """Pad ``perCells`` for the minutes missing between two consecutive samples.

    Args:
      perCells (pandas Series) - values collected so far
      currentHour, currentMinute - time of the sample already in ``perCells``
      nextHour, nextMin - time of the sample that will be appended next
      currentLocation - value repeated for every missing minute

    Returns:
      ``perCells`` itself when the two samples are at most one minute apart
      (or the gap is not positive); otherwise a new Series with
      ``int(gap) - 1`` float64 copies of ``currentLocation`` appended.
    """
    gap = (nextHour * 60 + nextMin) - (currentHour * 60 + currentMinute)

    # Number of whole minutes strictly between the two samples.
    missing = int(gap) - 1 if gap > 1 else 0
    if missing <= 0:
        return perCells

    # Append the whole gap with one concat instead of one concat per minute
    # (each concat copies everything accumulated so far). The repeated index
    # label 0 reproduces what appending single-element Series one by one
    # yields, so the result is unchanged.
    filler = pd.Series(currentLocation, index=[0] * missing, dtype='float64')
    return pd.concat((perCells, filler))
        
    
    

def fillOutCell(cell, hour, minute, multiplier):
    """Turn irregular samples into one value per minute between first and last sample.

    Consecutive samples that share the same (hour, minute) are collapsed to
    the first of them. Whenever two kept samples are more than one minute
    apart, the earlier sample's value is repeated for every missing minute.

    Args:
      cell (pandas Series) - sampled values
      hour (pandas Series) - hour of each sample
      minute (pandas Series) - minute of each sample
      multiplier - accepted but not used

    Returns:
      float64 pandas Series of the per-minute values. Every index label is 0,
      which is what concatenating single-element Series produces. An empty
      input yields an empty float64 Series.
    """
    hours = hour.values
    minutes = minute.values
    cells = cell.values
    length = minute.size

    if length == 0:
        # Nothing was sampled: return an empty result instead of failing on [0].
        return pd.Series(dtype='float64')

    # Values are gathered in a plain list and turned into a Series once at the
    # end; growing a Series with pd.concat per element copies all data each time.
    currentHour = hours[0]
    currentMinute = minutes[0]
    currentLocation = cells[0]
    values = [currentLocation]

    for i in range(1, length):
        nextHour = hours[i]
        nextMin = minutes[i]

        # Skip repeated samples that fall into the minute already recorded.
        if (currentHour != nextHour) or (currentMinute != nextMin):
            gap = ((float(nextHour) * 60 + float(nextMin))
                   - (float(currentHour) * 60 + float(currentMinute)))

            # Hold the previous value for each minute without a sample.
            if gap > 1:
                values.extend([currentLocation] * (int(gap) - 1))

            currentHour = nextHour
            currentMinute = nextMin
            currentLocation = cells[i]
            values.append(currentLocation)

    return pd.Series(values, index=[0] * len(values), dtype='float64')

# Number of daily files that did not expand to exactly one value per minute.
countWrongs = 0

MINS_IN_DAY=1440

# Complete days are collected here and concatenated once after the loop;
# concatenating onto a growing Series inside the loop copies it every time.
validDays = []

for file in files:
    df          =  pd.read_csv(os.path.join(projectDir, file))
    cell        =  df.cell
    hour        =  df.hour
    minute      =  df.minute
    length      =  minute.size

    if length == 0:
        # A file without samples cannot form a day; count it instead of
        # crashing on the [0] / [length-1] lookups below.
        countWrongs += 1
        continue

    firstHour   =  hour.values[0]
    lastHour    =  hour.values[length-1]
    firstMinute =  minute.values[0]
    lastMinute  =  minute.values[length-1]
    firstLocation = cell.values[0]
    lastLocation  = cell.values[length-1]

    multiplier = 1 # not to go in the resolution smaller than 1 minute

    # Pad before the first sample, fill the sampled span minute by minute,
    # then pad after the last sample.
    minutesFromMidnight = fillMinutesFromMidnight(firstHour, firstMinute, firstLocation, multiplier)
    minutesTillMidnight = fillMinutesTillMidnight(lastHour, lastMinute, lastLocation, multiplier)
    cell = fillOutCell(cell, hour, minute,multiplier )#add absent locations in absent times
    oneDaySeries = pd.concat((minutesFromMidnight, cell, minutesTillMidnight))

    # Only days that expand to exactly MINS_IN_DAY values are kept.
    size = oneDaySeries.size
    if(size!=MINS_IN_DAY):
        countWrongs+=1
    else:
        validDays.append(oneDaySeries)

# pd.concat raises on an empty list, so fall back to an empty float64 Series.
series = pd.concat(validDays) if validDays else pd.Series(dtype='float64')

print(countWrongs)
    

In [None]:
# Report the total number of per-minute values that were kept.
total_minutes = series.size
print("series size ", total_minutes)

# True division: DAYS_COLLECTED is a float.
DAYS_COLLECTED = total_minutes/MINS_IN_DAY
print("days collected ", DAYS_COLLECTED)


## plotting function

In [None]:
import matplotlib.pyplot as plt
def plot_series(x, y, format="-", start=0, end=None, 
                title=None, xlabel=None, ylabel=None, legend=None ):
    """
    Draws one or several series against a shared x-axis in a new figure

    Args:
      x (sliceable sequence) - values for the x-axis
      y (sliceable sequence, or tuple of them) - values for the y-axis; a
        tuple is drawn as several curves on the same axes
      format (string) - matplotlib format string for the curves
      start (int) - first position to plot
      end (int) - position at which plotting stops (None = to the end)
      title (string) - title of the plot
      xlabel (string) - label for the x-axis
      ylabel (string) - label for the y-axis
      legend (list of strings) - legend entries; no legend when falsy
    """
    plt.figure(figsize=(18, 6))

    # Treat a single series as a one-element tuple so one loop covers both cases.
    curves = y if type(y) is tuple else (y,)
    for curve in curves:
        plt.plot(x[start:end], curve[start:end], format)

    # Axis labels, optional legend, title and grid, then render.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if legend:
        plt.legend(legend)
    plt.title(title)
    plt.grid(True)
    plt.show()

## split the data set

In [None]:
# One time step per collected minute.
time = np.arange(DAYS_COLLECTED * MINS_IN_DAY , dtype="float32")

# Split positions: training uses the first 20% of the steps, validation the
# span from 20% to 30%. Steps after split_time2 are not assigned in this cell.
split_time1 = int(MINS_IN_DAY*DAYS_COLLECTED*0.2)
split_time2 = int(MINS_IN_DAY*DAYS_COLLECTED*0.3)
print("split1 ", split_time1)
print("split2 ", split_time2)

train_span = slice(None, split_time1)
valid_span = slice(split_time1, split_time2)

# Train set
time_train, x_train = time[train_span], series[train_span]

# Validation set
time_valid, x_valid = time[valid_span], series[valid_span]



## Plot series

### Day 1

In [None]:
# First MINS_IN_DAY steps of the series.
plot_series(
    x=time[:MINS_IN_DAY],
    y=series[:MINS_IN_DAY],
    xlabel='Time',
    ylabel='Value',
)

### Day 2

In [None]:
# Steps MINS_IN_DAY .. 2 * MINS_IN_DAY - 1 of the series.
plot_series(
    x=time[MINS_IN_DAY:MINS_IN_DAY*2],
    y=series[MINS_IN_DAY:MINS_IN_DAY*2],
    xlabel='Time',
    ylabel='Value',
)

### Day 200

In [None]:
# 200th block of MINS_IN_DAY steps; the slices are empty when the series is shorter.
plot_series(
    x=time[MINS_IN_DAY*199:MINS_IN_DAY*200],
    y=series[MINS_IN_DAY*199:MINS_IN_DAY*200],
    xlabel='Time',
    ylabel='Value',
)

### Day 300

In [None]:
# 300th block of MINS_IN_DAY steps; the slices are empty when the series is shorter.
plot_series(
    x=time[MINS_IN_DAY*299:MINS_IN_DAY*300],
    y=series[MINS_IN_DAY*299:MINS_IN_DAY*300],
    xlabel='Time',
    ylabel='Value',
)

In [None]:



# Parameters

# The window length and the shuffle buffer size both equal MINS_IN_DAY.
MINS_TO_WINDOW = MINS_IN_DAY
window_size = MINS_TO_WINDOW
shuffle_buffer_size = MINS_TO_WINDOW

# Number of windows per training batch.
batch_size = 4

In [None]:
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Builds batched (features, label) windows from a series

    Every window holds ``window_size + 1`` consecutive values: the first
    ``window_size`` are the features and the last one is the label.
    Consecutive windows start one step apart; incomplete windows are dropped.

    Args:
      series (array of float) - contains the values of the time series
      window_size (int) - the number of time steps used as features
      batch_size (int) - the number of windows per batch
      shuffle_buffer (int) - accepted but not used: this function does not
        shuffle, so windows are emitted in series order

    Returns:
      dataset (TF Dataset) - batches of (features, label) pairs
    """
    span = window_size + 1  # features plus the single label step

    # Slice the series into fixed-length windows and materialise each window
    # as one tensor.
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(lambda w: w.batch(span))
    )

    # Split every window into its features and its label.
    labelled = windows.map(lambda w: (w[:-1], w[-1]))

    return labelled.batch(batch_size).prefetch(1)

In [None]:
# Windowed training data built from the training slice of the series.
train_set = windowed_dataset(
    series=x_train,
    window_size=window_size,
    batch_size=batch_size,
    shuffle_buffer=shuffle_buffer_size,
)

In [None]:
# Start from a clean Keras state
tf.keras.backend.clear_session()

# Layer stack: causal Conv1D -> two LSTMs -> single-unit Dense -> identity Lambda.
# The Conv1D filter count and both LSTM unit counts are set to window_size.
model_layers = [
    tf.keras.layers.Conv1D(
        filters=window_size,
        kernel_size=3,
        strides=1,
        padding="causal",
        activation="relu",
        input_shape=[window_size, 1],
    ),
    tf.keras.layers.LSTM(window_size, return_sequences=True),
    tf.keras.layers.LSTM(window_size),
    tf.keras.layers.Dense(1),
    tf.keras.layers.Lambda(lambda x: x ),
]
model = tf.keras.models.Sequential(model_layers)

# Show layer shapes and parameter counts
model.summary()

In [None]:
# Keep a copy of the model's current weights (returned by get_weights()).
init_weights = model.get_weights()

## Create custom Loss for the geometrical x,y coordinates difference

In [None]:
class CustomAccuracy(tf.keras.losses.Loss):
    """Keras loss (despite the name) returning an RMSE over a two-component distance.

    Both y_true and y_pred are split into a "y" part (value / 128) and an "x"
    part (value - y * 128); the loss is the root of the mean squared
    Euclidean distance between the (x, y) pairs.

    NOTE(review): the division used here is true division, not floor
    division, so ``y * 128`` reproduces the original value and both "x" parts
    evaluate to zero (up to floating-point rounding). The result is then
    effectively the RMSE of (y_true - y_pred) / 128. Presumably the intent
    was to decode (row, column) from a cell id on a 128-wide grid, which
    would need floor division - confirm before changing, since that alters
    training.
    """
    def __init__(self):
        super().__init__()

    def call(self, y_true, y_pred):
        # Column of 128.0 with one row per element of y_pred.
        divider = tf.fill((tf.size(y_pred), 1), 128.0)

        # Components of the targets.
        ytrue = y_true / divider
        xtrue = y_true - ytrue * divider

        # Components of the predictions.
        ypred = y_pred / divider
        xpred = y_pred - ypred * divider

        # Per-element Euclidean distance between target and prediction.
        xdiff = tf.math.abs(xtrue - xpred)
        ydiff = tf.math.abs(ytrue - ypred)
        absDiff = tf.sqrt(tf.square(xdiff) + tf.square(ydiff))

        # Root of the mean squared distance.
        return tf.math.sqrt(tf.reduce_mean(tf.square(absDiff)))

## Compile and fit the model

In [None]:
# Learning rate callback: returns 1e-4 for every epoch.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
     lambda epoch: 1e-4)

# Initialize the optimizer. Without this assignment the cell raises NameError
# on a fresh kernel, because `optimizer` is not defined anywhere else.
# NOTE(review): earlier experiments in this cell used SGD(momentum=0.9) and
# Adam(learning_rate=0.000008); Adam is assumed here - confirm. The initial
# learning rate matches the value the scheduler returns.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Set the training parameters. A single compile is enough: a preceding
# compile with the Huber loss would be replaced by this one anyway.
# "mae" is tracked because the history plots read history.history['mae'].
model.compile(loss=CustomAccuracy(), optimizer=optimizer, metrics=["mae"])

# Train the model
history = model.fit(train_set, epochs=10, callbacks=[lr_schedule])

   1369/Unknown - 2546s 2s/step - loss: 58.8587

In [None]:
 # Get loss (and mae, when it was tracked) from history log
loss = history.history['loss']
# 'mae' only exists when the model was compiled with that metric; .get()
# avoids a KeyError and lets the loss still be plotted.
mae = history.history.get('mae')

# Get number of epochs
epochs = range(len(loss))

# Curves that are actually available, with matching legend entries
if mae is None:
    curves, labels = (loss,), ['Loss']
else:
    curves, labels = (mae, loss), ['MAE', 'Loss']
plot_title = ' and '.join(labels)

# Plot the full training history
plot_series(
    x=epochs,
    y=curves,
    title=plot_title,
    xlabel='Epochs',
    legend=labels
    )

# Only plot the last 80% of the epochs
zoom_split = int((len(loss) - 1) * 0.2)
epochs_zoom = epochs[zoom_split:]
mae_zoom = mae[zoom_split:] if mae is not None else None
loss_zoom = loss[zoom_split:]

# Plot the zoomed history
plot_series(
    x=epochs_zoom,
    y=tuple(curve[zoom_split:] for curve in curves),
    title=plot_title,
    xlabel='Epochs',
    legend=labels
    )

In [None]:
def model_forecast(model, series, window_size, batch_size):
    """Runs the model over every full window of a series

    Windows are ``window_size`` consecutive values, start one step apart and
    are fed to the model in batches; incomplete windows are dropped.

    Args:
      model (TF Keras Model) - model that accepts data windows
      series (array of float) - contains the values of the time series
      window_size (int) - the number of time steps to include in the window
      batch_size (int) - the number of windows per batch

    Returns:
      forecast - whatever ``model.predict`` returns for the windowed dataset
    """
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(window_size, shift=1, drop_remainder=True)
        # Materialise each window as a single tensor
        .flat_map(lambda w: w.batch(window_size))
        .batch(batch_size)
        .prefetch(1)
    )

    # Predictions for all windows, in series order
    return model.predict(windows)

In [None]:
# Reduce the original series to what is needed to forecast the validation span.
# x_valid is series[split_time1:split_time2]. The window that predicts step t
# covers the window_size steps before t, so the input starts window_size steps
# before split_time1 and stops one step before split_time2. This yields
# exactly one prediction per validation point.
forecast_start = split_time1 - window_size
if forecast_start < 0:
    # A negative start would silently slice from the end of the series.
    raise ValueError("not enough data before split_time1 to fill one window")
forecast_series = series[forecast_start:split_time2 - 1]

# Use helper function to generate predictions
forecast = model_forecast(model, forecast_series, window_size, batch_size)

# Drop single dimensional axes
results = forecast.squeeze()

# Plot the validation values together with the predictions
plot_series(time_valid, (x_valid, results))

In [None]:
## Validation error of the forecast: MSE first, then MAE
mse_valid = tf.keras.metrics.mean_squared_error(x_valid, results).numpy()
print(mse_valid)

mae_valid = tf.keras.metrics.mean_absolute_error(x_valid, results).numpy()
print(mae_valid)