# Process training Data to train a RNN

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pprint as pprint
import math
import time

In [2]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [3]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import os
import json
import pickle
import sklearn
from sklearn import neighbors

### 1. Get the data from CSV files
First Import it using pandas

In [4]:
def class_to_action (class_int):
    with open('./data/class_to_action (28).json') as json_file:
        CtA = json.load(json_file)
        
    action = CtA[class_int]
    return action

In [5]:
def closest_category(X, pred):
    tree = sklearn.neighbors.KDTree(X, leaf_size=2)
    dist, ind = tree.query(pred, k=1)
    return ind

In [6]:

def get_data (file):
    raw_dataset = pd.read_csv(file)
    dataset = raw_dataset.copy()
    training_size = len(dataset)
    N_CATEGORIES = 28

    dataset["A0"] = np.nan
    dataset["A1"] = np.nan

    for index, row in dataset.iterrows():
        CS_array =np.array(eval(row["current_state"]))
        dataset["current_state"][index] = CS_array
        GS_array =np.array(eval(row["goal_state"]))
        dataset["goal_state"][index] = GS_array
        GS_array =np.array(eval(row["out_action"]))
        dataset["out_action"][index] = GS_array

    # Usar estado final 
    dataset["Difference"] =   dataset["goal_state"]-dataset["current_state"]

    dataset[["C0",'C1',"C2","C3",'C4',"C5"]] = pd.DataFrame(dataset.current_state.values.tolist(), index= dataset.index)
    dataset[["D0",'D1',"D2","D3",'D4',"D5"]] = pd.DataFrame(dataset.Difference.values.tolist(), index= dataset.index)
    dataset[["A0","A1"]] = pd.DataFrame(dataset.out_action.values.tolist(), index= dataset.index)
    
    dataset_short= dataset[["id_camino","D0",'D1',"D2","D3",'D4',"D5","A0","A1"]]

    return dataset_short

Add the last state with the exit action

In [7]:
def extend_dataset(dataset_short):
    extended_dataset = dataset_short.copy()
    exit_state = pd.DataFrame([{"id_camino":0,"D0":0,'D1':0,"D2":0,"D3":0,'D4':0,"D5":0,"A0":-1,"A1":-1}])
    for camino in extended_dataset["id_camino"].unique():
        exit_state["id_camino"] = camino
        extended_dataset.append(exit_state)

    return extended_dataset

In [8]:
dataset_short = get_data("./data/OSPA_training_data 2.csv")

In [9]:
dataset_short

Unnamed: 0,id_camino,D0,D1,D2,D3,D4,D5,A0,A1
0,0,85.591116,-9.250073,-4.257165,0.000000,0.000000,0.000000,-0.05236,0.0
1,0,80.846048,-8.006316,-5.915147,-0.618050,0.228180,0.002423,-0.10472,0.0
2,0,74.575747,-7.229901,-5.983671,-0.806063,-0.238873,-0.012752,-0.00000,0.0
3,0,68.915460,-6.614438,-6.322726,-0.403866,0.287311,0.014920,-0.08727,0.0
4,0,62.080447,-5.388121,-6.883976,-0.744963,-0.154067,-0.013679,-0.03491,0.0
...,...,...,...,...,...,...,...,...,...
2073,239,33.896035,-30.077869,-7.023492,-0.332507,0.532105,0.011985,-0.00000,0.0
2074,239,26.749762,-23.981863,-11.837088,-0.170587,0.772923,0.004712,-0.00000,0.0
2075,239,17.436739,-13.520295,-16.015517,-0.120340,0.875689,0.002187,-0.01745,0.0
2076,239,4.547199,-2.341143,-17.729530,-0.336825,0.518724,-0.011393,-0.08727,0.0


In [10]:
extended_dataset= extend_dataset(dataset_short)

In [11]:
print(extended_dataset.loc[extended_dataset["D3"]==0])

      id_camino         D0         D1        D2   D3   D4   D5       A0   A1
0             0  85.591116  -9.250073 -4.257165  0.0  0.0  0.0 -0.05236  0.0
15            1  98.043500 -37.779358 -4.257165  0.0  0.0  0.0 -0.03491  0.0
25            2  91.272922 -36.852284 -4.257165  0.0  0.0  0.0 -0.00000  0.0
34            3  94.839433  -8.007799 -4.257165  0.0  0.0  0.0 -0.05236  0.0
52            4  69.791783 -31.305606 -4.257165  0.0  0.0  0.0 -0.00000  0.0
...         ...        ...        ...       ...  ...  ...  ...      ...  ...
2042        235  41.074009 -34.144254 -4.257165  0.0  0.0  0.0 -0.00000  0.0
2048        236  40.754941 -11.474966 -4.257165  0.0  0.0  0.0 -0.00000  0.0
2054        237  36.542365 -20.220654 -4.257165  0.0  0.0  0.0 -0.00000  0.0
2059        238  77.290122 -11.950868 -4.257165  0.0  0.0  0.0 -0.03491  0.0
2072        239  38.729629 -32.034013 -4.257165  0.0  0.0  0.0 -0.00000  0.0

[240 rows x 9 columns]


# Normalize the data

It is good practice to normalize features that use different scales and ranges. Although the model *might* converge without feature normalization, it makes training more difficult, and it makes the resulting model dependent on the choice of units used in the input.

Note: Although we intentionally generate these statistics from only the training dataset, these statistics will also be used to normalize the test dataset. We need to do that to project the test dataset into the same distribution that the model has been trained on.

In [12]:
def norm(x, train_stats):
  #return (x - train_stats['mean']) / train_stats['std']
  return (x) / train_stats['std']




In [13]:
def denorm(x, train_stats):
  #return (x - train_stats['mean']) / train_stats['std']
  return (x) * train_stats['std']


In [14]:
def normalize_data(dataset_short,action_vector):
    reduced_dataset = dataset_short.copy()
    reduced_dataset.pop("A0")
    reduced_dataset.pop("A1")
    reduced_dataset.pop("id_camino")
    data_stats = reduced_dataset.describe()
    data_stats = data_stats.transpose()

    data_stats.to_csv(r"./data/data_stats2.csv")

    action_vector.append("id_camino")
    normed_data = norm(reduced_dataset, data_stats)
    normed_dataset = normed_data.join(dataset_short[action_vector])

    return normed_dataset

## Pad the data

In [15]:
def padding(normed_dataset,N_STEPS, position):
    PAD_LEN = N_STEPS+1
    padded_data = []
    for camino in normed_dataset["id_camino"].unique():
        group = normed_dataset[normed_dataset["id_camino"]==camino]
        group.pop("id_camino")
        padded_data.append(group.values)
    
    dataset2 = tf.keras.preprocessing.sequence.pad_sequences(padded_data, maxlen=PAD_LEN, dtype='float64', padding=position, truncating=position, value=0.0)

    return dataset2