# Process training data to train an RNN

In [5]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pprint as pprint
import math
import time

In [6]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [7]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import os
import json
import pickle
import sklearn
from sklearn import neighbors

### 1. Get the data from CSV files
First, import it using pandas.

In [11]:
def class_to_action (class_int, mapping_path='./data/class_to_action (28).json'):
    """Look up the action stored for a class index in a JSON mapping file.

    The mapping file is opened and decoded on every call.

    Args:
        class_int: Class identifier to look up.
        mapping_path: Path of the JSON mapping file. Defaults to the file
            this notebook has always used.

    Returns:
        The entry stored for ``class_int`` in the decoded mapping.

    Raises:
        KeyError: the mapping is a JSON object and has no entry for the class.
        IndexError: the mapping is a JSON array and the index is out of range.
        OSError: the mapping file cannot be opened.
    """
    with open(mapping_path) as json_file:
        CtA = json.load(json_file)

    # JSON object keys are always decoded as strings, so an integer class
    # index can never match a dict key directly; retry with its string form.
    if isinstance(CtA, dict) and class_int not in CtA:
        return CtA[str(class_int)]
    return CtA[class_int]

In [12]:
def closest_category(X, pred):
    """Find, for each query point in ``pred``, the nearest point of ``X``.

    A KD-tree is built over ``X`` on every call and queried for the single
    nearest neighbour (``k=1``).

    Args:
        X: Reference points handed to ``sklearn.neighbors.KDTree``.
        pred: Query points handed to ``KDTree.query``.

    Returns:
        The index array returned by ``KDTree.query``; the distances are
        discarded.
    """
    kd_tree = sklearn.neighbors.KDTree(X, leaf_size=2)
    _, nearest_idx = kd_tree.query(pred, k=1)
    return nearest_idx

In [13]:

def get_data(file):
    """Load a path CSV and return per-step state differences and actions.

    The CSV must provide the columns ``id_camino``, ``current_state``,
    ``goal_state`` and ``out_action``. The three state/action columns hold
    the text of a Python list/tuple literal of numbers (6 values for the
    states, 2 for the action).

    Args:
        file: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        A new DataFrame with the columns ``id_camino``, ``D0``..``D5``
        (``goal_state - current_state``, element-wise) and ``A0``, ``A1``
        (the two components of ``out_action``).

    Raises:
        KeyError: a required column is missing.
        ValueError: a cell is not a plain literal, or a row does not have
            the expected number of values.
    """
    # Local import so this cell does not depend on imports made in later cells.
    from ast import literal_eval

    raw_dataset = pd.read_csv(file)
    n_rows = len(raw_dataset)

    def parse_column(name, width):
        # SECURITY: the cells used to be passed to eval(), which executes
        # arbitrary code found in the CSV. literal_eval only accepts plain
        # literals (numbers, lists, tuples, ...), which is all this data needs.
        parsed = [literal_eval(cell) for cell in raw_dataset[name]]
        # One 2-D array per column replaces the former row-by-row chained
        # assignment (dataset[col][index] = ...), which pandas does not
        # guarantee to write back into the frame.
        return np.array(parsed).reshape(n_rows, width)

    current_state = parse_column("current_state", 6)
    goal_state = parse_column("goal_state", 6)
    out_action = parse_column("out_action", 2)

    # Use the final (goal) state: the features are the offset to the goal.
    difference = goal_state - current_state

    dataset_short = pd.DataFrame({"id_camino": raw_dataset["id_camino"]})
    for position, column in enumerate(["D0", "D1", "D2", "D3", "D4", "D5"]):
        dataset_short[column] = difference[:, position]
    for position, column in enumerate(["A0", "A1"]):
        dataset_short[column] = out_action[:, position]

    return dataset_short

Add the last state with the exit action

In [14]:
def add_end_state(dataset_short):
    """Return a copy of the dataset with one exit row added per path.

    For every distinct ``id_camino`` a row with ``D0``..``D5`` equal to 0 and
    ``A0 = A1 = -1`` (the exit action) is appended after the existing rows.
    The input frame is not modified.

    Args:
        dataset_short: DataFrame with an ``id_camino`` column.

    Returns:
        A new DataFrame holding the original rows followed by the exit rows
        (in order of first appearance of each path id), re-indexed from 0 so
        the index stays unique.
    """
    caminos = dataset_short["id_camino"].unique()
    if len(caminos) == 0:
        return dataset_short.copy()

    # Scalars broadcast against the array of path ids: one exit row per path.
    exit_rows = pd.DataFrame({
        "id_camino": caminos,
        "D0": 0, "D1": 0, "D2": 0, "D3": 0, "D4": 0, "D5": 0,
        "A0": -1, "A1": -1,
    })

    # The previous code called DataFrame.append and dropped its result, so no
    # row was ever added (and append no longer exists in pandas >= 2.0).
    return pd.concat([dataset_short, exit_rows], ignore_index=True)

In [80]:
def extend_dataset(dataset_short, N_STEPS):
    """Augment the dataset with the last ``N_STEPS`` rows of each long path.

    Every path (rows sharing an ``id_camino``) that has more than ``N_STEPS``
    rows contributes a copy of its final ``N_STEPS`` rows, stored under a
    fresh path id. The input frame is not modified.

    Args:
        dataset_short: DataFrame with a numeric ``id_camino`` column.
        N_STEPS: Number of trailing rows to copy from each long path.

    Returns:
        A new DataFrame. When at least one tail is added, the original rows
        are followed by the tails and the index is reset to 0..n-1; otherwise
        an unchanged copy of the input is returned.
    """
    if dataset_short.empty:
        return dataset_short.copy()

    # Start new ids above the largest existing one. The last row's id was used
    # before, which collides with existing paths when ids are not ascending.
    next_id = dataset_short["id_camino"].max() + 1

    tails = []
    for camino in dataset_short["id_camino"].unique():
        path_rows = dataset_short[dataset_short["id_camino"] == camino]
        if 0 < N_STEPS < len(path_rows):
            # Explicit copy: the tail is relabelled without touching the input.
            tail = path_rows.iloc[len(path_rows) - N_STEPS:].copy()
            tail["id_camino"] = next_id
            next_id += 1
            tails.append(tail)

    if not tails:
        return dataset_short.copy()

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0 and quadratic in the number of paths).
    return pd.concat([dataset_short] + tails, ignore_index=True)

# Normalize the data

It is good practice to normalize features that use different scales and ranges. Although the model *might* converge without feature normalization, it makes training more difficult, and it makes the resulting model dependent on the choice of units used in the input.

Note: Although we intentionally generate these statistics from only the training dataset, these statistics will also be used to normalize the test dataset. We need to do that to project the test dataset into the same distribution that the model has been trained on.

In [1]:
def norm(x, train_stats):
    """Scale ``x`` by the standard deviation found in ``train_stats``.

    Only a division by ``train_stats['std']`` is applied; the mean is not
    subtracted. A zero standard deviation is not guarded against.

    Args:
        x: Values to scale.
        train_stats: Mapping or DataFrame exposing a ``'std'`` entry.

    Returns:
        ``x`` divided by ``train_stats['std']``.
    """
    std = train_stats['std']
    return x / std




In [13]:
def denorm(x, train_stats):
    """Undo the scaling of ``norm`` by multiplying with the standard deviation.

    Args:
        x: Scaled values.
        train_stats: Mapping or DataFrame exposing a ``'std'`` entry.

    Returns:
        ``x`` multiplied by ``train_stats['std']``.
    """
    std = train_stats['std']
    return x * std


In [14]:
def normalize_data(dataset_short, action_vector, stats_path=r"./data/data_stats2.csv"):
    """Scale the feature columns and re-attach the action and path-id columns.

    All columns except ``A0``, ``A1`` and ``id_camino`` are treated as
    features. Their ``describe()`` statistics are written to ``stats_path``
    and the features are scaled with ``norm`` using those statistics.

    Args:
        dataset_short: DataFrame containing ``A0``, ``A1``, ``id_camino`` and
            the feature columns.
        action_vector: Names of the action columns to carry over unscaled.
            The list is no longer modified by this function.
        stats_path: Destination CSV for the feature statistics. Defaults to
            the path this notebook has always written to.

    Returns:
        A DataFrame with the scaled features joined (on the index) with the
        ``action_vector`` columns and ``id_camino``.

    Raises:
        KeyError: ``A0``, ``A1``, ``id_camino`` or an ``action_vector``
            column is missing from ``dataset_short``.
    """
    features = dataset_short.drop(columns=["A0", "A1", "id_camino"])

    # One row per feature, so that data_stats['std'] is indexed by column name.
    data_stats = features.describe().transpose()
    data_stats.to_csv(stats_path)

    # Work on a private copy: appending to the caller's list made every
    # repeated call add another duplicate "id_camino" column.
    passthrough = list(action_vector)
    if "id_camino" not in passthrough:
        passthrough.append("id_camino")

    normed_data = norm(features, data_stats)
    normed_dataset = normed_data.join(dataset_short[passthrough])

    return normed_dataset

## Pad the data

In [15]:
def padding(normed_dataset,N_STEPS, position):
    """Turn the per-path rows into one fixed-length array of sequences.

    The rows of each path (same ``id_camino``) become one sequence with the
    ``id_camino`` column removed. Sequences are padded with 0.0 or truncated
    to ``N_STEPS + 1`` steps by Keras ``pad_sequences``.

    Args:
        normed_dataset: DataFrame with an ``id_camino`` column.
        N_STEPS: Number of steps; the padded length is ``N_STEPS + 1``.
        position: Passed as both ``padding`` and ``truncating`` to
            ``pad_sequences`` (presumably 'pre' or 'post' -- see Keras docs).

    Returns:
        The float64 array produced by ``pad_sequences``.
    """
    target_len = N_STEPS + 1
    path_ids = normed_dataset["id_camino"]

    # One 2-D value array per path, in order of first appearance of its id.
    sequences = [
        normed_dataset[path_ids == camino].drop(columns="id_camino").values
        for camino in path_ids.unique()
    ]

    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=target_len,
        dtype='float64',
        padding=position,
        truncating=position,
        value=0.0,
    )

In [16]:
from ast import literal_eval
def print_model_df_analysis(csv_name ="./data/model_data_csv.csv", percent_value=25/100.0):
    """Select the rows whose first target component dominates the second.

    The ``target`` column of the CSV is parsed with ``literal_eval``. Two
    helper columns are added: ``abs_x = |target[0]|`` and
    ``abs_z = |target[1]|``.

    Args:
        csv_name: Path of the CSV file to read.
        percent_value: Fraction applied to ``abs_x`` before the comparison.

    Returns:
        The rows for which ``abs_x * percent_value > abs_z``, including the
        two helper columns. Nothing is printed.
    """
    frame = pd.read_csv(csv_name, converters={"target": literal_eval})

    for column, position in (("abs_x", 0), ("abs_z", 1)):
        frame[column] = frame["target"].apply(
            lambda target, position=position: abs(target[position])
        )

    keep = frame["abs_x"] * percent_value > frame["abs_z"]
    return frame.loc[keep]


In [17]:
# Rows of the model CSV for which 25% of |target[0]| still exceeds |target[1]|.
df1 = print_model_df_analysis(csv_name ="./data/model_data_csv.csv", percent_value=25/100.0)
df1

Unnamed: 0,actions,cost,distance,p_id,states,target,time,abs_x,abs_z
0,"[[-0.05236, 0.0, 31.53455812455641], [-0.08727...",44.809705,8.158723,0,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[57.567204330408806, -7.227249063128963, 0, 0,...",0.031012,57.567204,7.227249
2,"[[-0.08727, 0.0, 31.53455812455641], [-0.08727...",62.940057,5.555057,2,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[65.45650918893115, -5.43049229897531, 0, 0, 0...",0.043592,65.456509,5.430492
3,"[[-0.0, 0.0, 31.53455812455641], [-0.01745, 0....",27.616556,10.836310,3,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[53.04666590815438, -11.15627190194964, 0, 0, ...",0.029021,53.046666,11.156272
5,"[[-0.05236, 0.0, 31.53455812455641], [-0.05236...",34.295909,8.809463,5,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[50.755477685109106, -7.997603559062214, 0, 0,...",0.027456,50.755478,7.997604
6,"[[-0.05236, 0.0, 31.53455812455641], [-0.08727...",43.563273,9.600462,6,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[58.66248155311308, -8.49909580156482, 0, 0, 0...",0.033426,58.662482,8.499096
...,...,...,...,...,...,...,...,...,...
223,"[[-0.08727, 0.0, 31.53455812455641], [-0.08727...",74.961941,8.540578,223,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[88.57891500008651, -10.134761803809054, 0, 0,...",0.050752,88.578915,10.134762
227,"[[-0.05236, 0.0, 31.53455812455641], [-0.01745...",44.881066,9.735799,227,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[83.52663655716658, -14.379154946211687, 0, 0,...",0.037157,83.526637,14.379155
232,"[[-0.08727, 0.0, 31.53455812455641], [-0.08727...",50.827783,5.407150,232,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[53.82516962303221, -4.976967885744467, 0, 0, ...",0.041981,53.825170,4.976968
233,"[[-0.0, 0.0, 31.53455812455641], [-0.03491, 0....",23.829686,9.450713,233,"[(0.0, 0.0, 4.257165346815115, 0.0, 0, 0), (4....","[42.298671631893384, -7.8576088962145345, 0, 0...",0.021997,42.298672,7.857609
