In [11]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe
# tf.compat.v1.enable_eager_execution()
print(tf.__version__)

import time
import ingestor
import extractor2
import utils
import data
import functools

1.14.0


In [12]:
params = ingestor.get_params()

In [13]:
train_data = data.get_train_dataset(params)
parser = data.tristan_parser
train_parsed = train_data.map(functools.partial(parser, params=params))

In [14]:
# set features and labels
features = {'goal': [], 'goal_asl': [], 'thms': [], 'thms_hard_negatives': []}
labels = {'tac_id': []}

# iterate over dataset to extract data into arrays. remove 'take' part to iterate over the entire dataset
for raw_record in train_parsed.take(2000):
    fx, lx = raw_record[0], raw_record[1]
    features['goal'].append(fx['goal'])
    features['goal_asl'].append(fx['goal_asl'])
    features['thms'].append(fx['thms'])
    features['thms_hard_negatives'].append(fx['thms_hard_negatives'])
    labels['tac_id'].append(lx['tac_id'])

# instantiate extractor object
ex = extractor2.Extractor(params)

# tokenize goals
temp = ex.tokenize(features['goal'], ex.vocab_table)
features['goal_ids'] = temp
#pad all goals to be of length 1000
# goal_list = []
# for j in range(len(temp)):
#         l = len(temp[j])
#         h = tf.pad(temp[j], [[0, 1000-l]], constant_values=0)
#         goal_list.append(h)
# features['goal_ids'] = goal_list

# tokenize hypotheses
length = len(features['goal'])
features['goal_asl_ids'] = []

for i in range(length):
    temp = ex.tokenize(features['goal_asl'][i], ex.vocab_table)
    features['goal_asl_ids'].append(temp)
#     #pad all hypotheses to be of length 1000
#     hypo_list = []
#     for j in range(len(temp)):
#         l = len(temp[j])
#         h = tf.pad(temp[j], [[0, 1000-l]], constant_values=0)
#         hypo_list.append(h)
#     features['goal_asl_ids'].append(hypo_list)

del features['goal']
del features['goal_asl']
del features['thms']
del features['thms_hard_negatives']

# # FEATURES
# # 'goal_ids': an array of LENGTH OF DATASET containing arrays which are the tokenized goals of length 1000
# # below is an example of how to access the entire array as numpy arrays
# print(features['goal_ids'][0].numpy())
# print(len(features['goal_ids'][0].numpy()))
# # 'goal_asl_ids': this below is an array of LENGTH OF DATASET containing lists of tokenized hypotheses 
# # where each hypothesis is of length 1000. below is an example of how to access the first hypothesis 
# # from the list of hypotheses corresponding to the first training example
# print(features['goal_asl_ids'][0][0].numpy())
# print(len(features['goal_asl_ids'][0][0].numpy()))

# # LABELS
# # 'tac_id': array containing tactic ids. below is an example of how to access the first tactic
# print(labels['tac_id'][0].numpy())

In [15]:
# features['goal_ids'] is now an array of size 2000 x 1000
features['goal_ids'] = features['goal_ids'].numpy()
print(features['goal_ids'])
print('Number of training examples:', len(features['goal_ids']))
print('Size of training examples:', len(features['goal_ids'][0]))

# features['goal_asl_ids'] is now an array of size 2000 x ? x 1000
length = len(features['goal_asl_ids'])
for i in range(length):
    features['goal_asl_ids'][i] = [hypothesis.numpy() for hypothesis in features['goal_asl_ids'][i]]  

print(features['goal_asl_ids'])
print('Number of training examples:', len(features['goal_asl_ids']))
print('Number of hypotheses for an example:', len(features['goal_asl_ids'][0]))
print('Size of each hypothesis:', len(features['goal_asl_ids'][0][0]))

# features['tactic_ids'] is now an array of size 2000 x 1
labels['tac_id'] = [i.numpy() for i in labels['tac_id']]
print(labels['tac_id'])
print('Number of training examples:', len(labels['tac_id']))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
# convert goals to numpy arrays
goals = np.array(features['goal_ids'])
print(np.shape(goals))

(2000, 1000)


In [17]:
# convert goal hypotheses to numpy arrays and concatenate
hypotheses = features['goal_asl_ids']
length_hyp = len(hypotheses)

for i in range(length_hyp):
    if (len(hypotheses[i]) != 0):
        # concatenate hypotheses in a given hypothesis list
        hypotheses[i] = np.concatenate(hypotheses[i])
        # remove zeroes in between
        hypotheses[i] = hypotheses[i][hypotheses[i] != 0]
        # truncate to max hypothesis length of 3000 characters, i.e. truncating less than 10% of data
        hypotheses[i] = hypotheses[i][0:3000]
        # pad with zeroes to make length 3000 (to save as csv)
        len_conc = len(hypotheses[i])
        hypotheses[i] = np.pad(hypotheses[i], (0, 3000-len_conc), mode='constant')
    else:
        hypotheses[i] = np.zeros(3000, dtype = 'int32')

    
np.set_printoptions(threshold=np.sys.maxsize)
print(np.shape(hypotheses))


(2000, 3000)


In [18]:
print(hypotheses[0])

[  2  70   6   9   4   4   8   5  12   7   7 454  11   4   8   5  12   7
  27   3   2  70   6   9   4   4   4   4   8   5  12   7   7   7   7  23
  18  11   4   4   8   5  12   7   7 113   6   6   9   4   7   4   7   7
  26   6   6   9   4   4   4   8   5  12   7   7   4   4   4   8   5  12
   7   7   7 225  11   4   4   8   5  12   7   7 113  11   4   4   8   5
  12   7   7  32   6   6   9   4   4   8   5  12   7   4   4   8   5  12
   7   7 225  11   4   8   5  12   7  27   6   6   9   4   4   8   5  12
   7   4   4   8   5  12   7   4   8   5  12   7  87   6   6   9   4   4
   4   8   5  12   7   7   4   4   8   5  12   7   4   8   5  12   7 124
   9   4   4   8   5  12   7   7 164  11   4   8   5  12   7  27   6   9
   4   4   4   8   5  12   7   7   4   8   5  12   7 171  11   4   4   8
   5  12   7   7 113   3   2  70   6   9   4   4   4   8   5  12   7   7
   7  23  18  11   4   8   5  12   7  70   6   6   9   4   7   4   7   7
  26   6   6   9   4   4   8   5  12   7   4   4   

In [19]:
# convert tactics to numpy arrays and one-hot encode
a = np.array(labels['tac_id'])
tactics = np.zeros((a.size, 40+1))
tactics[np.arange(a.size),a] = 1
print(np.shape(tactics))

(2000, 41)


In [20]:
X_train, Y_train = goals, tactics
print(np.shape(X_train))
print(np.shape(Y_train))
print(np.shape(hypotheses))

(2000, 1000)
(2000, 41)
(2000, 3000)


In [19]:
# save numpy array as csv file
from numpy import asarray
from numpy import savetxt

# save to csv file
savetxt('x_train2.csv', X_train, delimiter=',')
savetxt('y_train2.csv', Y_train, delimiter=',')
savetxt('hypotheses2.csv', hypotheses, delimiter=',')