# Cronenbot Extrator


## Importing Dependences

In [1]:
import json
import os
import numpy as np
import tensorflow as tf
import random
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras import layers
from keras.regularizers import l2

## Loading paths

In [2]:
train_data = []
trainset_path = '../../dstc8-schema-guided-dialogue/train/'
for file in os.listdir(trainset_path):
    if file == 'schema.json':
        continue
        
    with open(os.path.join(trainset_path, file), 'r') as json_file:
        for elem in json.load(json_file):
            train_data.append(elem)

In [5]:
test_data = []
testset_path = '../../dstc8-schema-guided-dialogue/test/'
for file in os.listdir(testset_path):
    if file == 'schema.json':
        continue
        
    with open(os.path.join(testset_path, file), 'r') as json_file:
        for elem in json.load(json_file):
            test_data.append(elem)

## Encoding Labels

In [3]:
labels = ["o", "title", "genre", "subtitles", "directed_by"]

def slot2label(slot):
  if slot == "title":
    return 2
  elif slot == "genre":
    return 3
  elif slot == "subtitles":
    return 4
  elif slot == "directed_by":
    return 5
  else:
    return 1;

## Loading data and labeling

In [6]:
train_labels = []
test_labels = []
train_sentences = []
test_sentences = []

for dialogue in train_data:
    for idx in range(0, len(dialogue['turns']), 2):
        sentence = dialogue['turns'][idx]['utterance']
        labels = np.ones(len(sentence))
        slots = dialogue['turns'][idx]['frames'][0]['slots']

        for slot in slots:
          if slot2label(slot['slot']) != 1:
            labels = np.concatenate([labels[:slot['start']], np.array([slot2label(slot['slot'])]*(slot['exclusive_end']-slot['start'])), labels[slot['exclusive_end']:]])
            
            train_sentences.append(sentence)
            train_labels.append(labels);
        
for dialogue in test_data:
    for idx in range(0, len(dialogue['turns']), 2):
        sentence = dialogue['turns'][idx]['utterance']
        labels = np.ones(len(sentence))
        slots = dialogue['turns'][idx]['frames'][0]['slots']

        for slot in slots:
          if slot2label(slot['slot']) != 1:
            labels = np.concatenate([labels[:slot['start']], np.array([slot2label(slot['slot'])]*(slot['exclusive_end']-slot['start'])), labels[slot['exclusive_end']:]])
          
            test_sentences.append(sentence)
            test_labels.append(labels);

## Creating vocab and mapping

In [7]:
# create character vocab
all_text = " ".join([" ".join(x) for x in train_sentences+test_sentences])
vocab = sorted(set(all_text))

# create character/id and label/id mapping
char2idx = {u:i+1 for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

print(char2idx)

train_sentences = list(map(lambda s:list(map(lambda c: char2idx[c],s)),train_sentences))
test_sentences = list(map(lambda s:list(map(lambda c: char2idx[c],s)),test_sentences))

{' ': 1, '!': 2, '&': 3, "'": 4, ',': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, ':': 19, ';': 20, '>': 21, '?': 22, 'A': 23, 'B': 24, 'C': 25, 'D': 26, 'E': 27, 'F': 28, 'G': 29, 'H': 30, 'I': 31, 'J': 32, 'K': 33, 'L': 34, 'M': 35, 'N': 36, 'O': 37, 'P': 38, 'Q': 39, 'R': 40, 'S': 41, 'T': 42, 'U': 43, 'V': 44, 'W': 45, 'X': 46, 'Y': 47, 'Z': 48, 'a': 49, 'b': 50, 'c': 51, 'd': 52, 'e': 53, 'f': 54, 'g': 55, 'h': 56, 'i': 57, 'j': 58, 'k': 59, 'l': 60, 'm': 61, 'n': 62, 'o': 63, 'p': 64, 'q': 65, 'r': 66, 's': 67, 't': 68, 'u': 69, 'v': 70, 'w': 71, 'x': 72, 'y': 73, 'z': 74}


## Creating generators for training

In [8]:
# train_labels = []
# test_labels = []
# train_sentences = []
# test_sentences = []
    
# training generator
def gen_train_series():

    for eg in zip(train_sentences, train_labels):
      yield eg[0],eg[1]

# validation generator
def gen_valid_series():

    for eg in valid_formatted:
      yield eg[0],eg[1]

# test generator
def gen_test_series():

  for eg in zip(test_sentences, test_labels):
      yield eg[0],eg[1]
  
# create Dataset objects for train, test and validation sets  
series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
# series_valid = tf.data.Dataset.from_generator(gen_valid_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_test = tf.data.Dataset.from_generator(gen_test_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))

BATCH_SIZE = 128
BUFFER_SIZE=1000

# create padded batch series objects for train, test and validation sets
ds_series_batch = series.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
# ds_series_batch_valid = series_valid.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_test = series_test.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)

# print example batches
# for input_example_batch, target_example_batch in ds_series_batch_valid.take(1):
#   print(input_example_batch)
#   print(target_example_batch)


## Defining Model

In [9]:

vocab_size = len(vocab)+1

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

label_size = len(labels)  

# build LSTM model
def build_model(vocab_size,label_size, embedding_dim, rnn_units, batch_size):
      model = tf.keras.Sequential([
          tf.keras.layers.Embedding(vocab_size, embedding_dim,
                            batch_input_shape=[batch_size, None],mask_zero=True),
          tf.keras.layers.LSTM(rnn_units,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
          tf.keras.layers.Dense(label_size)
          ])
      return model

model = build_model(
      vocab_size = len(vocab)+1,
      label_size=len(labels)+1,
      embedding_dim=embedding_dim,
      rnn_units=rnn_units,
      batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 256)          19200     
_________________________________________________________________
lstm (LSTM)                  (128, None, 1024)         5246976   
_________________________________________________________________
dense (Dense)                (128, None, 41)           42025     
Total params: 5,308,201
Trainable params: 5,308,201
Non-trainable params: 0
_________________________________________________________________


## Defining Loss and compiling

In [10]:
import os

# define loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss,metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

## Training

In [13]:
    EPOCHS=100
    history = model.fit(ds_series_batch, epochs=EPOCHS, validation_data=ds_series_batch_test,callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Saving

In [14]:

model.save('models/entities.h5')


## Others

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

preds = np.array([])
y_trues= np.array([])

# iterate through test set, make predictions based on trained model
for input_example_batch, target_example_batch in ds_series_batch_test:

  pred=model.predict_on_batch(input_example_batch)
  pred_max=tf.argmax(tf.nn.softmax(pred),2).numpy().flatten()
  y_true=target_example_batch.numpy().flatten()

  preds=np.concatenate([preds,pred_max])
  y_trues=np.concatenate([y_trues,y_true])

# remove padding from evaluation
remove_padding = [(p,y) for p,y in zip(preds,y_trues) if y!=0]

r_p = [x[0] for x in remove_padding]
r_t = [x[1] for x in remove_padding]

# print confusion matrix and classification report
print(confusion_matrix(r_p,r_t))
print(classification_report(r_p,r_t))

[[155662    458   2198    859      0   1698]
 [   111    782    144      0      0     15]
 [  2548     74  14030     19      0     24]
 [  1947      1      0   6293      0     12]
 [   737    134    211      1      0     11]
 [  3654      0     27     13      0  12730]]


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.95      0.97      0.96    160875
         2.0       0.54      0.74      0.63      1052
         3.0       0.84      0.84      0.84     16695
         4.0       0.88      0.76      0.82      8253
         5.0       0.00      0.00      0.00      1094
         6.0       0.88      0.78      0.82     16424

    accuracy                           0.93    204393
   macro avg       0.68      0.68      0.68    204393
weighted avg       0.92      0.93      0.92    204393



In [None]:
sentences = np.array(["I want it to be in San Jose"])
sentences = np.array(list(map(lambda s:np.array(list(map(lambda c: char2idx[c],s))),sentences)))
model.predict(sentences)

InvalidArgumentError: ignored

numpy.ndarray