In [1]:
import argparse
from collections import defaultdict, namedtuple
from io import open
import math
import os
from random import shuffle, uniform

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Input, Model, utils

In [2]:
def load_data(filename, key = None):
    """
    This method loads and returns the data in filename. If the data is labelled training data, it returns labels too.

    Parameters:
        filename: the location of the training or test data you want to load.

    Returns:
        data: a list of InstanceData objects from that data type and track.
        labels (optional): if you specified training data, a dict of instance_id:label pairs.
    """

    # 'data' stores a list of 'InstanceData's as values.
    data = []

    # If this is training data, then 'labels' is a dict that contains instance_ids as keys and labels as values.
    training = False
    if filename.find('train') != -1:
        training = True

    if training:
        labels = dict()

    test_key = [] 
    if key:    
        print('Loading test labels...')
        with open(key, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                temp = dict()
                temp['instance_id'], temp['label'] = line.split()
                temp['label'] = float(temp['label'])
                test_key.append(temp['label'])

    num_exercises = 0
    print('Loading instances...')
    instance_properties = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()

            # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
            if len(line) == 0:
                num_exercises += 1
                if num_exercises % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')
                instance_properties = dict()

            # If the line starts with #, then we're beginning a new exercise
            elif line[0] == '#':
                if 'prompt' in line:
                    instance_properties['prompt'] = line.split(':')[1]
                else:
                    list_of_exercise_parameters = line[2:].split()
                    for exercise_parameter in list_of_exercise_parameters:
                        [key, value] = exercise_parameter.split(':')
                        if key == 'countries':
                            value = value.split('|')
                        elif key == 'days':
                            value = float(value)
                        elif key == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[key] = value

            # Otherwise we're parsing a new Instance for the current exercise
            else:
                line = line.split()
                if training:
                    assert len(line) == 7
                else:
                    assert len(line) == 6
                assert len(line[0]) == 12

                instance_properties['instance_id'] = line[0]
                instance_properties['token'] = line[1]
                instance_properties['part_of_speech'] = line[2]

                # instance_properties['morphological_features'] = dict()
                # for l in line[3].split('|'):
                #     [key, value] = l.split('=')
                #     if key == 'Person':
                #         value = int(value)
                #     instance_properties['morphological_features'][key] = value

                instance_properties['dependency_label'] = line[4]
                instance_properties['dependency_edge_head'] = int(line[5])
                if training:
                    label = float(line[6])
                    labels[instance_properties['instance_id']] = label
                    instance_properties['label'] = float(line[6])
                if key and test_key != []:
                    instance_properties['label'] = test_key.pop(0)
                data.append(InstanceData(instance_properties=instance_properties))

        print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
              ' exercises.\n')

    # if training: return data, labels

    return data

class InstanceData(object):
    """
    A bare-bones class to store the included properties of each instance. This is meant to act as easy access to the
    data, and provides a launching point for deriving your own features from the data.
    """
    def __init__(self, instance_properties):

        # Parameters specific to this instance
        self.instance_id = instance_properties['instance_id']
        self.token = instance_properties['token']
        self.part_of_speech = instance_properties['part_of_speech']
        # self.morphological_features = instance_properties['morphological_features']
        self.dependency_label = instance_properties['dependency_label']
        self.dependency_edge_head = instance_properties['dependency_edge_head']

        # Derived parameters specific to this instance
        self.exercise_index = int(self.instance_id[8:10])
        self.token_index = int(self.instance_id[10:12])

        # Derived parameters specific to this exercise
        self.exercise_id = self.instance_id[:10]

        # Parameters shared across the whole session
        self.user = instance_properties['user']
        self.countries = instance_properties['countries']
        self.days = instance_properties['days']
        self.client = instance_properties['client']
        self.session = instance_properties['session']
        self.format = instance_properties['format']
        self.time = instance_properties['time']
        self.prompt = instance_properties.get('prompt', None)

        # Label
        self.label = instance_properties['label']

        # Derived parameters shared across the whole session
        self.session_id = self.instance_id[:8]

    def get_labels(self):
        return self.label 
    def get_exercise_id(self):
        return self.exercise_id
    def get_format(self):
        return 'format:' + self.format
    def get_user(self):
        return 'user:' + self.user
    def get_token(self):
        return 'token:' + self.token.lower()
    def get_part_of_speech(self):
        return 'part_of_speech:' + self.part_of_speech

    def to_features(self):
        """
        Prepares those features that we wish to use in the LogisticRegression example in this file. We introduce a bias,
        and take a few included features to use. Note that this dict restructures the corresponding features of the
        input dictionary, 'instance_properties'.

        Returns:
            to_return: a representation of the features we'll use for logistic regression in a dict. A key/feature is a
                key/value pair of the original 'instance_properties' dict, and we encode this feature as 1.0 for 'hot'.
        """
        to_return = dict()

        to_return['bias'] = 1.0
        to_return['user:' + self.user] = 1.0
        to_return['format:' + self.format] = 1.0
        to_return['token:' + self.token.lower()] = 1.0

        to_return['part_of_speech:' + self.part_of_speech] = 1.0
        # for morphological_feature in self.morphological_features:
        #     to_return['morphological_feature:' + morphological_feature] = 1.0
        to_return['dependency_label:' + self.dependency_label] = 1.0

        return to_return

In [3]:
def get_raw_dataset(train, test, key):
    print("Getting training data...")
    training_data = load_data(train)

    print("Getting test data...")
    test_data = load_data(test, key)

    return training_data, test_data
    

In [4]:
training_data, test_data = get_raw_dataset("data_en_es/en_es.slam.20190204.train", "data_en_es/en_es.slam.20190204.test", "data_en_es/en_es.slam.20190204.test.key")

Getting training data...
Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591344 instances across 500000 exercises...
Loaded 1911212 instances across 600000 exercises...
Loaded 2227444 instances across 700000 exercises...
Loaded 2546704 instances across 800000 exercises...
Done loading 2622957 instances across 824012 exercises.

Getting test data...
Loading test labels...
Loading instances...
Loaded 337728 instances across 100000 exercises...
Done loading 386604 instances across 114586 exercises.



In [5]:
def feature_mapping(data):
    users, formats, tokens, part_of_speeches = dict(), dict(), dict(), dict()
    u_count, f_count, t_count, pos_count = 0, 0, 0, 0
    
    for instance_data in data:
        for key in instance_data.to_features().keys():
            if 'user:' in key and key not in users:
                users[key] = u_count
                u_count += 1
            if 'format:' in key and key not in formats:
                formats[key] = f_count
                f_count += 1
            if 'token:' in key and key not in tokens:
                tokens[key] = t_count
                t_count += 1
            if 'part_of_speech' in key and key not in part_of_speeches:
                part_of_speeches[key] = pos_count
                pos_count += 1

  
    
    return users, formats, tokens, part_of_speeches

In [115]:
users, formats, tokens, part_of_speeches = feature_mapping(training_data)

In [13]:
def build_formatted_dataset(data, max_sequence_length = 8):
    users, formats, tokens, part_of_speeches = feature_mapping(data)
    
    # exercise_word = [[data[0].get_token(), data[0].get_user(), data[0].get_format(), data[0].get_part_of_speech()]]

    exercise = [[tokens[data[0].get_token()], users[data[0].get_user()], formats[data[0].get_format()], part_of_speeches[data[0].get_part_of_speech()]]]
    prev_exercise_id = data[0].get_exercise_id()
    test = None
    for i in range(1, 1000000): 
        if prev_exercise_id == data[i].get_exercise_id():
            token_info = [[tokens[data[i].get_token()], users[data[i].get_user()], formats[data[i].get_format()], part_of_speeches[data[i].get_part_of_speech()]]]
            exercise = np.append(exercise, token_info, axis = 0)

            # token_word = [[data[i].get_token(), data[i].get_user(), data[i].get_format(), data[i].get_part_of_speech()]]
            # exercise_word = np.append(exercise_word, token_word, axis = 0)
        else:
            prev_exercise_id = data[i].get_exercise_id()
            token_info = [[tokens[data[i].get_token()], users[data[i].get_user()], formats[data[i].get_format()], part_of_speeches[data[i].get_part_of_speech()]]]
            exercise = np.append(exercise, token_info, axis = 0)

            # token_word = [[data[i].get_token(), data[i].get_user(), data[i].get_format(), data[i].get_part_of_speech()]]
            # exercise_word = np.append(exercise_word, token_word, axis = 0)
        if i % 100000 == 0:
            print("Processed {} of {} instances".format(i, len(data)))
    
    return exercise
    # print(exercise_word)


In [14]:
exercises = build_formatted_dataset(training_data)



Processed 100000 of 2622957 instances
Processed 200000 of 2622957 instances
Processed 300000 of 2622957 instances
Processed 400000 of 2622957 instances
Processed 500000 of 2622957 instances
Processed 600000 of 2622957 instances
Processed 700000 of 2622957 instances
Processed 800000 of 2622957 instances
Processed 900000 of 2622957 instances


In [21]:
def create_labels(data):
    labels = []
    for i in range(0, 1000000):
        labels = np.append(labels, [data[i].get_labels()])

        if i % 100000 == 0:
            print("Processed {} of {} labels".format(i, len(data)))
    return labels

In [22]:
Y_train = create_labels(training_data)

Processed 0 of 2622957 labels
Processed 100000 of 2622957 labels
Processed 200000 of 2622957 labels
Processed 300000 of 2622957 labels
Processed 400000 of 2622957 labels
Processed 500000 of 2622957 labels
Processed 600000 of 2622957 labels
Processed 700000 of 2622957 labels
Processed 800000 of 2622957 labels
Processed 900000 of 2622957 labels


In [79]:
def create_embeddings(data, batch_size = None):
    """
    Mapping of each distinct feature to an unique index in a dictionary is used to create an embedding matrix for 
    each said distinct feature. Each individual embedding matrix will be concatenated
    together to create one large embedding matrix.

    Parameters:
        data: a list of InstanceData objects from that data type and track.
    Return:
        feature_maxtrix_concat: concatenated embedding matrix
    """     
    users, formats, tokens, part_of_speeches = feature_mapping(data)  

    # Creating embedding matrices for each feature
    print("Building embedding matrix...")

    # Embedding layers
    user_tensor, format_tensor, token_tensor, pos_tensor = Input(shape = (batch_size, ), name = "users"), Input(shape = (batch_size, ), name = "formats"), Input(shape = (batch_size, ), name = "tokens"), Input(shape = (batch_size, ), name = "pos")
    user_embed = layers.Embedding(len(users), 128, input_length=4) (user_tensor)
    format_embed = layers.Embedding(len(formats), 8, input_length=4) (format_tensor)
    token_embed = layers.Embedding(len(tokens), 256, input_length=4) (token_tensor)
    pos_embed = layers.Embedding(len(part_of_speeches), 32, input_length=4) (pos_tensor)

    embed_matrix = [user_embed, format_embed, token_embed, pos_embed]
    embed_matrix_concat = layers.Concatenate()(embed_matrix)

    print("Embedding matrix: {}".format(embed_matrix_concat))

    input_tensor = [user_tensor, format_tensor, token_tensor, pos_tensor]

    return embed_matrix_concat, input_tensor
    

In [80]:
def LSTM(data, X_train, Y_train):
   batch_size = 32
   epochs = 10 

   embedding, input_tensor = create_embeddings(data)
   
   # Generate the model
   X = layers.LSTM(units = 256, return_sequences= True) (embedding)
   X = layers.Dropout(0.25) (X)
   X = layers.Dense(units = 64, activation = 'softmax') (X)


   LSTM_model = Model(input_tensor, X)
   LSTM_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
   LSTM_model.summary()


   LSTM_hist = LSTM_model.fit(
      x = X_train, 
      y = Y_train, 
      batch_size = batch_size,
      epochs = epochs)

   # return LSTM_hist
   

In [81]:
LSTM(training_data, exercises, Y_train)

Building embedding matrix...
Embedding matrix: Tensor("concatenate_15/Identity:0", shape=(None, 32, 424), dtype=float32)
Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
users (InputLayer)              [(None, 32)]         0                                            
__________________________________________________________________________________________________
formats (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
tokens (InputLayer)             [(None, 32)]         0                                            
__________________________________________________________________________________________________
pos (InputLayer)                [(None, 32)]         0               

AssertionError: in user code:

    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:717 call
        return self._run_internal_graph(
    /home/nathgoh/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py:899 _run_internal_graph
        assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)

    AssertionError: Could not compute output Tensor("dense_10/Identity:0", shape=(None, 32, 64), dtype=float32)
