In [31]:
import argparse
from collections import defaultdict, namedtuple
from io import open
import math
import os
from random import shuffle, uniform

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Input, Model, utils

In [9]:
def load_data(filename):
    """
    This method loads and returns the data in filename. If the data is labelled training data, it returns labels too.

    Parameters:
        filename: the location of the training or test data you want to load.

    Returns:
        data: a list of InstanceData objects from that data type and track.
        labels (optional): if you specified training data, a dict of instance_id:label pairs.
    """

    # 'data' stores a list of 'InstanceData's as values.
    data = []

    # If this is training data, then 'labels' is a dict that contains instance_ids as keys and labels as values.
    training = False
    if filename.find('train') != -1:
        training = True

    if training:
        labels = dict()

    num_exercises = 0
    print('Loading instances...')
    instance_properties = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()

            # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
            if len(line) == 0:
                num_exercises += 1
                if num_exercises % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')
                instance_properties = dict()

            # If the line starts with #, then we're beginning a new exercise
            elif line[0] == '#':
                if 'prompt' in line:
                    instance_properties['prompt'] = line.split(':')[1]
                else:
                    list_of_exercise_parameters = line[2:].split()
                    for exercise_parameter in list_of_exercise_parameters:
                        [key, value] = exercise_parameter.split(':')
                        if key == 'countries':
                            value = value.split('|')
                        elif key == 'days':
                            value = float(value)
                        elif key == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[key] = value

            # Otherwise we're parsing a new Instance for the current exercise
            else:
                line = line.split()
                if training:
                    assert len(line) == 7
                else:
                    assert len(line) == 6
                assert len(line[0]) == 12

                instance_properties['instance_id'] = line[0]
                instance_properties['token'] = line[1]
                instance_properties['part_of_speech'] = line[2]

                instance_properties['morphological_features'] = dict()
                for l in line[3].split('|'):
                    [key, value] = l.split('=')
                    if key == 'Person':
                        value = int(value)
                    instance_properties['morphological_features'][key] = value

                instance_properties['dependency_label'] = line[4]
                instance_properties['dependency_edge_head'] = int(line[5])
                if training:
                    label = float(line[6])
                    labels[instance_properties['instance_id']] = label
                data.append(InstanceData(instance_properties=instance_properties))

        print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
              ' exercises.\n')

    if training:
        return data, labels
    else:
        return data

class InstanceData(object):
    """
    A bare-bones class to store the included properties of each instance. This is meant to act as easy access to the
    data, and provides a launching point for deriving your own features from the data.
    """
    def __init__(self, instance_properties):

        # Parameters specific to this instance
        self.instance_id = instance_properties['instance_id']
        self.token = instance_properties['token']
        self.part_of_speech = instance_properties['part_of_speech']
        self.morphological_features = instance_properties['morphological_features']
        self.dependency_label = instance_properties['dependency_label']
        self.dependency_edge_head = instance_properties['dependency_edge_head']

        # Derived parameters specific to this instance
        self.exercise_index = int(self.instance_id[8:10])
        self.token_index = int(self.instance_id[10:12])

        # Derived parameters specific to this exercise
        self.exercise_id = self.instance_id[:10]

        # Parameters shared across the whole session
        self.user = instance_properties['user']
        self.countries = instance_properties['countries']
        self.days = instance_properties['days']
        self.client = instance_properties['client']
        self.session = instance_properties['session']
        self.format = instance_properties['format']
        self.time = instance_properties['time']
        self.prompt = instance_properties.get('prompt', None)

        # Derived parameters shared across the whole session
        self.session_id = self.instance_id[:8]

    def to_features(self):
        """
        Prepares those features that we wish to use in the LogisticRegression example in this file. We introduce a bias,
        and take a few included features to use. Note that this dict restructures the corresponding features of the
        input dictionary, 'instance_properties'.

        Returns:
            to_return: a representation of the features we'll use for logistic regression in a dict. A key/feature is a
                key/value pair of the original 'instance_properties' dict, and we encode this feature as 1.0 for 'hot'.
        """
        to_return = dict()

        to_return['bias'] = 1.0
        to_return['user:' + self.user] = 1.0
        to_return['format:' + self.format] = 1.0
        to_return['token:' + self.token.lower()] = 1.0

        to_return['part_of_speech:' + self.part_of_speech] = 1.0
        # for morphological_feature in self.morphological_features:
        #     to_return['morphological_feature:' + morphological_feature] = 1.0
        to_return['dependency_label:' + self.dependency_label] = 1.0

        return to_return

In [11]:
def building_dataset(train, test):
    training_data, training_labels = load_data(train)
    test_data = load_data(test)
    return training_data, training_labels, test_data


In [12]:
training_data, training_labels, test_data = building_dataset("data_en_es/en_es.slam.20190204.train", "data_en_es/en_es.slam.20190204.test")

Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591344 instances across 500000 exercises...
Loaded 1911212 instances across 600000 exercises...
Loaded 2227444 instances across 700000 exercises...
Loaded 2546704 instances across 800000 exercises...
Done loading 2622957 instances across 824012 exercises.

Loading instances...
Loaded 337728 instances across 100000 exercises...
Done loading 386604 instances across 114586 exercises.



In [13]:
def extract_features(data):
    # Mapping feature to an index
    feature_dict = dict()
    count = 0

    for instance_data in data:
        for key in instance_data.to_features().keys():
           if key not in feature_dict:
               feature_dict[key] = count
               count += 1
    feature_len = len(feature_dict.keys())

    return feature_dict

In [14]:
LARGE_EMBED_SIZE = 32
SMALL_EMBED_SIZE = 16

def create_embeddings(data):
    """
    Mapping of each distinct feature to an unique index in a dictionary is used to create an embedding matrix for 
    each said distinct feature. Each individual embedding matrix will be concatenated
    together to create one large embedding matrix.

    Parameters:
        data: a list of InstanceData objects from that data type and track.
    Return:
        feature_maxtrix_concat: concatenated embedding matrix
    """     
    feature_dict = extract_features(data)  

    # Creating embedding matrices for each feature
    print("Building embedding matrix...")
    users, formats, tokens = [], [], []
    for key in feature_dict:
        if "user:" in key:
            users.append(feature_dict[key])
        if "format:" in key:
            formats.append(feature_dict[key])
        if "token:" in key:
            tokens.append(feature_dict[key])

    # Embedding layers
    user_tensor, format_tensor, token_tensor = Input(shape = (None, ), name = "users"), Input(shape = (None, ), name = "formats"), Input(shape = (None, ), name = "tokens")
    user_embed = layers.Embedding(len(users), LARGE_EMBED_SIZE) (user_tensor)
    format_embed = layers.Embedding(len(formats), SMALL_EMBED_SIZE) (format_tensor)
    token_embed = layers.Embedding(len(tokens), LARGE_EMBED_SIZE) (token_tensor)

    feature_matrix = [user_embed, format_embed, token_embed]
    feature_matrix_concat = layers.Concatenate()(feature_matrix)

    print("Embedding matrix: {}".format(feature_matrix_concat))

    # Help hopefully make network train faster
    feature_matrix_concat = layers.BatchNormalization() (feature_matrix_concat)
    input_tensor = [user_tensor, format_tensor, token_tensor]

    return feature_matrix_concat, input_tensor
    

In [29]:
def format_data(data, labels = None):
    Y_train_oh = utils.to_categorical(labels)
    

In [30]:
format_data(training_data, training_labels)

0.0


In [15]:
def LSTM(train_data, labels = None):
   batch_size = 32
   epochs = 10 

   embedding, input_tensor = create_embeddings(train_data)
   
   # Generate the model
   X = layers.LSTM(units = 256, return_sequences= True) (embedding)
   X = layers.Dropout(0.25) (X)
   X = layers.Dense(units = 128) (X)
   X = layers.Activation('softmax') (X)

   LSTM_model = Model(input_tensor, X)
   LSTM_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])


   # LSTM_hist = model.fit(
   #    x = , 
   #    y = , 
   #    batch_size = batch_size,
   #    epochs = epochs)

   # return LSTM_hist
   

In [16]:
LSTM(training_data, training_labels)

Building embedding matrix...
Embedding matrix: Tensor("concatenate/Identity:0", shape=(None, None, 80), dtype=float32)
