In [1]:
import argparse
from collections import defaultdict, namedtuple
from io import open
import math
import os
from random import shuffle, uniform

from copy import deepcopy
import pandas as pd
import numpy as np

In [2]:
def load_data(filename, key = None):
    """
    Load the instances stored in a data file and return them as a list of InstanceData objects.

    The file is read line by line:
        * an empty line closes the current exercise,
        * a line starting with '#' carries exercise-level properties (either the prompt or a list of
          'name:value' parameters),
        * any other line describes one instance as whitespace-separated fields (7 fields for training
          data, where the last one is the label; 6 fields otherwise).

    Parameters:
        filename: the location of the training or test data you want to load. The file is treated as
            labelled training data when 'train' appears anywhere in this path.
        key: optional location of a key file holding one 'instance_id label' pair per line. When given,
            each instance found in the key file receives that label (matched by instance_id).

    Returns:
        data: a list of InstanceData objects in file order. The label of an instance is None when neither
            the data file nor the key file provides one.

    Raises:
        AssertionError: if an instance line does not have the expected number of fields, or its
            instance_id is not 12 characters long.
        ValueError: if a parameter, a numeric field or a key-file line cannot be parsed.
    """

    # 'data' stores a list of 'InstanceData's as values.
    data = []

    # Training files carry the label as a 7th field on every instance line.
    training = 'train' in filename

    # Labels of the key file, indexed by instance_id. Looking labels up by id (instead of consuming them
    # in file order) keeps every instance paired with its own label and costs O(1) per instance.
    key_labels = dict()
    if key:
        print('Loading test labels...')
        with open(key, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the key file)
                    continue
                instance_id, label = line.split()
                key_labels[instance_id] = float(label)

    num_exercises = 0
    print('Loading instances...')
    instance_properties = dict()

    # Same explicit encoding as the key file, so decoding does not depend on the platform default
    with open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
            if len(line) == 0:
                num_exercises += 1
                if num_exercises % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')
                instance_properties = dict()

            # If the line starts with #, then we're beginning a new exercise
            elif line[0] == '#':
                if 'prompt' in line:
                    # Split on the first ':' only, so that a prompt containing a colon is kept whole
                    instance_properties['prompt'] = line.split(':', 1)[1]
                else:
                    for exercise_parameter in line[2:].split():
                        # Deliberately not named 'key': that would shadow the key-file argument
                        name, value = exercise_parameter.split(':')
                        if name == 'countries':
                            value = value.split('|')
                        elif name == 'days':
                            value = float(value)
                        elif name == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[name] = value

            # Otherwise we're parsing a new Instance for the current exercise
            else:
                fields = line.split()
                if training:
                    assert len(fields) == 7
                else:
                    assert len(fields) == 6
                assert len(fields[0]) == 12

                instance_id = fields[0]
                instance_properties['instance_id'] = instance_id
                instance_properties['token'] = fields[1]
                instance_properties['part_of_speech'] = fields[2]
                # fields[3] is not parsed: this loader does not keep it
                instance_properties['dependency_label'] = fields[4]
                instance_properties['dependency_edge_head'] = int(fields[5])

                # Always (re)set the label so that an instance never inherits the previous one's label.
                # As before, a label coming from the key file takes precedence over the one in the file.
                label = float(fields[6]) if training else None
                if instance_id in key_labels:
                    label = key_labels[instance_id]
                instance_properties['label'] = label

                data.append(InstanceData(instance_properties=instance_properties))

    print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
          ' exercises.\n')

    return data

class InstanceData(object):
    """
    A bare-bones class to store the included properties of each instance. This is meant to act as easy access to the
    data, and provides a launching point for deriving your own features from the data.

    An instance is built from a dict of properties. The values are copied onto attributes, so the same dict can be
    updated and reused by the caller for the next instance.
    """
    def __init__(self, instance_properties):
        """
        Copy the properties of one instance onto attributes.

        Parameters:
            instance_properties: dict with the keys 'instance_id', 'token', 'part_of_speech', 'dependency_label',
                'dependency_edge_head', 'user', 'countries', 'days', 'client', 'session', 'format' and 'time'.
                'prompt' and 'label' are optional and default to None.

        Raises:
            KeyError: if one of the required keys is missing.
            ValueError: if characters 8-11 of the instance_id are not digits.
        """

        # Parameters specific to this instance
        self.instance_id = instance_properties['instance_id']
        self.token = instance_properties['token']
        self.part_of_speech = instance_properties['part_of_speech']
        # Morphological features are intentionally not stored
        self.dependency_label = instance_properties['dependency_label']
        self.dependency_edge_head = instance_properties['dependency_edge_head']

        # Derived parameters specific to this instance: the instance_id is sliced as
        # [0:8] session, [8:10] exercise index, [10:12] token index
        self.exercise_index = int(self.instance_id[8:10])
        self.token_index = int(self.instance_id[10:12])

        # Derived parameters specific to this exercise
        self.exercise_id = self.instance_id[:10]

        # Parameters shared across the whole session
        self.user = instance_properties['user']
        self.countries = instance_properties['countries']
        self.days = instance_properties['days']
        self.client = instance_properties['client']
        self.session = instance_properties['session']
        self.format = instance_properties['format']
        self.time = instance_properties['time']
        self.prompt = instance_properties.get('prompt', None)

        # Label: optional, so that unlabelled instances (e.g. test data without its key) can be built too
        self.label = instance_properties.get('label', None)

        # Derived parameters shared across the whole session
        self.session_id = self.instance_id[:8]

    def get_labels(self):
        """Return the label of this instance, or None when it is unlabelled."""
        return self.label

    def get_exercise_id(self):
        """Return the first 10 characters of the instance_id, which identify the exercise."""
        return self.exercise_id

    def get_format(self):
        """Return the exercise format."""
        return self.format

    def get_user(self):
        """Return the user identifier."""
        return self.user

    def get_token(self):
        """Return the token, lower-cased."""
        return self.token.lower()

    def get_part_of_speech(self):
        """Return the part-of-speech tag of the token."""
        return self.part_of_speech

    def to_features(self):
        """
        Build a one-hot style feature dict for this instance. We introduce a bias and take a few of the included
        properties. Note that this dict restructures the corresponding entries of the input dictionary,
        'instance_properties'.

        Returns:
            to_return: a dict whose keys are 'bias' plus one '<property>:<value>' string per selected property
                (user, format, lower-cased token, part of speech, dependency label); every value is 1.0 for 'hot'.
        """
        to_return = dict()

        to_return['bias'] = 1.0
        to_return['user:' + self.user] = 1.0
        to_return['format:' + self.format] = 1.0
        to_return['token:' + self.token.lower()] = 1.0

        to_return['part_of_speech:' + self.part_of_speech] = 1.0
        to_return['dependency_label:' + self.dependency_label] = 1.0

        return to_return

In [3]:
# Get the dataset
def get_raw_dataset(train, test, key):
    """
    Load the training split and the test split, in that order.

    Parameters:
        train: location of the training data file (loaded without a key).
        test: location of the test data file.
        key: location of the key file passed along with the test data.

    Returns:
        A (training instances, test instances) tuple, each element being whatever load_data returns.
    """
    print("Getting training data...")
    train_instances = load_data(train)

    print("Getting test data...")
    test_instances = load_data(test, key)

    return (train_instances, test_instances)
    

In [4]:
# Load both splits; the key file provides the labels of the test split
train_data, test_data = get_raw_dataset(
    train="../data_en_es/en_es.slam.20190204.train",
    test="../data_en_es/en_es.slam.20190204.test",
    key="../data_en_es/en_es.slam.20190204.test.key",
)

Getting training data...
Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591344 instances across 500000 exercises...
Loaded 1911212 instances across 600000 exercises...
Loaded 2227444 instances across 700000 exercises...
Loaded 2546704 instances across 800000 exercises...
Done loading 2622957 instances across 824012 exercises.

Getting test data...
Loading test labels...
Loading instances...
Loaded 337728 instances across 100000 exercises...
Done loading 386604 instances across 114586 exercises.



In [5]:
# Format the dataset into a pandas dataframe that only contains the labels and features
# we want to use in our model
def build_formatted_dataset(data):
    """
    Collect the model inputs of every instance into a pandas DataFrame.

    Parameters:
        data: iterable of objects exposing get_user(), get_format(), get_token(), get_part_of_speech()
            and get_labels().

    Returns:
        A DataFrame with one row per instance and the columns 'user', 'format', 'token',
        'part_of_speech' and 'label', in that order.
    """
    # One list per output column, filled in a single pass over the instances
    columns = {'user': [], 'format': [], 'token': [], 'part_of_speech': [], 'label': []}

    for instance in data:
        columns['user'].append(instance.get_user())
        columns['format'].append(instance.get_format())
        columns['token'].append(instance.get_token())
        columns['part_of_speech'].append(instance.get_part_of_speech())
        columns['label'].append(instance.get_labels())

    return pd.DataFrame.from_dict(columns)


In [6]:
# Format the training split first, then the test split
formatted_train_data, formatted_test_data = map(
    build_formatted_dataset, (train_data, test_data)
)

In [7]:
# Preview the formatted training DataFrame (rendered as this cell's output)
formatted_train_data

Unnamed: 0,user,format,token,part_of_speech,label
0,XEinXf5+,reverse_translate,i,PRON,0.0
1,XEinXf5+,reverse_translate,am,VERB,0.0
2,XEinXf5+,reverse_translate,a,DET,0.0
3,XEinXf5+,reverse_translate,boy,NOUN,0.0
4,XEinXf5+,reverse_translate,i,PRON,0.0
...,...,...,...,...,...
2622952,Ja1WEMqy,reverse_translate,table,NOUN,0.0
2622953,Ja1WEMqy,reverse_translate,red,VERB,0.0
2622954,Ja1WEMqy,reverse_translate,as,ADP,0.0
2622955,Ja1WEMqy,reverse_translate,a,DET,1.0


In [8]:
# Preview the formatted test DataFrame (rendered as this cell's output)
formatted_test_data

Unnamed: 0,user,format,token,part_of_speech,label
0,XEinXf5+,listen,i,PRON,0.0
1,XEinXf5+,listen,feel,VERB,0.0
2,XEinXf5+,listen,fine,ADJ,1.0
3,XEinXf5+,listen,now,ADV,0.0
4,XEinXf5+,listen,i,PRON,0.0
...,...,...,...,...,...
386599,Ja1WEMqy,reverse_translate,answer,NOUN,0.0
386600,Ja1WEMqy,reverse_translate,the,DET,0.0
386601,Ja1WEMqy,reverse_translate,answer,NOUN,0.0
386602,Ja1WEMqy,reverse_translate,is,VERB,0.0


In [9]:
# Persist both formatted splits as pickles, training split first
for frame, pickle_path in (
    (formatted_train_data, "../data_en_es/en_es_train_data.pkl"),
    (formatted_test_data, "../data_en_es/en_es_test_data.pkl"),
):
    frame.to_pickle(pickle_path)