In [2]:
# modules to import
import os
import re
import csv
import subprocess
from mosestokenizer import MosesTokenizer, MosesDetokenizer
# These are both modules that I need to create files for in the folder
from apply_bpe import BPE
import build_data

### Download data
We can usually skip this bit because we already have it downloaded

In [2]:
# TODO add a check to the directory so we can uncomment this code and 
# make it into an if statement instead
# dpath = "./data"
# build_data.make_dir(dpath)
# fname = 'traindev.zip'
# url = 'http://www.macs.hw.ac.uk/InteractionLab/E2E/data/' + fname
# build_data.download(url, dpath, fname)
# build_data.untar(dpath, fname)

### Create files with byte pair encoding
We're not going to use byte pair encoding anymore since the dataset is probably not diverse enough to warrant it

In [3]:
def create_source_target_files(input_file):
    bpe_codes = open('../data/trainset.2000.bpe.codes')
    encoder = BPE(bpe_codes)
    word_tok = MosesTokenizer(no_escape=True)
    def tokenize(text):
        word_tokens = word_tok.tokenize(text)
        sub_word_tokens = encoder.segment(' '.join(word_tokens))
        return sub_word_tokens
    outpath = os.path.dirname(input_file)
    file_name = os.path.basename(os.path.splitext(input_file)[0])
    target_file = open(os.path.join(outpath, file_name + "-target.tok.2000.bpe"), 'w')
    source_file = open(os.path.join(outpath, file_name + "-source.tok.2000.bpe"), 'w')
    input_csv = csv.reader(open(input_file, newline=''), delimiter=',', quotechar='"')
    # skip the first line in the csv with the column headers
    next(input_csv)
#     from IPython.core.debugger import Tracer; Tracer()() 
    for line in input_csv:
        meaning_representations = line[0].split(', ')
        acts_tok_bpe = []
        for act in meaning_representations:
            act_type = act[0:act.find("[")].replace(' ', '')
            acts_tok_bpe += ['__start_' + act_type + '__']
            acts_tok_bpe += [tokenize(act[act.find("[")+1:act.find("]")])]
            acts_tok_bpe += ['__end_' + act_type + '__']
        acts = ' '.join(acts_tok_bpe).strip()
        target = tokenize(line[1]).strip()
        source_file.write(acts + '\n')
        target_file.write(target + '\n')
    source_file.close()
    target_file.close()
            
def create_bpe_dict(input_file, num_operations):
    # Inputs
    # file name
    # TODO BPE size
    word_tok = MosesTokenizer(no_escape=True)
    def tokenize(text):
        word_tokens = word_tok.tokenize(text)
        # I do feel like I should name the second
        # variable something other than word_tokens
        word_tokens = ' '.join(word_tokens)
        return word_tokens
    outpath = os.path.dirname(input_file)
    file_name = os.path.basename(os.path.splitext(input_file)[0])
    bpe_input_file = open(os.path.join(outpath, file_name + '-bpe-input.txt'), 'w')
    input_csv = csv.reader(open(input_file, newline=''), delimiter=',', quotechar='"')
    # skip the first line in the csv with the column headers
    next(input_csv)
    for line in input_csv:
        meaning_representations = line[0].split(', ')
        acts_tokenized = []
        for act in meaning_representations:
            # https://stackoverflow.com/a/4894156/4507677 - instead of regex
            acts_tokenized += [tokenize(act[act.find("[")+1:act.find("]")])]
        acts = ' '.join(acts_tokenized)
        target = tokenize(line[1])
        bpe_input_file.write(acts + '\n' + target + '\n')
    bpe_input_file.close()
    # There were some errors with permissions for running this command so we're going 
    # to leave it and do it manually for now
    # We fixed the permission issues and I might try to integrate this into the code later. We were missing
    # execute rights on the python module
    # ./learn_bpe.py -s {num_operations} < {train_file} > {codes_file}
#     subprocess.run(['./learn_bpe.py', '-s', str(num_operations), '<', os.path.join(outpath, file_name + '-bpe-input.txt'),
#                    '>', os.path.join(outpath, file_name + '.' + str(num_operations) + '.bpe.codes') ])


I think what we were trying to do here was be too fancy with the data cleaning and copy how they did it in parlai. Better to keep it simple. They were trying to make a generic thing that could handle lots of different ways of downloading data

In [4]:
def build(dpath):
    # dpath = ./data
    # http://www.macs.hw.ac.uk/InteractionLab/E2E/data/traindev.zip
    if not build_data.built(dpath):
        print('building data')
        if build_data.built(dpath):
            # An older version exists, so remove the outdated files
            # TODO it would be better if this just removed the cleaning 
            # steps rather than forcing it to redownload the data again
            build_data.remove_dir(dpath)
        # Download the training and validation data
        fname = 'traindev.zip'
        url = 'http://www.macs.hw.ac.uk/InteractionLab/E2E/data/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        create_source_target_files(os.path.join(dpath, "trainset.csv"))


## Clean and tokenize

We have devset and trainset csvs. All we need is a step of steps that will output the same stuff we already are doing, except without the byte pair encoding. 

`TODO` Eventually we might consider trying to attach the categories to the tokens as features `token|category` rather than enclosed `__start__ token __end__`


### Progress

We added sentence tokenization to deal with trailing full stops. And decided to start doing aggresive hyphenation removal with moses tokenizer. Hence why we replace the token the moses tokenizer uses `'\@-\@'` with a regular hyphen.

## TODO

Add test data to the processing steps

In [5]:
from nltk.tokenize import sent_tokenize
import os
import csv

In [15]:
valid_file_name = "../data/devset.csv"
train_file_name = "../data/trainset.csv"
test_file_name = "../data/test_e2e.csv"

word_tok = MosesTokenizer('en')
def tokenized(text):
    #word_tokens = word_tok.tokenize(text, agressive_dash_splits=True, escape=False)
    word_tokens = word_tok(text)
    return word_tokens

for input_file in [train_file_name, valid_file_name, test_file_name]:
    outpath = os.path.dirname(input_file)
    file_name = os.path.basename(os.path.splitext(input_file)[0])
    target_file = open(os.path.join(outpath, file_name + "-target.tok"), 'w')
    source_file = open(os.path.join(outpath, file_name + "-source.tok"), 'w')
    additional_words_file = open(os.path.join(outpath, file_name + "-additional-words-source.tok"), 'w')
    input_csv = csv.reader(open(input_file, newline=''), delimiter=',', quotechar='"')
    # skip the first line in the csv with the column headers
    next(input_csv)
#     from IPython.core.debugger import Tracer; Tracer()() 
    for line in input_csv:
        meaning_representations = line[0].split(', ')
        acts_tok = []
        additional_words_tok = []
        for act in meaning_representations:
            act_type = act[0:act.find("[")].replace(' ', '')
            acts_tok += ['__start_' + act_type + '__']
            acts_tok += word_tok(act[act.find("[")+1:act.find("]")])
            acts_tok += ['__end_' + act_type + '__']
            if act_type in ['near']:
                additional_words_tok += ['near']
            elif act_type not in ['name']:
                additional_words_tok += ['_'.join(word_tok(act[act.find("[")+1:act.find("]")]))]
        acts = ' '.join(acts_tok).strip().replace('\@-\@', '-')
        source_file.write(acts + '\n')
        additional_words = ' '.join(additional_words_tok).strip().replace('\@-\@', '-')
        additional_words_file.write(additional_words + '\n')
        if len(line) > 1:
            target_tok = [word_tok(t) for t in sent_tokenize(line[1])]
            target = ' '.join([j for i in target_tok for j in i]).strip().replace('\@-\@', '-')
            target_file.write(target + '\n')
    source_file.close()
    target_file.close()


FileNotFoundError: [Errno 2] No such file or directory: '../data/test_e2e.csv'

In [8]:
from nltk.tokenize.toktok import ToktokTokenizer

In [13]:
a = ToktokTokenizer()

In [17]:
a.tokenize()

TypeError: tokenize() missing 1 required positional argument: 'text'

In [13]:
import nltk


In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /soe/nivarghe/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True