In [51]:
import numpy as np
import os
import time
import torch
from torch import optim
from src.eval.mscc import mscc_evaluation
from src.core.nets import Context2vec
from src.util.args import parse_args
from src.util.batch import Dataset
from src.util.config import Config
from src.util.io import write_embedding, write_config, read_config, load_vocab


def run_inference_by_user_input(model,
                                itos,
                                stoi,
                                unk_token,
                                bos_token,
                                eos_token,
                                device):
    """Interactively query the model for the word hidden behind `[]`.

    Prompts on standard input in a loop. Every sentence must be
    whitespace-separated and contain a standalone `[]` token marking the
    position to predict. The placeholder is replaced by `unk_token`, the
    sentence is wrapped in `bos_token`/`eos_token`, converted to indices and
    passed to `model.run_inference`; each returned (score, index) pair is
    printed as `score word`.

    Args:
        model: object providing `norm_embedding_weight`, `criterion.W` and
            `run_inference(input_tokens, target=..., target_pos=...)`.
        itos: index-to-token lookup used to print the predictions.
        stoi: token-to-index mapping; unknown tokens map to `unk_token`.
        unk_token: token substituted for the placeholder and unknown words.
        bos_token: token prepended to every sentence.
        eos_token: token appended to every sentence.
        device: device on which the input tensor is created.

    Returns:
        None, once standard input is exhausted (EOF / Ctrl-D).
    """

    def return_split_sentence(sentence):
        """Return (tokens, index of the `[]` token) or raise SyntaxError."""
        if ' ' not in sentence:
            print('sentence should contain white space to split it into tokens')
            raise SyntaxError
        tokens = sentence.lower().strip().split()
        # Check the token list rather than the raw string: something like
        # "a[] b" contains the substring `[]` but has no `[]` token, which
        # would make the index lookup below raise an uncaught ValueError.
        if '[]' not in tokens:
            print('sentence should contain `[]` that notes the target')
            raise SyntaxError
        return tokens, tokens.index('[]')

    # Normalize the output embedding weights once before serving queries.
    model.norm_embedding_weight(model.criterion.W)
    unk_index = stoi[unk_token]

    while True:
        try:
            sentence = input('>> ')
        except EOFError:
            # End of input: leave the prompt loop instead of crashing.
            print()
            return
        try:
            tokens, target_pos = return_split_sentence(sentence)
        except SyntaxError:
            continue
        tokens[target_pos] = unk_token
        tokens = [bos_token] + tokens + [eos_token]
        indexed_sentence = [stoi[token] if token in stoi else unk_index
                            for token in tokens]
        input_tokens = \
            torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
        # Pure inference: no autograd graph is needed for the scores.
        with torch.no_grad():
            topv, topi = model.run_inference(input_tokens, target=None, target_pos=target_pos)
        for value, key in zip(topv, topi):
            print(value.item(), itos[key.item()])


def _load_sentences(input_file, max_sent_length):
    """Read a one-sentence-per-line corpus into a 1-D object array.

    Every line is stripped, lower-cased and split on whitespace. Empty lines
    and sentences with `max_sent_length` tokens or more are dropped. Each kept
    sentence is stored as a numpy array of tokens.
    """
    kept = []
    with open(input_file) as f:
        for line in f:
            tokens = line.strip().lower().split()
            if 0 < len(tokens) < max_sent_length:
                kept.append(np.array(tokens))

    # Fill the object array element by element so numpy never tries to stack
    # the per-sentence arrays into one multi-dimensional array.
    sentences = np.empty(len(kept), dtype=object)
    for position, tokens in enumerate(kept):
        sentences[position] = tokens
    return sentences


def main():
    """Train a Context2vec model or run a trained one, depending on the args.

    With `args.train` set, the corpus in `args.input_file` is loaded, a model
    is trained for `config.n_epochs` epochs, and the word embeddings, model
    weights, optimizer state and a JSON config are written next to
    `args.wordsfile` / `args.modelfile`.

    Otherwise the model described by `<modelfile>.config.json` is restored and
    either evaluated on MSCC (`args.task == 'mscc'`) or queried interactively.

    Raises:
        FileNotFoundError: if the training corpus, or the MSCC question/answer
            files, do not exist.
    """
    args = parse_args()
    gpu_id = args.gpu_id
    train = args.train
    use_cuda = torch.cuda.is_available() and gpu_id > -1
    max_sent_length = 64
    if use_cuda:
        device = torch.device('cuda:{}'.format(gpu_id))
        torch.cuda.set_device(gpu_id)
    else:
        device = torch.device('cpu')

    config = Config(args.config_file)

    if train:
        batch_size = config.batch_size
        n_epochs = config.n_epochs
        word_embed_size = config.word_embed_size
        hidden_size = config.hidden_size
        learning_rate = config.learning_rate
        if not os.path.isfile(args.input_file):
            raise FileNotFoundError(args.input_file)

        print('Loading input file')
        sentences = _load_sentences(args.input_file, max_sent_length)

        print('Creating dataset')
        dataset = Dataset(sentences, batch_size, config.min_freq, device)
        # Per-index word frequencies; entries missing from `freqs` count as 0.
        counter = np.array([dataset.vocab.freqs[word] if word in dataset.vocab.freqs else 0
                            for word in dataset.vocab.itos])
        model = Context2vec(vocab_size=len(dataset.vocab),
                            counter=counter,
                            word_embed_size=word_embed_size,
                            hidden_size=hidden_size,
                            n_layers=config.n_layers,
                            bidirectional=True,
                            use_mlp=config.use_mlp,
                            dropout=config.dropout,
                            pad_index=dataset.pad_index,
                            device=device,
                            inference=False).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        print(batch_size, n_epochs, word_embed_size, hidden_size, device)
        print(model)

        interval = 1e6  # report progress every `interval` target words
        for epoch in range(n_epochs):
            begin_time = time.time()
            cur_at = begin_time
            # Kept as a plain Python float so the end-of-epoch print also
            # works when no batch was processed.
            total_loss = 0.0
            word_count = 0
            next_count = interval
            last_accum_loss = 0.0
            last_word_count = 0
            for iterator in dataset.get_batch_iter(batch_size):
                for batch in iterator:
                    sentence = getattr(batch, 'sentence')
                    # Targets are the sentence without its first and last column.
                    target = sentence[:, 1:-1]
                    if target.size(0) == 0:
                        continue
                    optimizer.zero_grad()
                    loss = model(sentence, target)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

                    minibatch_size, sentence_length = target.size()
                    word_count += minibatch_size * sentence_length
                    accum_mean_loss = total_loss / word_count if total_loss > 0.0 else 0.0
                    if word_count >= next_count:
                        now = time.time()
                        duration = now - cur_at
                        throughput = float(word_count - last_word_count) / duration
                        cur_mean_loss = (total_loss - last_accum_loss) / (word_count - last_word_count)
                        print('{} words, {:.2f} sec, {:.2f} words/sec, {:.4f} accum_loss/word, {:.4f} cur_loss/word'
                              .format(word_count, duration, throughput, accum_mean_loss, cur_mean_loss))
                        next_count += interval
                        cur_at = now
                        last_accum_loss = total_loss
                        last_word_count = word_count

            print(total_loss)

        output_dir = os.path.dirname(args.wordsfile)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, args.wordsfile)
        torch.save(model.state_dict(), args.modelfile)
        torch.save(optimizer.state_dict(), args.modelfile+'.optim')
        output_config_file = args.modelfile+'.config.json'
        write_config(output_config_file,
                     vocab_size=len(dataset.vocab),
                     word_embed_size=word_embed_size,
                     hidden_size=hidden_size,
                     n_layers=config.n_layers,
                     bidirectional=True,
                     use_mlp=config.use_mlp,
                     dropout=config.dropout,
                     pad_index=dataset.pad_index,
                     unk_token=dataset.unk_token,
                     bos_token=dataset.bos_token,
                     eos_token=dataset.eos_token,
                     learning_rate=learning_rate)
    else:
        config_file = args.modelfile+'.config.json'
        config_dict = read_config(config_file)
        model = Context2vec(vocab_size=config_dict['vocab_size'],
                            counter=[1]*config_dict['vocab_size'],
                            word_embed_size=config_dict['word_embed_size'],
                            hidden_size=config_dict['hidden_size'],
                            n_layers=config_dict['n_layers'],
                            bidirectional=config_dict['bidirectional'],
                            use_mlp=config_dict['use_mlp'],
                            dropout=config_dict['dropout'],
                            pad_index=config_dict['pad_index'],
                            device=device,
                            inference=True).to(device)
        # map_location lets a checkpoint saved on one device be restored on
        # another (e.g. trained on GPU, evaluated on CPU).
        model.load_state_dict(torch.load(args.modelfile, map_location=device))
        # The optimizer state is not restored here: nothing in this branch
        # takes an optimization step, so inference no longer depends on the
        # `.optim` file being present.
        itos, stoi = load_vocab(args.wordsfile)
        unk_token = config_dict['unk_token']
        bos_token = config_dict['bos_token']
        eos_token = config_dict['eos_token']
        model.eval()

        if args.task == 'mscc':
            for required_file in (config.question_file, config.answer_file):
                if not os.path.isfile(required_file):
                    raise FileNotFoundError(required_file)

            mscc_evaluation(config.question_file,
                            config.answer_file,
                            'mscc.result',
                            model,
                            stoi,
                            unk_token=unk_token,
                            bos_token=bos_token,
                            eos_token=eos_token,
                            device=device)

        else:
            run_inference_by_user_input(model,
                                        itos,
                                        stoi,
                                        unk_token=unk_token,
                                        bos_token=bos_token,
                                        eos_token=eos_token,
                                        device=device)


In [17]:
from torchtext import data


class Dataset:
    """Group sentences by length and serve them as torchtext batch iterators.

    Sentences with the same number of tokens are collected into one
    `data.Dataset`; `get_batch_iter` then yields one `data.Iterator` per
    length group, visited in a randomly permuted order.
    """

    def __init__(self,
                 sentences,
                 batch_size: int,
                 min_freq: int,
                 device,
                 pad_token='<PAD>',
                 unk_token='<UNK>',
                 bos_token='<BOS>',
                 eos_token='<EOS>',
                 seed=777):
        """Build the vocabulary and the per-length datasets.

        Args:
            sentences: tokenized sentences. Must support indexing with an
                integer numpy array (see `_create_dataset`), e.g. a 1-D numpy
                object array.
            batch_size: unused here; the batch size is given to
                `get_batch_iter`.
            min_freq: minimum frequency passed to `build_vocab`.
            device: device handed to the torchtext iterators.
            pad_token, unk_token, bos_token, eos_token: special tokens.
            seed: seed for numpy's global random generator (note: this
                reseeds `np.random` for the whole process).
        """
        np.random.seed(seed)
        self.sent_dict = self._gathered_by_lengths(sentences)
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.device = device

        self.sentence_field = data.Field(use_vocab=True,
                                         unk_token=self.unk_token,
                                         pad_token=self.pad_token,
                                         init_token=self.bos_token,
                                         eos_token=self.eos_token,
                                         batch_first=True,
                                         include_lengths=False)
        self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)

        self.sentence_field.build_vocab(sentences, min_freq=min_freq)
        self.vocab = self.sentence_field.vocab
        # Always define the attribute so callers reading `pad_index` do not
        # hit an AttributeError when padding is disabled.
        self.pad_index = self.vocab.stoi[self.pad_token] if self.pad_token else None

        self.dataset = self._create_dataset(self.sent_dict, sentences)

    def get_raw_sentence(self, sentences):
        """Convert sequences of vocabulary indices back into token lists."""
        itos = self.vocab.itos
        return [[itos[idx] for idx in sentence] for sentence in sentences]

    def _gathered_by_lengths(self, sentences):
        """Return {length: [sentence indices]}, longest length first.

        Indices inside one group keep their original (ascending) order.
        """
        # sorted() is stable even with reverse=True, so equal-length
        # sentences stay in input order.
        by_length = sorted(range(len(sentences)),
                           key=lambda index: len(sentences[index]),
                           reverse=True)
        sent_dict = dict()
        for index in by_length:
            sent_dict.setdefault(len(sentences[index]), []).append(index)
        return sent_dict

    def _create_dataset(self, sent_dict, sentences):
        """Create one torchtext dataset per length group, in shuffled order."""
        _fields = [('sentence', self.sentence_field),
                   ('id', self.sentence_id_field)]
        buckets = []
        for sent_indices in sent_dict.values():
            sent_indices = np.array(sent_indices)
            # Pair every sentence with its original index (as a 1-element array).
            items = list(zip(sentences[sent_indices], sent_indices[:, np.newaxis]))
            buckets.append(data.Dataset(self._get_examples(items, _fields), _fields))
        # Permute positions rather than the dataset objects themselves:
        # np.random.permutation converts a list argument to an array first,
        # which is unreliable for arbitrary sequence-like objects.
        order = np.random.permutation(len(buckets))
        return [buckets[position] for position in order]

    def _get_examples(self, items: list, fields: list):
        """Wrap (sentence, id) pairs into torchtext examples."""
        return [data.Example.fromlist(item, fields) for item in items]

    def get_batch_iter(self, batch_size: int):
        """Yield one non-repeating training iterator per length group."""

        def sentence_length(example):
            return len(getattr(example, 'sentence'))

        for dataset in self.dataset:
            yield data.Iterator(dataset=dataset,
                                batch_size=batch_size,
                                sort_key=sentence_length,
                                train=True,
                                repeat=False,
                                device=self.device)

In [8]:
dat_path = '../../../../data/processed/X_wiki2.npy'

# The file presumably holds a 1-D object array of token sequences (the rest
# of the notebook indexes it per sentence). Object arrays are pickled, and
# current numpy refuses to unpickle unless allow_pickle=True is passed.
# NOTE(review): unpickling executes code from the file; only load trusted data.
sentences = np.load(dat_path, allow_pickle=True)

In [52]:
batch_size = 100
min_freq = 1
# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_epochs = 1

dataset = Dataset(sentences, batch_size, min_freq, device)

In [56]:
# Map two index sequences back to tokens through the vocabulary's `itos` table.
dataset.get_raw_sentence([[1, 3, 4, 6, 3], [3, 6, 6]])

[['<PAD>', '<EOS>', 'the', '.', '<EOS>'], ['<EOS>', '.', '.']]

In [57]:
# Debug cell: print the first few (sentence, target) batches.
ZZZ = 0  # number of batches printed so far
MAX_DEBUG_BATCHES = 10
interval = 1e6

for epoch in range(n_epochs):
    begin_time = time.time()
    cur_at = begin_time
    total_loss = 0.0
    word_count = 0
    next_count = interval
    last_accum_loss = 0.0
    last_word_count = 0
    for iterator in dataset.get_batch_iter(batch_size):
        for batch in iterator:
            sentence = getattr(batch, 'sentence')
            # Drop the first and last column of the batch.
            target = sentence[:, 1:-1]
            print('sentence', sentence)
            print('target', target)
            print()
            ZZZ += 1
            if ZZZ >= MAX_DEBUG_BATCHES:
                break
        # `break` only leaves the innermost loop; repeat the check so the
        # remaining iterators (and epochs) are not visited once the limit
        # is reached.
        if ZZZ >= MAX_DEBUG_BATCHES:
            break
    if ZZZ >= MAX_DEBUG_BATCHES:
        break

sentence tensor([[    2,    16,  5586,  5981,  8591,  2177,   934,    11,  4725,    28,
            12,   503,     7,  2395, 14227, 20031,    47,  3134,  4671,   125,
            46,   709,     9,  7645,    25,  1514,    75,    15,  8019,    47,
         20477,  6061,    26,     5,  2467,  2227,     5, 14961,     5,     8,
             5,  1104,    25,    75,    15,  8019,    47, 20477,    26,     5,
         21465,     5, 33799,     5,  7113,     5, 21513,    47,     8,    25,
            75,    15,  8019,    47, 20477,    26,     5,  2386,    47,  2638,
          3041,    25,    75,    15,  8019,    47, 20477,    26,     5, 19423,
             5,  3319,     5,   330,    25,   919,     9,   123,    47,  1959,
            26,     5,   131,    47,  1090,  1469,     5,  6093,     5,   297,
             5,  9423,     5,   277,     9,  3012,     5,  1488,     9,   916,
          2158,     5,   362,     5, 21798,     9, 19275,     5,   807,     5,
          9461,     9,   359,  9461,     5,

  del sys.path[0]


sentence tensor([[    2,   107,     8,  ...,   287,     6,     3],
        [    2,    16,  2548,  ...,  1240,     6,     3],
        [    2,   140,     7,  ...,    26,     6,     3],
        ...,
        [    2,   180,  4661,  ...,     6,    13,     3],
        [    2, 18842,    94,  ...,   840,    13,     3],
        [    2,   107,    14,  ..., 25552,     6,     3]], device='cuda:0')
target tensor([[  107,     8,     8,  ...,   661,   287,     6],
        [   16,  2548,    15,  ...,    12,  1240,     6],
        [  140,     7,   622,  ...,   339,    26,     6],
        ...,
        [  180,  4661,     8,  ...,  2189,     6,    13],
        [18842,    94,   804,  ..., 33738,   840,    13],
        [  107,    14, 13621,  ...,    11, 25552,     6]], device='cuda:0')

sentence tensor([[   2,   16, 1556,  ..., 4867,    6,    3],
        [   2,   75, 1956,  ...,   19,    6,    3],
        [   2,  316,   39,  ..., 1253,    6,    3],
        ...,
        [   2,  178,   78,  ..., 4036,    6,   

In [63]:
# Try to strip the first and last token of the first three sentences with a
# 2-D slice. The recorded output below is an IndexError ("too many indices
# for array"): `sentences` has no second axis to slice.
sentences[0:3][:, 1:-1]

IndexError: too many indices for array

In [47]:
'''
The function create_mscc_dataset is Copyright 2016 Oren Melamud
Modifications copyright (C) 2018 Tatsuya Aoki

This code is based on https://github.com/orenmel/context2vec/blob/master/context2vec/eval/mscc_text_tokenize.py
Used to convert the Microsoft Sentence Completion Challenge (MSCC) learning corpus into a one-sentence-per-line format.
'''

import glob
import numpy
import torch
import sys
import os
from nltk.tokenize import word_tokenize, sent_tokenize


def create_mscc_dataset(input_dir, output_filename, lowercase=True, pattern='*.TXT'):
    """Convert a directory of raw text files into one-sentence-per-line text.

    Every file in `input_dir` matching `pattern` is read paragraph by
    paragraph (paragraphs are separated by blank lines). Each paragraph is
    split into sentences with `sent_tokenize`, each sentence is tokenized
    with `word_tokenize`, and the space-joined tokens are written as one
    line of `output_filename`.

    Args:
        input_dir: directory containing the source files.
        output_filename: path of the file to create (overwritten).
        lowercase: lower-case every sentence before tokenizing.
        pattern: glob pattern, relative to `input_dir`, selecting the source
            files. Matching is case-sensitive on case-sensitive file systems.

    Raises:
        NotADirectoryError: if `input_dir` is empty or not a directory.
    """
    def write_paragraph_lines(paragraph_lines, file_obj):
        paragraph_str = ' '.join(paragraph_lines)
        for sent in sent_tokenize(paragraph_str):
            if lowercase:
                sent = sent.lower()
            file_obj.write(' '.join(word_tokenize(sent))+'\n')

    # Validate first: indexing the last character of an empty string would
    # raise IndexError instead of the documented NotADirectoryError.
    if not input_dir or not os.path.isdir(input_dir):
        raise NotADirectoryError(input_dir)

    if not input_dir.endswith('/'):
        input_dir += '/'

    print('Read files from', input_dir)
    print('Creating dataset to', output_filename)
    # Sorted so the output is reproducible; glob order is unspecified.
    files = sorted(glob.glob(input_dir + pattern))
    with open(output_filename, mode='w') as output_file:
        for file in files:
            n_lines = 0  # stays 0 for an empty file
            paragraph_lines = []
            with open(file, mode='r', errors='ignore') as input_file:
                for n_lines, line in enumerate(input_file, start=1):
                    if line.strip():
                        paragraph_lines.append(line)
                    elif paragraph_lines:
                        # A blank line closes the paragraph collected so far.
                        write_paragraph_lines(paragraph_lines, output_file)
                        paragraph_lines = []
            if paragraph_lines:
                write_paragraph_lines(paragraph_lines, output_file)
            print('Read {} lines'.format(n_lines))


def read_mscc_questions(input_file, lower=True):
    """Parse an MSCC question file into a list of question records.

    Each line has the form `<question id> <sentence>`, where exactly the
    candidate word is written in square brackets (e.g. `[word]`). If several
    bracketed tokens occur, the last one is used.

    Args:
        input_file: path of the question file.
        lower: lower-case the sentence before splitting it.

    Returns:
        A list of `[tokens, q_id, target_word, target_pos]` entries, where
        `tokens` still contains the bracketed token at `target_pos`.

    Raises:
        SyntaxError: if a line has no non-empty bracketed token.
    """
    parsed = []
    with open(input_file, mode='r') as handle:
        for raw_line in handle:
            q_id, body = raw_line.split(' ', 1)
            body = body.lower() if lower else body
            words = body.strip().split()

            target_word, target_pos = '', None
            for position, word in enumerate(words):
                is_bracketed = word.startswith('[') and word.endswith(']')
                if is_bracketed:
                    target_word, target_pos = word[1:-1], position

            if target_word == '':
                raise SyntaxError
            parsed.append([words, q_id, target_word, target_pos])
    return parsed


def print_mscc_score(gold_q_id: list,
                     q_id_and_sim: tuple):
    """Print overall, dev and test accuracy for the MSCC task.

    `q_id_and_sim` holds `(question id, similarity)` pairs in consecutive
    groups of five candidates per question. For each group the id with the
    highest similarity is the prediction (the first one on ties) and is
    compared with the corresponding entry of `gold_q_id`. The first half of
    the questions is reported as "dev", the second half as "test".

    Args:
        gold_q_id: correct question id for every question, in file order.
        q_id_and_sim: sequence of `(q_id, similarity)` pairs.

    Raises:
        ValueError: if the number of pairs is not a multiple of five, or the
            number of questions does not match `gold_q_id`.
    """
    n_choices = 5
    if len(q_id_and_sim) % n_choices != 0:
        raise ValueError('expected {} candidates per question, got {} entries in total'
                         .format(n_choices, len(q_id_and_sim)))

    # Pick the best candidate per group directly; this avoids packing
    # (id, similarity) tuples into a numpy array, which only works when the
    # similarities are plain numbers.
    answer = [max(q_id_and_sim[start:start + n_choices], key=lambda pair: pair[1])[0]
              for start in range(0, len(q_id_and_sim), n_choices)]
    if len(answer) != len(gold_q_id):
        raise ValueError('{} questions were scored but {} gold answers were given'
                         .format(len(answer), len(gold_q_id)))

    def accuracy(flags):
        # An empty split has no defined accuracy; report NaN instead of
        # dividing by zero.
        return sum(flags) / len(flags) if flags else float('nan')

    correct_or_not = [gold == predicted for gold, predicted in zip(gold_q_id, answer)]
    mid = len(correct_or_not) // 2

    print('Overall', accuracy(correct_or_not))
    print('dev', accuracy(correct_or_not[:mid]))
    print('test', accuracy(correct_or_not[mid:]))


def mscc_evaluation(question_file,
                    answer_file,
                    output_file,
                    model,
                    stoi,
                    unk_token,
                    bos_token,
                    eos_token,
                    device):
    """Score every MSCC candidate sentence and print the resulting accuracy.

    For each question line the bracketed candidate is put back into the
    sentence as a plain word, the sentence is wrapped in `bos_token` /
    `eos_token`, indexed through `stoi` and passed to `model.run_inference`
    together with the candidate's index. The original line plus the returned
    similarity is written to `output_file` (tab-separated), and the scores
    are finally compared with the ids in `answer_file`.

    Args:
        question_file: MSCC question file (one candidate sentence per line).
        answer_file: MSCC answer file; the first space-separated field of
            each line is the correct question id.
        output_file: path of the per-candidate result file to write.
        model: object providing `run_inference(input_tokens, target, target_pos)`.
        stoi: token-to-index mapping; unknown tokens map to `unk_token`.
        unk_token, bos_token, eos_token: special tokens.
        device: device on which the input tensors are created.
    """
    questions = read_mscc_questions(question_file)
    unk_index = stoi[unk_token]
    q_id_and_sim = []
    # The question file is read a second time only to echo the raw lines
    # next to their scores.
    with open(question_file, mode='r') as f, open(output_file, mode='w') as w:
        # Evaluation only: do not build autograd graphs for the scores.
        with torch.no_grad():
            for question, input_line in zip(questions, f):
                tokens, q_id, target_word, target_pos = question
                tokens[target_pos] = target_word
                tokens = [bos_token] + tokens + [eos_token]
                indexed_sentence = [stoi[token] if token in stoi else unk_index
                                    for token in tokens]
                input_tokens = \
                    torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
                # +1 accounts for the prepended bos_token.
                indexed_target_word = input_tokens[0, target_pos+1]
                similarity = model.run_inference(input_tokens, indexed_target_word, target_pos)
                q_id_and_sim.append((q_id, similarity))
                w.write(input_line.strip() + '\t' + str(similarity) + '\n')

    with open(answer_file, mode='r') as f:
        # Skip blank lines (e.g. a trailing newline) so they do not become
        # empty gold ids.
        gold_q_id = [line.split(' ', 1)[0] for line in f if line.strip()]

    print_mscc_score(gold_q_id, q_id_and_sim)


# if __name__ == '__main__':
#     if len(sys.argv) < 2:
#         print('Please specify your input directory that contains MSCC dataset.')
#         print('(Most of the case the name of the directory might be `Holmes_Training_Data`.)')
#         print('sample usage: python src/eval/mscc.py ~/dataset/Holmes_Training_Data/')
#         quit()
#     create_mscc_dataset(sys.argv[1], 'dataset/mscc_train.txt')


In [48]:
# Run the converter on the local `dataset/` directory. The recorded output
# shows no "Read N lines" message: the function globs for `*.TXT`, while the
# directory listing further down contains only `sample.txt`.
create_mscc_dataset('dataset/', 'sample_run.txt')

Read files from dataset/
Creating dataset to sample_run.txt


In [49]:
!ls dataset/

sample.txt


In [42]:
!ls

config.toml  LICENSE  play.ipynb  sample_run.txt
dataset      models   README.md   src


In [50]:
!head dataset/sample.txt

 pierre <unk> N years old will join the board as a nonexecutive director nov. N 
 mr. <unk> is chairman of <unk> n.v. the dutch publishing group 
 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 
 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 
 the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said 
 <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in N 
 although preliminary findings were reported more than a year ago the latest results appear in today 's new england journal of medicine a forum likely to bring new attention to the problem 
 a <unk> <unk>