<a href="https://colab.research.google.com/github/ollema/nlp2019/blob/master/a1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [119]:
!wget https://raw.githubusercontent.com/ollema/nlp2019/master/a1_data/wsd_train.txt

--2019-11-06 16:23:23--  https://raw.githubusercontent.com/ollema/nlp2019/master/a1_data/wsd_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46435645 (44M) [text/plain]
Saving to: ‘wsd_train.txt.25’


2019-11-06 16:23:23 (228 MB/s) - ‘wsd_train.txt.25’ saved [46435645/46435645]



In [0]:
import torch
from torch import nn
import time
import torchtext

import random
random.seed(1)
torch.manual_seed(1)

class WSDClassifier(nn.Module):
    
    def __init__(self, text_field, class_field, emb_dim, hidden_size, update_pretrained=False):
        super().__init__()        

        self.n = 70
        self.batch_size = 128

        voc_size = len(text_field.vocab)
        n_senses = len(class_field.vocab)   
        
        # embedding layer.
        self.embedding = nn.Embedding(voc_size, emb_dim)

        # if we're using pre-trained embeddings, copy them into the model's embedding layer.
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors, requires_grad=update_pretrained)
        
        # bidirectional LSTM
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_size, bidirectional=True, num_layers=1)
        self.num_directions = 2

        # hidden layer
        self.a = nn.Linear(2 * hidden_size, 2 * hidden_size)

        # classification layer
        self.y = nn.Softmax(nn.Linear(2 * hidden_size, n_senses))

        
    def forward(self, texts):
        # The words in the documents are encoded as integers. The shape of the documents
        # tensor is (max_len, batch), where batch is the number of documents in this batch,
        # and max_len is the maximal length of a document in the batch.

        # First look up the embeddings for all the words in the documents.
        # The shape is now (max_len, batch, emb_dim).
        print("texts size:", texts.size())
        embedded = self.embedding(texts)
        
        # rnn_output: the outputs at all positions of the final layer
        print("embedded size:", embedded.size())
        output, _ = self.lstm(embedded)
        print("output size:", output.size())

        # the shape of output is (seq_len, batch, 2 * hidden_size)
        # we select the forward and backward states at position n and concatenate them.
        output = output.view(-1, self.batch_size, self.num_directions, self.lstm.hidden_size)
        print("output size:", output.size())

        forward = output[70 - 1, :, 0, :]
        backward = output[70, :, 1, :]

        print(forward.size())
        print(backward.size())

        raise RuntimeError

        top_both = torch.cat([top_forward, top_backward], dim=1)
        
        # apply the hidden layer and return the output.
        hidden = self.a(top_both)

        # apply the top layer + softmax and return the output.
        return self.y(top_both)

In [0]:
def read_data(corpus_file, doc_start, with_padding = True):
    """ Parses input file and returns filtered datasets for each word-type in corpus as well as
    list of word-types found in corpus. """
    # Initialization
    text = torchtext.data.Field(sequential=True, tokenize=lambda x: x.split())
    label = torchtext.data.LabelField(is_target=True)
    datafields = [('text', text), ('label', label)]
    label_column = 0

    if with_padding == True:
        pad_string = '<pad>'
        sentence_length = 140
        half_sentence_length = int(sentence_length/2)
        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for line in f:
                columns = line.strip().split(maxsplit=doc_start)
                position_of_wordtype = int(columns[2])

                # Split the long string doc into array and extract words before the wordtype.
                doc = columns[-1]
                doc_string_vector = doc.split()
                temp_pad = [pad_string for x in range(0,70)]

                padded_doc = []
                padded_doc.extend(temp_pad)
                padded_doc.extend(doc_string_vector)
                padded_doc.extend(temp_pad)

                sliced_doc = padded_doc[position_of_wordtype:position_of_wordtype + 140]

                if len(sliced_doc) != 140:
                    print(sliced_doc)
                    raise RuntimeError

                sliced_doc = " ".join(sliced_doc)
                label = columns[label_column]

                examples.append(torchtext.data.Example.fromlist([sliced_doc, label], datafields))
    else:
        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for line in f:
                columns = line.strip().split(maxsplit=doc_start)
                doc = columns[-1]
                label = columns[label_column]
                examples.append(torchtext.data.Example.fromlist([doc, label], datafields))
    unfiltered_data = torchtext.data.Dataset(examples, datafields,filter_pred=None)

    # Read complete dataset to get set of word-types. E.i 'keep', 'line'...
    filter_function = None
    word_types = set()
    for example in unfiltered_data.examples:
        word_types.add(example.label.split("%", 1)[0])
    word_types = list(word_types)

    # Create filtered datasets for each word-type
    filtered_datasets = {}
    for a_word_type in word_types:
        filter_function = lambda ex: ex.label.split("%", 1)[0] == a_word_type
        text = torchtext.data.Field(sequential=True, tokenize=lambda x: x.split())
        label = torchtext.data.LabelField(is_target=True)
        datafields = [('text', text), ('label', label)]

        filtered_data_set = torchtext.data.Dataset(examples, datafields, filter_pred=filter_function)
        filtered_datasets[a_word_type] = (filtered_data_set, text, label)
    return filtered_datasets

In [122]:
use_pretrained = False
from collections import defaultdict

filtered_datasets = read_data('wsd_train.txt', doc_start=4)

for word_type, filtered_dataset in filtered_datasets.items():
    dataset = filtered_dataset[0]
    text = filtered_dataset[1]
    label = filtered_dataset[2]

    train, valid = dataset.split([0.8, 0.2])

    if use_pretrained:
        print('We are using pre-trained word embeddings.')
        text.build_vocab(train, vectors="glove.6B.100d")
    else:        
        print('We are training word embeddings from scratch.')
        text.build_vocab(train, max_size=10000)
    
    label.build_vocab(train)
        
    model = WSDClassifier(text, label, emb_dim=100, hidden_size=74, update_pretrained=True)

    device = 'cuda'
    model.to(device)

    # example = train.examples.pop()
    # print(len(example.text))

    # raise RuntimeError

    train_iterator = torchtext.data.Iterator(
        train,
        device=device,
        batch_size=128,
        repeat=False,
        train=True,
        sort=False)

    valid_iterator = torchtext.data.Iterator(
        valid,
        device=device,
        batch_size=128,
        repeat=False,
        train=False,
        sort=False)

    loss_function = torch.nn.CrossEntropyLoss()   
    optimizer = torch.optim.SGD(model.parameters(), lr=2) 
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.96)


    train_batches = list(train_iterator)
    valid_batches = list(valid_iterator)

    history = defaultdict(list)

    for i in range(25):
        
        t0 = time.time()
        
        loss_sum = 0
        n_batches = 0

        model.train()
        
        for batch in train_batches:
                        
            scores = model(batch.text)
            loss = loss_function(scores, batch.label)

            optimizer.zero_grad()            
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            n_batches += 1
        
        train_loss = loss_sum / n_batches
        history['train_loss'].append(train_loss)
        
        n_correct = 0
        n_valid = len(valid)
        loss_sum = 0
        n_batches = 0

        model.eval()
        
        for batch in valid_batches:
            scores = model(batch.text)
            n_corr_batch, loss_batch = evaluate_validation(scores, loss_function, batch.label)
            loss_sum += loss_batch
            n_correct += n_corr_batch
            n_batches += 1
        val_acc = n_correct / n_valid
        val_loss = loss_sum / n_batches

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)        
        
        scheduler.step()

        t1 = time.time()

        print(f'Epoch {i+1}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}, lr = {optimizer.param_groups[0]["lr"]}')

        break

    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])
    plt.plot(history['val_acc'])
    plt.legend(['training loss', 'validation loss', 'validation accuracy'])

We are training word embeddings from scratch.
texts size: torch.Size([140, 128])
embedded size: torch.Size([140, 128, 100])
output size: torch.Size([140, 128, 148])
output size: torch.Size([140, 128, 2, 74])
torch.Size([128, 74])
torch.Size([128, 74])


RuntimeError: ignored