In [1]:
import pandas as pd
import torchtext
import torch
import spacy
import en_core_web_sm
svoc = en_core_web_sm.load()
import numpy as np
from torchtext.data import TabularDataset
from torchtext import vocab
from torch import nn
import time
import random

In [2]:
from collections import defaultdict
from collections import Counter

In [3]:
datao = pd.read_pickle("~/OneDrive/kph/processed2.pkl")

In [61]:
# filter out less interesting keywords
datao['ext perc'].value_counts()

3     5403
4     4737
2     3903
5     2703
1     1883
6     1032
0      746
7      305
8      120
9       46
10      35
11      17
13       8
12       6
15       5
14       3
16       3
17       2
22       2
18       1
Name: ext perc, dtype: int64

In [4]:
# choose everything larger than 3 as the training and testing data set
datatrain = datao[datao['ext perc']>=3]
datatest = datao[datao['ext perc']<3]

In [39]:
# prepare the train, test, validate data set
def prepare_csv(df_train, df_test,VAL_RATIO, seed=250):
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv(
        "~/OneDrive/kph/ttd/train.csv", index=False)
    df_train.iloc[idx[:val_size], :].to_csv(
        "~/OneDrive/kph/ttd/val.csv", index=False)
    df_test.to_csv("~/OneDrive/kph/ttd/test.csv", index=False)

In [40]:
prepare_csv(datatrain.loc[:,['SRC','TRG']], datatest.loc[:,['SRC','TRG']],0.2, seed=250)

In [5]:
# separate train and validate 
VAL_RATIO = 0.2
dtrain = datatrain.loc[:,['SRC','TRG']]
seed=250
idx = np.arange(datatrain.shape[0])
np.random.seed(seed)
np.random.shuffle(idx)
val_size = int(len(idx) * VAL_RATIO)
df_train = dtrain.iloc[idx[val_size:], :]
df_val = dtrain.iloc[idx[:val_size], :]
df_test = datatest.loc[:,['SRC','TRG']]

In [6]:
tokenizertrg = lambda x: x.split()
def tokenizersrc(text): # create a tokenizer function
    return [tok.text for tok in svoc.tokenizer(text)]

In [7]:
# change strategy, store lists directly into the iterator
SRC = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, lower=False)
TRG = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, unk_token=None)
fields = [('SRC', SRC), ('TRG', TRG)]

In [8]:
examples = []
words = []
labels = []
for pmid in df_train.index:
    words = tokenizersrc(df_train.loc[pmid,'SRC'])
    labels = tokenizertrg(df_train.loc[pmid,'TRG'])
    examples.append(torchtext.data.Example.fromlist([words, labels], fields))
trainexamples = torchtext.data.Dataset(examples, fields)

In [9]:
examples = []
words = []
labels = []
for pmid in df_val.index:
    words = tokenizersrc(df_val.loc[pmid,'SRC'])
    labels = tokenizertrg(df_val.loc[pmid,'TRG'])
    examples.append(torchtext.data.Example.fromlist([words, labels], fields))
valexamples = torchtext.data.Dataset(examples, fields)

In [10]:
embvec = vocab.GloVe(name='840B', dim=300,cache='/home/pding/Documents/glove/')

In [11]:
SRC.build_vocab(trainexamples, vectors=embvec)

In [12]:
TRG.build_vocab(trainexamples)

In [13]:
class RNNTagger(nn.Module):
    
    def __init__(self, text_field, label_field, emb_dim, rnn_size, update_pretrained=False):
        super().__init__()
        
        voc_size = len(text_field.vocab)
        self.n_labels = len(label_field.vocab)       
        
        # Embedding layer. If we're using pre-trained embeddings, copy them
        # into our embedding module.
        self.embedding = nn.Embedding(voc_size, emb_dim)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors, 
                                                       requires_grad=update_pretrained)

        # RNN layer. We're using a bidirectional GRU with one layer.
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=rnn_size, 
                          bidirectional=True, num_layers=1)

        # Output layer. As in the example last week, the input will be two times
        # the RNN size since we are using a bidirectional RNN.
        self.top_layer = nn.Linear(2*rnn_size, self.n_labels)
 
        # To deal with the padding positions later, we need to know the
        # encoding of the padding dummy word and the corresponding dummy output tag.
        self.pad_word_id = text_field.vocab.stoi[text_field.pad_token]
        self.pad_label_id = label_field.vocab.stoi[label_field.pad_token]
    
        # Loss function that we will use during training.
        self.loss = torch.nn.CrossEntropyLoss(reduction='sum')
        
    def compute_outputs(self, sentences):
        # The words in the documents are encoded as integers. The shape of the documents
        # tensor is (max_len, n_docs), where n_docs is the number of documents in this batch,
        # and max_len is the maximal length of a document in the batch.

        # First look up the embeddings for all the words in the documents.
        # The shape is now (max_len, n_sentences, emb_dim).        
        embedded = self.embedding(sentences)

        # Apply the RNN.
        # The shape of the RNN output tensor is (max_len, n_sentences, 2*rnn_size).
        rnn_out, _ = self.rnn(embedded)
        
        # Apply the linear output layer.
        # The shape of the output tensor is (max_len, n_sentences, n_labels).
        out = self.top_layer(rnn_out)
        
        # Find the positions where the token is a dummy padding token.
        pad_mask = (sentences == self.pad_word_id).float()

        # For these positions, we add some large number in the column corresponding
        # to the dummy padding label.
        out[:, :, self.pad_label_id] += pad_mask*10000

        return out
                
    def forward(self, sentences, labels):
        # As discussed above, this method first computes the predictions, and then
        # the loss function.
        
        # Compute the outputs. The shape is (max_len, n_sentences, n_labels).
        scores = self.compute_outputs(sentences)
        
        # Flatten the outputs and the gold-standard labels, to compute the loss.
        # The input to this loss needs to be one 2-dimensional and one 1-dimensional tensor.
        scores = scores.view(-1, self.n_labels)
        labels = labels.view(-1)
        return self.loss(scores, labels)

    def predict(self, sentences):
        # Compute the outputs from the linear units.
        scores = self.compute_outputs(sentences)

        # Select the top-scoring labels. The shape is now (max_len, n_sentences).
        predicted = scores.argmax(dim=2)

        # We transpose the prediction to (n_sentences, max_len), and convert it
        # to a NumPy matrix.
        return predicted.t().cpu().numpy()

In [14]:
from torchcrf import CRF

class RNNCRFTagger(nn.Module):
    
    def __init__(self, text_field, label_field, emb_dim, rnn_size, update_pretrained=False):
        super().__init__()
        
        voc_size = len(text_field.vocab)
        self.n_labels = len(label_field.vocab)       
        
        self.embedding = nn.Embedding(voc_size, emb_dim)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors, 
                                                       requires_grad=update_pretrained)

        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=rnn_size, 
                          bidirectional=True, num_layers=1)

        self.top_layer = nn.Linear(2*rnn_size, self.n_labels)
 
        self.pad_word_id = text_field.vocab.stoi[text_field.pad_token]
        self.pad_label_id = label_field.vocab.stoi[label_field.pad_token]
    
        self.crf = CRF(self.n_labels)
        
    def compute_outputs(self, sentences):
        embedded = self.embedding(sentences)
        rnn_out, _ = self.rnn(embedded)
        out = self.top_layer(rnn_out)
        
        pad_mask = (sentences == self.pad_word_id).float()
        out[:, :, self.pad_label_id] += pad_mask*10000
        
        return out
                
    def forward(self, sentences, labels):
        # Compute the outputs of the lower layers, which will be used as emission
        # scores for the CRF.
        scores = self.compute_outputs(sentences)

        # We return the loss value. The CRF returns the log likelihood, but we return 
        # the *negative* log likelihood as the loss value.            
        # PyTorch's optimizers *minimize* the loss, while we want to *maximize* the
        # log likelihood.
        return -self.crf(scores, labels)
            
    def predict(self, sentences):
        # Compute the emission scores, as above.
        scores = self.compute_outputs(sentences)

        # Apply the Viterbi algorithm to get the predictions. This implementation returns
        # the result as a list of lists (not a tensor), corresponding to a matrix
        # of shape (n_sentences, max_len).
        return self.crf.decode(scores)

In [15]:
#Convert a list of BIO labels, coded as integers, into spans identified by a beginning, an end, and a label.
# To allow easy comparison later, we store them in a dictionary indexed by the start position.
def to_spans(l_ids, voc):
    spans = {}
    current_lbl = None
    current_start = None
    for i, l_id in enumerate(l_ids):
        l = voc[l_id]

        if l[0] == 'B': 
            # Beginning of a named entity: B-something.
            if current_lbl:
                # If we're working on an entity, close it.
                spans[current_start] = (current_lbl, i)
            # Create a new entity that starts here.
            current_lbl = l[2:]
            current_start = i
        elif l[0] == 'I':
            # Continuation of an entity: I-something.
            if current_lbl:
                # If we have an open entity, but its label does not
                # correspond to the predicted I-tag, then we close
                # the open entity and create a new one.
                if current_lbl != l[2:]:
                    spans[current_start] = (current_lbl, i)
                    current_lbl = l[2:]
                    current_start = i
            else:
                # If we don't have an open entity but predict an I tag,
                # we create a new entity starting here even though we're
                # not following the format strictly.
                current_lbl = l[2:]
                current_start = i
        else:
            # Outside: O.
            if current_lbl:
                # If we have an open entity, we close it.
                spans[current_start] = (current_lbl, i)
                current_lbl = None
                current_start = None
    return spans

# Compares two sets of spans and records the results for future aggregation.
def compare(gold, pred, stats):
    for start, (lbl, end) in gold.items():
        stats['total']['gold'] += 1
        stats[lbl]['gold'] += 1
    for start, (lbl, end) in pred.items():
        stats['total']['pred'] += 1
        stats[lbl]['pred'] += 1
    for start, (glbl, gend) in gold.items():
        if start in pred:
            plbl, pend = pred[start]
            if glbl == plbl and gend == pend:
                stats['total']['corr'] += 1
                stats[glbl]['corr'] += 1

# This function combines the auxiliary functions we defined above.
def evaluate_iob(predicted, gold, label_field, stats):
    # The gold-standard labels are assumed to be an integer tensor of shape
    # (max_len, n_sentences), as returned by torchtext.
    gold_cpu = gold.t().cpu().numpy()
    gold_cpu = list(gold_cpu.reshape(-1))

    # The predicted labels assume the format produced by pytorch-crf, so we
    # assume that they have been converted into a list already.
    # We just flatten the list.
    pred_cpu = [l for sen in predicted for l in sen]
    
    # Compute spans for the gold standard and prediction.
    gold_spans = to_spans(gold_cpu, label_field.vocab.itos)
    pred_spans = to_spans(pred_cpu, label_field.vocab.itos)

    # Finally, update the counts for correct, predicted and gold-standard spans.
    compare(gold_spans, pred_spans, stats)

# Computes precision, recall and F-score, given a dictionary that contains
# the counts of correct, predicted and gold-standard items.
def prf(stats):
    if stats['pred'] == 0:
        return 0, 0, 0
    p = stats['corr']/stats['pred']
    r = stats['corr']/stats['gold']
    if p > 0 and r > 0:
        f = 2*p*r/(p+r)
    else:
        f = 0
    return p, r, f

In [16]:
model0 = RNNCRFTagger(SRC, TRG, emb_dim=300, rnn_size=128, update_pretrained=False)

In [17]:
# Count the number of words and sentences.
n_tokens_train = 0
n_sentences_train = 0
for ex in trainexamples:
    n_tokens_train += len(ex.SRC) + 2
    n_sentences_train += 1
n_tokens_valid = 0       
for ex in valexamples:
    n_tokens_valid += len(ex.SRC)

In [18]:
batch_size = 300
n_batches = np.ceil(n_sentences_train / batch_size)

mean_n_tokens = n_tokens_train / n_batches


In [19]:
device = 'cuda'

In [20]:
train_iterator = torchtext.data.BucketIterator(
            trainexamples,
            device=device,
            batch_size=batch_size,
            sort_key=lambda x: len(x.SRC),
            repeat=False,
            train=True,
            sort=True)

valid_iterator = torchtext.data.BucketIterator(
    valexamples,
    device=device,
    batch_size=64,
    sort_key=lambda x: len(x.SRC),
    repeat=False,
    train=False,
    sort=True)
    

In [21]:
train_batches = list(train_iterator)
valid_batches = list(valid_iterator)

In [22]:
model0.to(device)

RNNCRFTagger(
  (embedding): Embedding(72701, 300)
  (rnn): GRU(300, 128, bidirectional=True)
  (top_layer): Linear(in_features=256, out_features=6, bias=True)
  (crf): CRF(num_tags=6)
)

In [23]:
optimizer = torch.optim.Adam(model0.parameters(), lr=0.01, weight_decay=1e-5)

n_labels = len(TRG.vocab)

history = defaultdict(list)    

n_epochs = 3

for i in range(1, n_epochs + 1):

    t0 = time.time()

    loss_sum = 0

    model0.train()
    for batch in train_batches:

        # Compute the output and loss.
        loss = model0(batch.SRC, batch.TRG) / mean_n_tokens

        optimizer.zero_grad()            
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    train_loss = loss_sum / n_batches
    history['train_loss'].append(train_loss)

    # Evaluate on the validation set.
    if i % 1 == 0:
        stats = defaultdict(Counter)

        model0.eval()
        with torch.no_grad():
            for batch in valid_batches:
                # Predict the model's output on a batch.
                predicted = model0.predict(batch.SRC)                   
                # Update the evaluation statistics.
                evaluate_iob(predicted, batch.TRG, TRG, stats)

        # Compute the overall F-score for the validation set.
        _, _, val_f1 = prf(stats['total'])

        history['val_f1'].append(val_f1)

        t1 = time.time()
        print(f'Epoch {i}: train loss = {train_loss:.4f}, val f1: {val_f1:.4f}, time = {t1-t0:.4f}')

# After the final evaluation, we print more detailed evaluation statistics, including
# precision, recall, and F-scores for the different types of named entities.
print()
print('Final evaluation on the validation set:')
p, r, f1 = prf(stats['total'])
print(f'Overall: P = {p:.4f}, R = {r:.4f}, F1 = {f1:.4f}')

ValueError: the first two dimensions of emissions and tags must match, got (88, 300) and (90, 300)

In [48]:
def tagging(sentences):
    # This method applies the trained model to a list of sentences.

    # First, create a torchtext Dataset containing the sentences to tag.
    examples = []
    for sen in sentences:
        labels = ['?']*len(sen) # placeholder
        examples.append(torchtext.data.Example.fromlist([sen, labels], fields))
    dataset = torchtext.data.Dataset(examples, fields)

    iterator = torchtext.data.Iterator(
        dataset,
        device=device,
        batch_size=64,
        repeat=False,
        train=False,
        sort=False)

    # Apply the trained model to all batches.
    out = []
    model0.eval()
    with torch.no_grad():
        for batch in iterator:
            # Call the model's predict method. This returns a list of NumPy matrix
            # containing the integer-encoded tags for each sentence.
            predicted = model0.predict(batch.SRC)

            # Convert the integer-encoded tags to tag strings.
            for tokens, pred_sen in zip(sentences, predicted):
                out.append([self.TRG.vocab.itos[pred_id] for _, pred_id in zip(tokens, pred_sen[1:])])
    return out

In [49]:
def print_tags(sentence):
    tokens = sentence.split()
    tags = tagging([tokens])[0]
    for token, tag1 in zip(tokens, tags):
        print(f'{token:12s}{tag1}')

In [42]:
print_tags('John Johnson was born in Moscow , lives in Gothenburg , and works for Chalmers Technical University and the University of Gothenburg .')

TypeError: tagging() missing 1 required positional argument: 'sentences'

In [5]:
SRC = Field(sequential=True, tokenize=tokenizersrc)
TRG = Field(sequential=True, tokenize=tokenizertrg, use_vocab=False)

In [6]:
data_fields = [('SRC', SRC), ('TRG', TRG)]

In [7]:
train, val, test = TabularDataset.splits(
    path='~/OneDrive/kph/ttd/', train='train.csv',
    validation='val.csv', test='test.csv', format='csv',
    fields=data_fields,skip_header=True)


In [8]:
# first version with fixed embeddings


In [10]:
SRC.build_vocab(train, vectors=embvec)

In [11]:

import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(250)

<torch._C.Generator at 0x7efdcc0c07b0>

In [None]:
#TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
#embedding = nn.Embedding(n_embed, embed_dim).from_pretrained(TEXT.vocab.vectors)

In [43]:
BATCH_SIZE = 3

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, val, test), 
    batch_size = BATCH_SIZE)
    #sort_key=lambda x: len(x.SRC))

