In [1]:
import torch
from torch import nn
import time
import torchtext
import numpy as np

import random

from collections import defaultdict, Counter

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina' 
plt.style.use('seaborn')

import pandas as pd

In [2]:
embvec = torchtext.vocab.GloVe(name='840B', dim=300,cache='/home/pding/Documents/glove/')

In [3]:
import spacy
import en_core_web_sm
svoc = spacy.load("en_core_web_sm")

In [4]:
datao = pd.read_pickle("~/OneDrive/kph/processed2.pkl")
datatrain = datao[datao['ext perc']>=3]
datatest = datao[datao['ext perc']<3]
# separate train and validate 
VAL_RATIO = 0.2
dtrain = datatrain.loc[:,['SRC','TRG']]
dtraink = datatrain.loc[:,['SRC','TRG','keywords','ext perc']]
seed=250
idx = np.arange(datatrain.shape[0])
np.random.seed(seed)
np.random.shuffle(idx)
val_size = int(len(idx) * VAL_RATIO)
df_train = dtrain.iloc[idx[val_size:], :]
df_val = dtrain.iloc[idx[:val_size], :]
df_val_k = dtraink.iloc[idx[:val_size], :]
df_test = datatest.loc[:,['SRC','TRG']]

In [5]:
tokenizertrg = lambda x: x.split()
def tokenizersrc(text): # create a tokenizer function
    return [tok.text for tok in svoc.tokenizer(text)]

In [6]:
def read_data(df_train, datafields):
    examples = []
    words = []
    labels = []
    for pmid in df_train.index:
        words = tokenizersrc(df_train.loc[pmid,'SRC'])
        labels = tokenizertrg(df_train.loc[pmid,'TRG'])
        examples.append(torchtext.data.Example.fromlist([words, labels], datafields))
    return torchtext.data.Dataset(examples, datafields)

In [7]:
from torchcrf import CRF

class RNNCRFTagger(nn.Module):
    
    def __init__(self, text_field, label_field, emb_dim, rnn_size, update_pretrained=False):
        super().__init__()
        
        voc_size = len(text_field.vocab)
        self.n_labels = len(label_field.vocab)       
        
        self.embedding = nn.Embedding(voc_size, emb_dim)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors, 
                                                       requires_grad=update_pretrained)

        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=rnn_size, 
                          bidirectional=True, num_layers=1)

        self.top_layer = nn.Linear(2*rnn_size, self.n_labels)
 
        self.pad_word_id = text_field.vocab.stoi[text_field.pad_token]
        self.pad_label_id = label_field.vocab.stoi[label_field.pad_token]
    
        self.crf = CRF(self.n_labels)
#        self.crf = ConditionalRandomField(self.n_labels, label_encoding="BIO",
#                                          idx2tag=text_field.vocab.itos
#                                         )
    def compute_outputs(self, sentences):
        embedded = self.embedding(sentences)
        rnn_out, _ = self.rnn(embedded)
        out = self.top_layer(rnn_out)
        return out
                
    def forward(self, sentences, labels):
        # Compute the outputs of the lower layers, which will be used as emission
        # scores for the CRF.
        scores = self.compute_outputs(sentences)
        mask0 = sentences != self.pad_word_id
        mask = mask0.byte()
        # We return the loss value. The CRF returns the log likelihood, but we return 
        # the *negative* log likelihood as the loss value.            
        # PyTorch's optimizers *minimize* the loss, while we want to *maximize* the
        # log likelihood.
        return -self.crf(scores, labels, mask=mask)
            
    def predict(self, sentences):
        # Compute the emission scores, as above.
        scores = self.compute_outputs(sentences)
        mask0 = sentences != self.pad_word_id
        mask = mask0.byte()
        # Apply the Viterbi algorithm to get the predictions. This implementation returns
        # the result as a list of lists (not a tensor), corresponding to a matrix
        # of shape (n_sentences, max_len).
        return self.crf.decode(scores, mask=mask)


In [8]:
TEXT = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, lower=False)
LABEL = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, unk_token=None)
fields = [('text', TEXT), ('label', LABEL)]
device = 'cuda'
use_pretrained = True

In [22]:
train_examples = read_data(df_train, fields)
valid_examples = read_data(df_val, fields)
# Load the pre-trained embeddings that come with the torchtext library.
if use_pretrained:
    print('We are using pre-trained word embeddings.')
    TEXT.build_vocab(train_examples, vectors=embvec)
else:  
    print('We are training word embeddings from scratch.')
    TEXT.build_vocab(train_examples, max_size=5000)
LABEL.build_vocab(train_examples)
# Create one of the models defined above.
#self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
model0 = RNNCRFTagger(TEXT, LABEL, emb_dim=300, rnn_size=85, update_pretrained=False)

model0.to(device)


We are using pre-trained word embeddings.


RNNCRFTagger(
  (embedding): Embedding(72288, 300)
  (rnn): LSTM(300, 85, bidirectional=True)
  (top_layer): Linear(in_features=170, out_features=6, bias=True)
  (crf): CRF(num_tags=6)
)

In [25]:
batch_size = 300

optimizer = torch.optim.Adam(model0.parameters(), lr=0.01, weight_decay=1e-5)

n_epochs = 20

In [11]:
def train(train_examples, valid_examples, embvec, TEXT, LABEL, device, model, batch_size, optimizer, n_epochs):


    # Count the number of words and sentences.
    n_tokens_train = 0
    n_sentences_train = 0
    for ex in train_examples:
        n_tokens_train += len(ex.text) + 2
        n_sentences_train += 1
    n_tokens_valid = 0       
    for ex in valid_examples:
        n_tokens_valid += len(ex.text)


    
    n_batches = np.ceil(n_sentences_train / batch_size)

    mean_n_tokens = n_tokens_train / n_batches

    train_iterator = torchtext.data.BucketIterator(
        train_examples,
        device=device,
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        train=True,
        sort=True)

    valid_iterator = torchtext.data.BucketIterator(
        valid_examples,
        device=device,
        batch_size=64,
        sort_key=lambda x: len(x.text),
        repeat=False,
        train=False,
        sort=True)

    train_batches = list(train_iterator)
    valid_batches = list(valid_iterator)

    n_labels = len(LABEL.vocab)

    history = defaultdict(list)    

   

    for i in range(1, n_epochs + 1):

        t0 = time.time()

        loss_sum = 0

        model.train()
        for batch in train_batches:

            # Compute the output and loss.
            loss = model(batch.text, batch.label)  / mean_n_tokens

            optimizer.zero_grad()            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            loss_sum += loss.item()

        train_loss = loss_sum / n_batches
        history['train_loss'].append(train_loss)

        # Evaluate on the validation set.
        if i % 1 == 0:
            stats = defaultdict(Counter)

            t1 = time.time()
            print(f'Epoch {i}: train loss = {train_loss:.4f}, time = {t1-t0:.4f}')

    # After the final evaluation, we print more detailed evaluation statistics, including
    # precision, recall, and F-scores for the different types of named entities.


In [41]:
train(train_examples, valid_examples, embvec, TEXT, LABEL, device, model0, batch_size, optimizer,n_epochs)

Epoch 1: train loss = 0.0623, time = 16.2841
Epoch 2: train loss = 0.0631, time = 16.1392
Epoch 3: train loss = 0.0657, time = 16.1092
Epoch 4: train loss = 0.0646, time = 16.1398
Epoch 5: train loss = 0.0650, time = 16.1308
Epoch 6: train loss = 0.0650, time = 16.1051
Epoch 7: train loss = 0.0630, time = 16.1076
Epoch 8: train loss = 0.0623, time = 16.1172
Epoch 9: train loss = 0.0623, time = 16.1317
Epoch 10: train loss = 0.0620, time = 16.1472
Epoch 11: train loss = 0.0626, time = 16.1464
Epoch 12: train loss = 0.0635, time = 16.1333
Epoch 13: train loss = 0.0629, time = 16.1508
Epoch 14: train loss = 0.0626, time = 16.1351
Epoch 15: train loss = 0.0628, time = 16.1565
Epoch 16: train loss = 0.0622, time = 16.0859
Epoch 17: train loss = 0.0618, time = 16.1368
Epoch 18: train loss = 0.0614, time = 16.1593
Epoch 19: train loss = 0.0623, time = 16.0963
Epoch 20: train loss = 0.0620, time = 16.1048


In [47]:
torch.save(model0.state_dict(), '/home/pding/OneDrive/kph/kph/lstm1_85crf_620.pt')

In [61]:
torch.save(model0.state_dict(), '/home/pding/OneDrive/kph/kph/lstm2cr688.pt')

In [63]:
model0.load_state_dict(torch.load('/home/pding/OneDrive/kph/kph/lstmcrf.pt'))

RuntimeError: Error(s) in loading state_dict for RNNCRFTagger:
	size mismatch for rnn.weight_ih_l0: copying a param with shape torch.Size([384, 300]) from checkpoint, the shape in current model is torch.Size([512, 300]).
	size mismatch for rnn.weight_hh_l0: copying a param with shape torch.Size([384, 128]) from checkpoint, the shape in current model is torch.Size([512, 128]).
	size mismatch for rnn.bias_ih_l0: copying a param with shape torch.Size([384]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for rnn.bias_hh_l0: copying a param with shape torch.Size([384]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for rnn.weight_ih_l0_reverse: copying a param with shape torch.Size([384, 300]) from checkpoint, the shape in current model is torch.Size([512, 300]).
	size mismatch for rnn.weight_hh_l0_reverse: copying a param with shape torch.Size([384, 128]) from checkpoint, the shape in current model is torch.Size([512, 128]).
	size mismatch for rnn.bias_ih_l0_reverse: copying a param with shape torch.Size([384]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for rnn.bias_hh_l0_reverse: copying a param with shape torch.Size([384]) from checkpoint, the shape in current model is torch.Size([512]).

In [14]:
def kphext2(sentences,tags):
    kph = []
    for i in range(len(sentences)):
        s0 = svoc.tokenizer(sentences[i])
        s1 = [tok.text for tok in s0]
        t1 = tags[i]
        k1 = []
        for j in range(len(s1)):
            start = j
            if t1[j] == 'B':
                sti = 0
                stop = j+1
                while sti == 0:
                    try: 
                        kt = str(t1[stop])
                        if kt == 'I':
                            stop = stop+1
                        else:
                            k2 = str(s0[start:stop])
                            k1.append(k2)
                            sti =1
                    except(IndexError):
                        k2 = s0[start:stop]
                        k1.append(k2)
                        sti =1
                k2 = str(s1[j])
        kph.append(k1)
    return kph

In [15]:
def evaltest2(df_val, df_val_k, model):
    # This method applies the trained model to a list of sentences.
    examples = []
    for sen in df_val.SRC:
        words = tokenizersrc(sen)
        labels = ['O']*len(words) # placeholder
        examples.append(torchtext.data.Example.fromlist([words, labels], fields))
    dataset = torchtext.data.Dataset(examples, fields)

    iterator = torchtext.data.Iterator(
        dataset,
        device=device,
        batch_size=1,
        repeat=False,
        train=False,
        sort=False)

    # Apply the trained model to all batches.
    out = []
    model.eval()
    for batch in iterator:
        # Call the model's predict method. This returns a list of NumPy matrix
        # containing the integer-encoded tags for each sentence.
        predicted = model.predict(batch.text)

        # Convert the integer-encoded tags to tag strings.
        #for tokens, pred_sen in zip(sentences, predicted):
        for tokens, pred_sen in zip(batch.text.view(1,-1), predicted):
            out.append([LABEL.vocab.itos[pred_id] for _, pred_id in zip(tokens, pred_sen[1:])])
    return out

In [42]:
out2 = evaltest2(df_val, df_val_k, model0)

In [28]:
def tagperct(df_val,out):
    tp = np.empty(len(out))
    for i in range(len(df_val.index)):
        trg = tokenizertrg(df_val.iloc[i,1])
        total = 0
        for x in trg:
            if x != 'O':
                total = total+1
        matched = 0
        for j in range(total):
            if trg[j] != 'O':
                if trg[j]== out[i][j]:
                    matched = matched +1
        p = matched/total
        tp[i] = p
    return tp
        

In [48]:
def tokperct(df_val,out):
    tp = np.empty(len(out))
    for i in range(len(df_val.index)):
        trg = tokenizertrg(df_val.iloc[i,1])
        total = 0
        for x in trg:
            total = total+1
        matched = 0
        for j in range(total):
            if trg[j]== out[i][j]:
                matched = matched +1
        p = matched/total
        tp[i] = p
    return tp
        

In [None]:
outags2 = kphext2(sentences,tagss)

In [29]:
def kphperct(df_val_k,out):
    tp = np.empty(len(out))
    for i in range(len(df_val_k.index)):
        ktrg = df_val_k.iloc[i,2]
        pred = kphext2([df_val_k.iloc[i,0]],[out[i]])
        k = 0
        for kp in ktrg:
            if str(kp).lower() in [str(x).lower() for x in pred[0]]:
                k = k+1
        tp[i] = k/df_val_k.iloc[i,3]
    return tp

In [49]:
ttp1 = tokperct(df_val,out2)

In [None]:
sentences = [df_val_k.iloc[1,0]]

In [50]:
ttp1.mean()

0.9381363796103301

In [43]:
ttp2 = tagperct(df_val,out2)

In [44]:
ttp2.mean()

0.14402059837551062

In [45]:
ttp3 = kphperct(df_val_k,out2)

In [397]:
import pickle
pickle.dump( out2, open( "/home/pding/OneDrive/kph/kph/testresults.p", "wb" ) )

In [60]:
ttp3.mean()

0.521348910997703

In [46]:
ttp3.mean()

0.5117608400027607

In [330]:
for x in trg:
    if x != 'O':
        total = total+1


In [331]:
total

10

In [349]:
df_val.iloc[1,1].

'B O O B O O O O O O O O O O B B O O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O O O B O O B B O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B B O O O O O O O O O O O O O O O O O O O O O O O O B O O B O O O O O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

In [401]:
x = 'True'

In [402]:
x is True

False