In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [30]:
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import warnings
import time
import numpy as np
warnings.filterwarnings("ignore")

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-large-en")

In [3]:
def dataset(filename):
    with open(filename,'r') as fp:
        lines = [line.strip() for line in fp]
    return lines

In [4]:
def wordData(data):
    wordLines = data
    words = []
    probabilities = []
    wordList = []
    pos = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]
            prob = lineSplit[4]
            temp = lineSplit[5]
            words.append(word)
            probabilities.append(float(prob))
            pos.append(temp)
        elif not (len(empty) and []):
            wordList.append((words, pos, probabilities))
            words = []
            probabilities = []
            pos = []
    return wordList

In [5]:
def wordtest(data):
    wordLines = data
    words = []
    testWord = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]            
            words.append(word)
        elif not len(empty):
            testWord.append(words)
            words = []       
    return testWord

In [6]:
def preTokenizing(data):
    text = []
    pos = []
    probs = []
    for i,j,k in data:
            text.append(i)
            pos.append(j)
            probs.append(k)
    return text,pos, probs

In [7]:
TRAINING_FILE = "train.txt"
DEV_FILE = "dev.txt"
TEST_FILE = "test_data.txt"


In [8]:
trainText = wordData(dataset(TRAINING_FILE))
testEval = wordtest(dataset(TEST_FILE))
devText = wordData(dataset(DEV_FILE))

trainWords,trainTags, trainLabels = preTokenizing(trainText)
devWords, devTags, devLabels = preTokenizing(devText)

tokenized_text = []
for i in trainWords:
  sent = ''
  for h in i:
    if sent == '':
      sent = sent + h
    else:
      sent = sent+ " " +h
  tokenized_text.append(sent)  

In [9]:
def prob_list(batch_data,batch_probs):
  pb = []
  for i,j in zip(batch_data,batch_probs):
    tp = []
    for k,l in zip(i,j):
      temp = tokenizer.tokenize(k)
      if len(temp) == 1:
        tp.append(float(l))
      if len(temp) > 1:
        for i in range(len(temp)):
          tp.append(float(l))
    pb.append(tp)
  return pb

In [10]:
class ErnieModel(nn.Module):
    def __init__(self):
        super(ErnieModel, self).__init__()
        self.ernie = AutoModel.from_pretrained('nghuyong/ernie-2.0-large-en')
        self.linear = nn.Linear(1024, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens):
        pooled_output,_ = self.ernie(tokens)
        linear_output = self.linear(pooled_output)
        proba = self.sigmoid(linear_output)
        return proba

In [36]:
bert_clf = ErnieModel()
optimizer = optim.Adamax(bert_clf.parameters(), lr=0.1)
bert_clf.train()
probablities = prob_list(trainWords,trainLabels)
pred_probs = []
for epoch_num in range(1):
    start_time = time.time()
    print("Running epoch_num ---->{}.....".format(epoch_num))
    count = 0
    ep_ls = 0.0
    for batch_data, batch_probs in zip(tokenized_text, probablities):
      bert_clf.zero_grad()
      answers_temp = []
      tokens = tokenizer.tokenize(batch_data)
      tid = tokenizer.encode_plus(tokens, add_special_tokens=False, return_attention_mask=False, return_tensors='pt')
      probas = bert_clf(tid['input_ids'])
      loss_func = nn.MSELoss(reduction = 'mean')
      p = []
      for i in batch_probs:
        q=[]
        q.append(i)
        p.append(q)
      batch_loss = loss_func(probas, torch.tensor(p))
      ep_ls = ep_ls + batch_loss.item()
      batch_loss.backward()
      optimizer.zero_grad()
      optimizer.step()
      count = count + 1
      o = batch_data.split(" ")
      temp_ans = []
      k = 0
      for i in o:
        if (len(tokenizer.tokenize(i))) == 1:
            temp_ans.append(probas[0][k].item())
            k= k + 1
        else:
            dum = []
            for g in range(len(tokenizer.tokenize(i))):
               dum.append(probas[0][k].item())
               k = k + 1
            val = np.average(dum)
            temp_ans.append(val)   
      pred_probs.append(temp_ans)
      if count == 10:
        break
    print("loss ---->{}".format((ep_ls/float(count))))
    print("Total runtime ----> %s seconds\n" % (time.time() - start_time))
      
      

Running epoch_num ---->0.....
loss ---->0.1414060562849045
Total runtime ----> 37.41793632507324 seconds



In [13]:
filename = 'modelep_5.sav'
pickle.dump(bert_clf, open(filename, 'wb'))

In [37]:
pred_probs

[[0.5728209614753723, 0.5750526189804077, 0.5087984204292297],
 [0.48095619678497314,
  0.4743928015232086,
  0.5327974557876587,
  0.5838488340377808,
  0.5195691585540771,
  0.4967581629753113,
  0.47870853543281555,
  0.5170894861221313,
  0.5010387897491455,
  0.4500127136707306],
 [0.6143060922622681,
  0.46975140273571014,
  0.5745329856872559,
  0.608700156211853,
  0.5496838092803955,
  0.5981898903846741,
  0.5987788438796997,
  0.5419473648071289],
 [0.4919780194759369, 0.4203195571899414],
 [0.4081747233867645,
  0.4090423583984375,
  0.4211327135562897,
  0.43705207109451294,
  0.36064231395721436,
  0.5978460907936096,
  0.43168124556541443,
  0.4607680141925812,
  0.4083693027496338,
  0.45330724120140076,
  0.4535541832447052,
  0.5563782453536987],
 [0.6730214357376099,
  0.6171466708183289,
  0.5863562226295471,
  0.6619576215744019,
  0.5785463452339172,
  0.5180293321609497,
  0.5949971079826355,
  0.5721586346626282,
  0.5892332792282104,
  0.5543256402015686,
  0.5