In [458]:
#Install the transformers for using pre-trained Models
!pip install transformers



In [460]:
#Importing all the necessary packages
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import warnings
import time
import numpy as np
warnings.filterwarnings("ignore")

In [461]:
#Defining the tokenizer and pre_trained model 
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-large-en")
pre_trained_model = AutoModel.from_pretrained('nghuyong/ernie-2.0-large-en')

In [462]:
# Load the train, test and dev dataset
def load_dataset(filename):
    with open(filename,'r') as fp:
        lines = [line.strip() for line in fp]
    return lines

In [463]:
# Getting the words, pos, probablities from both the Train and Dev dataset
def word_traindev_Data(data):
    wordLines = data
    words = []
    probabilities = []
    wordList = []
    pos = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]
            prob = lineSplit[4]
            temp = lineSplit[5]
            words.append(word)
            probabilities.append(float(prob))
            pos.append(temp)
        elif not (len(empty) and []):
            wordList.append((words, pos, probabilities))
            words = []
            probabilities = []
            pos = []
    return wordList

In [464]:
# Getting the words, pos, probablities from both the Test dataset
def word_test_Data(data):
    wordLines = data
    words = []
    testWord = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]            
            words.append(word)
        elif not len(empty):
            testWord.append(words)
            words = []       
    return testWord

In [465]:
# Generate separate list of words, pos and probablities for Train and Dev data
def data_preprocess_train_dev(data):
    text = []
    pos = []
    probs = []
    for i,j,k in data:
            text.append(i)
            pos.append(j)
            probs.append(k)
    return text,pos, probs

In [466]:
# Generate separate list of words, pos and probablities for Test data
def data_preprocess_test(data):
    text = []
    for i in data:
            text.append(i)
    return text

In [467]:
# Replicating probablities for matching length incase of sub tokenized words
def prob_list(batch_data,batch_probs):
    pb = []
    for i,j in zip(batch_data,batch_probs):
        tp = []
        for k,l in zip(i,j):
            temp = tokenizer.tokenize(k)
            if len(temp) == 1:
                tp.append(float(l))
            if len(temp) > 1:
                for i in range(len(temp)):
                    tp.append(float(l))
        pb.append(tp)
    return pb

In [468]:
# Generate sentence from words in dataset
def get_sentence(words):    
    tokenized_text = []
    for i in words:
        sent = ''
        for h in i:
            if sent == '':
                sent = sent + h
            else:
                sent = sent+ " " +h
        tokens = tokenizer.tokenize(sent)
        tid = tokenizer.encode(tokens, add_special_tokens=False)
        tokenized_text.append(tid)
    return tokenized_text

In [469]:
def pad_func(data):
    max_len = 0
    for i in data:
        if len(i) > max_len:
            max_len = len(i)
    padded = [i + [0]*(max_len-len(i)) for i in data]
    return padded

In [470]:
# Specifying file names
TRAINING_FILE = "train.txt"
DEV_FILE = "dev.txt"
TEST_FILE = "test_data.txt"


In [471]:
# Preprocessing work on the dataset 
trainText = word_traindev_Data(load_dataset(TRAINING_FILE))
testEval = word_test_Data(load_dataset(TEST_FILE))
devText = word_traindev_Data(load_dataset(DEV_FILE))

trainWords,trainTags, trainLabels = data_preprocess_train_dev(trainText)
devWords, devTags, devLabels = data_preprocess_train_dev(devText)
testWords = data_preprocess_test(testEval)


In [472]:
#Training data
train_tokens = get_sentence(trainWords)
train_probablities = prob_list(trainWords,trainLabels)
train_tokens_pad = pad_func(train_tokens)
train_probablities_pad = pad_func(train_probablities)

In [474]:
#Dev data
dev_tokens = get_sentence(devWords)
dev_probablities = prob_list(devWords,devLabels)
dev_tokens_pad = pad_func(dev_tokens)
dev_probablities_pad = pad_func(dev_probablities)

In [475]:
#defining the model class
class ErnieModel(nn.Module):
    def __init__(self):
        super(ErnieModel, self).__init__()
        self.ernie = pre_trained_model
        self.linear = nn.Linear(1024, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens):
        pooled_output,_ = self.ernie(tokens)
        linear_output = self.linear(pooled_output)
        proba = self.sigmoid(linear_output)
        return proba

In [None]:
# Training the model
model = ErnieModel()
optimizer = optim.Adamax(model.parameters(), lr=0.1)
loss_func = nn.MSELoss(reduction = 'mean')
batch = 100
for epoch_num in range(10):
    model.train()
    start_time = time.time()
    print("Running epoch number ---->{}".format(epoch_num))
    training_loss = []
    validation_loss = []
    for i in range(0, len(train_tokens_pad), batch):
        model.zero_grad()
        train_probas = model(torch.tensor(train_tokens_pad[i:i+batch]))
        train_grd_truth = []
        for i in train_probablities_pad[i:i+batch]:
            p = []
            for j in i:
                q=[]
                q.append(j)
                p.append(q)
            train_grd_truth.append(p)
        train_batch_loss = loss_func(train_probas, torch.tensor(train_grd_truth))
        training_loss.append(train_batch_loss.item())
        train_batch_loss.backward()
        optimizer.zero_grad()
        optimizer.step()
    print("Training loss ---->{}".format((np.average(training_loss))))
    print("Total runtime ----> %s seconds\n" % (time.time() - start_time))
    
    #Validation Run
    model.eval()
    start_time = time.time()
    for i in range(0, len(dev_tokens_pad), batch):
        dev_probas = model(torch.tensor(dev_tokens_pad[i:i+batch]))
        dev_grd_truth = []
        for i in dev_probablities_pad[i:i+batch]:
            p = []
            for j in i:
                q=[]
                q.append(j)
                p.append(q)
            dev_grd_truth.append(p)
        dev_batch_loss = loss_func(dev_probas, torch.tensor(dev_grd_truth))
        validation_loss.append(dev_batch_loss.item())
    print("Validation loss ---->{}".format((np.average(validation_loss))))
    print("Total runtime ----> %s seconds\n" % (time.time() - start_time))
    

Running epoch number ---->0
Training loss ---->0.21222754567861557
Total runtime ----> 1746.43834066391 seconds

Validation loss ---->0.19990291446447372
Total runtime ----> 68.5400459766388 seconds

Running epoch number ---->1
Training loss ---->0.21182132991296904
Total runtime ----> 1700.1604647636414 seconds

Validation loss ---->0.19990291446447372
Total runtime ----> 68.23428678512573 seconds

Running epoch number ---->2
Training loss ---->0.21185098003063882
Total runtime ----> 1696.9388134479523 seconds

Validation loss ---->0.19990291446447372
Total runtime ----> 72.30727744102478 seconds

Running epoch number ---->3
Training loss ---->0.21213645062276296
Total runtime ----> 1652.2579152584076 seconds

Validation loss ---->0.19990291446447372
Total runtime ----> 64.92111802101135 seconds

Running epoch number ---->4
Training loss ---->0.21288843133619853
Total runtime ----> 1724.8364732265472 seconds

Validation loss ---->0.19990291446447372
Total runtime ----> 61.482586145401

In [None]:
#Saving the model
filename = 'emphasis_model.sav'
pickle.dump(model, open(filename, 'wb'))