In [1]:
#Install the transformers for using pre-trained Models
!pip install transformers



In [2]:
#Importing all the necessary packages
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import warnings
import time
import numpy as np
from earlystopping import EarlyStopping
warnings.filterwarnings("ignore")

In [3]:
#Defining the tokenizer and pre_trained model 
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-large-en")
pre_trained_model = AutoModel.from_pretrained('nghuyong/ernie-2.0-large-en')

In [4]:
# Load the train, test and dev dataset
def load_dataset(filename):
    with open(filename,'r') as fp:
        lines = [line.strip() for line in fp]
    return lines

In [5]:
# Getting the words, pos, probablities from both the Train and Dev dataset
def word_traindev_Data(data):
    wordLines = data
    words = []
    probabilities = []
    wordList = []
    pos = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]
            prob = lineSplit[4]
            temp = lineSplit[5]
            words.append(word)
            probabilities.append(float(prob))
            pos.append(temp)
        elif not (len(empty) and []):
            wordList.append((words, pos, probabilities))
            words = []
            probabilities = []
            pos = []
    return wordList

In [6]:
# Getting the words, pos, probablities from both the Test dataset
def word_test_Data(data):
    wordLines = data
    words = []
    testWord = []
    empty = []
    for line in wordLines:
        lineSplit = line.strip().split('\t')
        if line:
            word = lineSplit[1]            
            words.append(word)
        elif not len(empty):
            testWord.append(words)
            words = []       
    return testWord

In [7]:
# Generate separate list of words, pos and probablities for Train and Dev data
def data_preprocess_train_dev(data):
    text = []
    pos = []
    probs = []
    for i,j,k in data:
            text.append(i)
            pos.append(j)
            probs.append(k)
    return text,pos, probs

In [8]:
# Generate separate list of words, pos and probablities for Test data
def data_preprocess_test(data):
    text = []
    for i in data:
            text.append(i)
    return text

In [9]:
# Replicating probablities for matching length incase of sub tokenized words
def prob_list(batch_data,batch_probs):
    pb = []
    for i,j in zip(batch_data,batch_probs):
        tp = []
        for k,l in zip(i,j):
            temp = tokenizer.tokenize(k)
            if len(temp) == 1:
                tp.append(float(l))
            if len(temp) > 1:
                for i in range(len(temp)):
                    tp.append(float(l))
        pb.append(tp)
    return pb

In [10]:
# Generate sentence from words in dataset
def get_sentence(words):    
    tokenized_text = []
    for i in words:
        sent = ''
        for h in i:
            if sent == '':
                sent = sent + h
            else:
                sent = sent+ " " +h
        tokens = tokenizer.tokenize(sent)
        tid = tokenizer.encode(tokens, add_special_tokens=False)
        tokenized_text.append(tid)
    return tokenized_text

In [11]:
def pad_func(data):
    max_len = 0
    for i in data:
        if len(i) > max_len:
            max_len = len(i)
    padded = [i + [0]*(max_len-len(i)) for i in data]
    return padded

In [12]:
# Specifying file names
TRAINING_FILE = "train.txt"
DEV_FILE = "dev.txt"
TEST_FILE = "test_data.txt"


In [13]:
# Preprocessing work on the dataset 
trainText = word_traindev_Data(load_dataset(TRAINING_FILE))
testEval = word_test_Data(load_dataset(TEST_FILE))
devText = word_traindev_Data(load_dataset(DEV_FILE))

trainWords,trainTags, trainLabels = data_preprocess_train_dev(trainText)
devWords, devTags, devLabels = data_preprocess_train_dev(devText)
testWords = data_preprocess_test(testEval)


In [14]:
#Training data
train_tokens = get_sentence(trainWords)
train_probablities = prob_list(trainWords,trainLabels)
train_tokens_pad = pad_func(train_tokens)
train_probablities_pad = pad_func(train_probablities)

In [15]:
#Dev data
dev_tokens = get_sentence(devWords)
dev_probablities = prob_list(devWords,devLabels)
dev_tokens_pad = pad_func(dev_tokens)
dev_probablities_pad = pad_func(dev_probablities)

In [16]:
#defining the model class
class ErnieModel(nn.Module):
    def __init__(self):
        super(ErnieModel, self).__init__()
        self.ernie = pre_trained_model
        self.linear = nn.Linear(1024, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens):
        pooled_output,_ = self.ernie(tokens)
        linear_output = self.linear(pooled_output)
        proba = self.sigmoid(linear_output)
        return proba

In [17]:
# Training the model
model = ErnieModel()
model_path = 'emphasismodel.pth'
early_stopping = EarlyStopping(model_path,4,True)
optimizer = optim.Adamax(model.parameters(), lr=0.1)
loss_func = nn.MSELoss(reduction = 'mean')
batch = 32
for epoch_num in range(10):
    model.train()
    start_time = time.time()
    print("Running epoch number ---->{}".format(epoch_num))
    training_loss = []
    validation_loss = []
    for i in range(0, len(train_tokens_pad), batch):
        model.zero_grad()
        train_probas = model(torch.tensor(train_tokens_pad[i:i+batch]))
        train_grd_truth = []
        for i in train_probablities_pad[i:i+batch]:
            p = []
            for j in i:
                q=[]
                q.append(j)
                p.append(q)
            train_grd_truth.append(p)
        train_batch_loss = loss_func(train_probas, torch.tensor(train_grd_truth))
        training_loss.append(train_batch_loss.item())
        train_batch_loss.backward()
        optimizer.zero_grad()
        optimizer.step()
        
    print("Training loss ---->{}".format((np.average(training_loss))))
    print("Total runtime ----> %s seconds\n" % (time.time() - start_time))
    
    #Validation Run
    model.eval()
    start_time = time.time()
    for i in range(0, len(dev_tokens_pad), batch):
        dev_probas = model(torch.tensor(dev_tokens_pad[i:i+batch]))
        dev_grd_truth = []
        for i in dev_probablities_pad[i:i+batch]:
            p = []
            for j in i:
                q=[]
                q.append(j)
                p.append(q)
            dev_grd_truth.append(p)
        dev_batch_loss = loss_func(dev_probas, torch.tensor(dev_grd_truth))
        validation_loss.append(dev_batch_loss.item())
        
    print("Validation loss ---->{}".format((np.average(validation_loss))))
    print("Total runtime ----> %s seconds\n" % (time.time() - start_time))
    early_stopping(np.average(validation_loss), model)

    if early_stopping.early_stop is True:
        print("Early stopping")
        break
    

Running epoch number ---->0
Training loss ---->0.22760500412347706
Total runtime ----> 1451.2978570461273 seconds

Validation loss ---->0.21707046031951904
Total runtime ----> 41.52866888046265 seconds

Validation loss is (inf --> 0.21707).  Saving model ...
Running epoch number ---->1
Training loss ---->0.2268183042490205
Total runtime ----> 1444.3351719379425 seconds

Validation loss ---->0.21707046031951904
Total runtime ----> 39.96937441825867 seconds

Validation loss is (0.21707 --> 0.21707).  Saving model ...
Running epoch number ---->2
Training loss ---->0.22727009550083516
Total runtime ----> 1411.040531873703 seconds

Validation loss ---->0.21707046031951904
Total runtime ----> 39.42757558822632 seconds

Validation loss is (0.21707 --> 0.21707).  Saving model ...
Running epoch number ---->3
Training loss ---->0.22665972921044328
Total runtime ----> 1407.9004814624786 seconds

Validation loss ---->0.21707046031951904
Total runtime ----> 38.54274368286133 seconds

Validation los

In [18]:
#saving the model
torch.save(model,model_path)

In [19]:
#loading the trained model
model = torch.load(model_path)

In [43]:
#Test Data
tokenized_test_text = []
for i in testWords:
    sent = ""
    for j in i:
        if sent == "":
            sent += j
        else:
            sent = sent + " " + j
    tokenized_test_text.append(sent)

In [48]:
# Testing the model on Test Dataset
test_prob=[]

for batch_data in tokenized_test_text:
    tokens = tokenizer.tokenize(batch_data)
    tid = tokenizer.encode_plus(tokens, add_special_tokens=False, return_attention_mask=False, return_tensors='pt')
    test_probas = model(tid['input_ids'])
    test_probas=test_probas.data
    out = batch_data.split(" ")
    temp_ans = []
    index = 0
    for i in out:
        if (len(tokenizer.tokenize(i))) == 1:
            temp_ans.append(test_probas[0][index].item())
            index = index + 1
        else:
            holder = []
            for j in range(len(tokenizer.tokenize(i))):
                holder.append(test_probas[0][index].item())
                index = index + 1
            prb = np.average(holder)
            temp_ans.append(prb) 
    test_prob.append(temp_ans)
    

In [50]:
#Predicted Probablities
for i in range(10):
    print("Sentence = {}".format(tokenized_test_text[i]))
    print("Emphasis Values = {}\n".format(test_prob[i]))

Sentence = We 'll be closed from 12/24 to 1/1 . See you in the New Year !
Emphasis Values = [0.44879353046417236, 0.466549813747406, 0.5147578120231628, 0.44036665558815, 0.489033579826355, 0.4737546344598134, 0.48464372754096985, 0.4211202661196391, 0.4757114350795746, 0.4624374508857727, 0.43629321455955505, 0.523070216178894, 0.5038394331932068, 0.470909059047699, 0.5153239965438843, 0.45707380771636963]

Sentence = No matter how hard you work , someone else is working harder .
Emphasis Values = [0.38491129875183105, 0.3850019574165344, 0.5098055005073547, 0.4513833224773407, 0.40719500184059143, 0.48779892921447754, 0.45820364356040955, 0.39396074414253235, 0.47681671380996704, 0.3848966956138611, 0.4817645251750946, 0.5142143964767456, 0.38491830229759216]

Sentence = The less I needed , the better I felt .
Emphasis Values = [0.46332982182502747, 0.4040818512439728, 0.44980910420417786, 0.5157061219215393, 0.49623650312423706, 0.4885362386703491, 0.45792195200920105, 0.45895540714