In [79]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import pickle

In [80]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)

print("Using device:", device)


Using device: cuda


In [81]:
# load the dataset
with open('./UD_English-Atis/en_atis-ud-dev.conllu', 'r') as f:
    dev_data = f.read()
    
with open('./UD_English-Atis/en_atis-ud-train.conllu', 'r') as f:
    train_data = f.read()
    
with open('./UD_English-Atis/en_atis-ud-test.conllu', 'r') as f:
    test_data = f.read()
    
print("Train data size:", len(train_data))
print("Dev data size:", len(dev_data))
print("Test data size:", len(test_data))
# print("Train data sample:", train_data[:1000])

Train data size: 2588142
Dev data size: 349562
Test data size: 350662


In [82]:
def get_sentences(data):
    
    """ Function to get sentences from the dataset """
    
    sentences=[]
    for line in data.split('\n'):
        if line.startswith('# text = '):
            sentences.append(line[9:])
            
    return sentences


train_sentences = get_sentences(train_data)
test_sentences = get_sentences(test_data)
dev_sentences = get_sentences(dev_data)
        
print ("Train sentences size:", len(train_sentences))
print("Test sentences size:", len(train_sentences))    
print("Dev sentences size:", len(dev_sentences)) 

Train sentences size: 4274
Test sentences size: 4274
Dev sentences size: 572


In [83]:
def get_labels(data):
    labels = []
    words = []
    output_data = []

    for line in data.split('\n'):
        if (line):
            if line.startswith('# text = ') or line.startswith('# sent_id = '):
                continue

            temp = line.split('\t')[0:4] # getting the first 4 elements of the line
            temp=temp[0:2]+[temp[3]] # removing the 3rd element of the line
            
            # temp = index, word, label
            
            if temp[0] == '1':
                words=[]
                labels=[]
                output_data.append((words, labels))
                    
            words.append(temp[1])
            labels.append(temp[2])
            
            
    return output_data   


training_data = get_labels(train_data)
testing_data = get_labels(test_data)
dev_data = get_labels(dev_data)        

In [84]:
print(training_data)

[(['what', 'is', 'the', 'cost', 'of', 'a', 'round', 'trip', 'flight', 'from', 'pittsburgh', 'to', 'atlanta', 'beginning', 'on', 'april', 'twenty', 'fifth', 'and', 'returning', 'on', 'may', 'sixth'], ['PRON', 'AUX', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'VERB', 'ADP', 'NOUN', 'NUM', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'ADJ']), (['now', 'i', 'need', 'a', 'flight', 'leaving', 'fort', 'worth', 'and', 'arriving', 'in', 'denver', 'no', 'later', 'than', '2', 'pm', 'next', 'monday'], ['ADV', 'PRON', 'VERB', 'DET', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'CCONJ', 'VERB', 'ADP', 'PROPN', 'DET', 'ADJ', 'ADP', 'NUM', 'NOUN', 'ADJ', 'NOUN']), (['i', 'need', 'to', 'fly', 'from', 'kansas', 'city', 'to', 'chicago', 'leaving', 'next', 'wednesday', 'and', 'returning', 'the', 'following', 'day'], ['PRON', 'VERB', 'PART', 'VERB', 'ADP', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'VERB', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'DET', 'VERB', 'NOUN']), (['what', 'is', 'the', 'mean

In [85]:
word2idx = {}
tags2idx = {}
idx2tag = {}

for sent, tags in training_data:
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    
    for tag in tags:
        if tag not in tags2idx:
            tags2idx[tag] = len(tags2idx)
            idx2tag[len(tags2idx)-1] = tag
            
word2idx['<unk>'] = len(word2idx)
tags2idx['<unk>'] = len(tags2idx)
idx2tag[len(tags2idx)-1] = '<unk>'

# save the dictionaries 
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)
    
with open('tags2idx.pkl', 'wb') as f:
    pickle.dump(tags2idx, f)

with open('idx2tag.pkl', 'wb') as f:
    pickle.dump(idx2tag, f)

print("Vocab size:", len(word2idx))
print("Tags size:", len(tags2idx))
print("Tags:", tags2idx)
# print("Vocab:", word2idx)

Vocab size: 864
Tags size: 14
Tags: {'PRON': 0, 'AUX': 1, 'DET': 2, 'NOUN': 3, 'ADP': 4, 'PROPN': 5, 'VERB': 6, 'NUM': 7, 'ADJ': 8, 'CCONJ': 9, 'ADV': 10, 'PART': 11, 'INTJ': 12, '<unk>': 13}


In [86]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

In [87]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()

        self.hidden_dim = hidden_dim  # the number of features in the hidden state h
        self.word_embeddings = nn.Embedding(
            vocab_size, embedding_dim)  # embedding layer

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)  # LSTM layer

        # the linear layer that maps from hidden state space to tag space-
        self.hidden2tag = nn.Linear(
            hidden_dim, tagset_size)  # fully connected layer
        self.hidden = self.init_hidden()  # initialize the hidden state

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        return (torch.zeros(1, 1, self.hidden_dim).to(device),
                torch.zeros(1, 1, self.hidden_dim).to(device))  # (h0, c0)

    def forward(self, sentence):
        # get the embedding of the words
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)  # pass the embedding to the LSTM layer
        # pass the output of the LSTM layer to the fully connected layer
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # get the softmax of the output of the fully connected layer
        tag_scores = F.log_softmax(tag_space, dim=1) 
        return tag_scores


In [88]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), len(tags2idx)).to(device) # initialize the model

loss_function = nn.NLLLoss() # define the loss function
optimizer = optim.SGD(model.parameters(), lr=0.1) # define the optimizer

    
def prepare_sequence(seq, to_idx):
    idxs=[to_idx['<unk>'] if w not in to_idx else to_idx[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long).to(device)

    

In [89]:
# Training the model and evaluating it on the test set

def train_model(model, data, num_epoch):
    
    for epoch in range(num_epoch):    
        for sentence, tags in data:
            model.zero_grad() # clear the gradients of all optimized variables
            model.hidden = model.init_hidden() # initialize the hidden state

            sentence_in = prepare_sequence(sentence, word2idx) # convert the sentence to a tensor
            targets = prepare_sequence(tags, tags2idx) # convert the tags to a tensor

            tag_scores = model(sentence_in) # forward pass
            loss = loss_function(tag_scores, targets) # calculate the loss
            loss.backward() # backward pass
            optimizer.step() # update the parameters
        
        
        print("Epoch:", epoch, "Loss:", loss.item())
            
    

In [90]:
train_model(model, training_data, 3)

Epoch: 0 Loss: 0.0002589538926258683
Epoch: 1 Loss: 0.0001228894107043743
Epoch: 2 Loss: 8.448152948403731e-05
Epoch: 3 Loss: 5.957078974461183e-05
Epoch: 4 Loss: 4.331556920078583e-05


In [91]:
# save the model

model_path = 'pos_tagger_pretrained_model.pt'
torch.save(model.state_dict(), model_path)

In [92]:
def evaluate(model, data):
    model.eval()
    correct = 0
    total = 0
    for sentence, tags in data:
        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tags2idx)
        tag_scores = model(sentence_in)
        _, predicted = torch.max(tag_scores, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
    return 100 * correct / total

In [93]:
print("Accuracy on the test set:", evaluate(model, testing_data))
print("Accuracy on the training set:", evaluate(model, training_data))
print("Accuracy on the dev set:", evaluate(model, dev_data))


Accuracy on the test set: 95.66869300911854
Accuracy on the training set: 96.8081389374165
Accuracy on the dev set: 95.8007224563516


In [94]:
sentence="Mary had a little lamb"

sentence = sentence.lower().split()
print("Sentence:", sentence)
print("Predicted:", [idx2tag[i] for i in model(prepare_sequence(sentence, word2idx)).argmax(1).tolist()])


Sentence: ['mary', 'had', 'a', 'little', 'lamb']
Predicted: ['NOUN', 'NOUN', 'DET', 'NOUN', 'NOUN']
