<a href="https://colab.research.google.com/github/patanjali-b/Lesk_WSD/blob/main/POS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
import conllu
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# from 'dataset.py' import POSTaggingDataset

In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"


In [31]:
with open('/en_atis-ud-train.conllu', 'r', encoding="utf-8") as f:
    sentences = conllu.parse(f.read())

    train_data = []

    for sentence in sentences:
        indexes = []
        words = []
        pos = []
        # print(sentence)
        for i in range(len(sentence)):
            indexes.append(sentence[i]['id'])
            words.append(sentence[i]['form'])
            pos.append(sentence[i]['upos'])
        tagged_sentence = (words, pos)
        train_data.append(tagged_sentence)
print(train_data[0])


(['what', 'is', 'the', 'cost', 'of', 'a', 'round', 'trip', 'flight', 'from', 'pittsburgh', 'to', 'atlanta', 'beginning', 'on', 'april', 'twenty', 'fifth', 'and', 'returning', 'on', 'may', 'sixth'], ['PRON', 'AUX', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'VERB', 'ADP', 'NOUN', 'NUM', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'ADJ'])


In [32]:
tag_to_idx = {
    "ADJ": 0,
    "ADP": 1,
    "ADV": 2,
    "AUX": 3,
    "CCONJ": 4,
    "DET": 5,
    "INTJ": 6,
    "NOUN": 7,
    "NUM": 8,
    "PART": 9,
    "PRON": 10,
    "PROPN": 11,
    "PUNCT": 12,
    "SCONJ": 13,
    "SYM": 14,
    "VERB": 15,
    "X": 16,
    "UNK":17
}

tagset_size = len(tag_to_idx)



In [33]:
word_to_idx = {}
for sentence, tags in train_data:
    for word in sentence:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
word_to_idx["UNK"] = len(word_to_idx)
vocab_size = len(word_to_idx)


# MODEL

In [34]:
embedding_dim = 300
hidden_dim = 300

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
    def init_hidden(self):
      return (torch.zeros(1,1,self.hidden_dim), torch.zeros(1,1,self.hidden_dim))
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [35]:
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)   

def input_seq(sentence, word_to_idx):
    idxs = []
    for w in sentence:
          if w in word_to_idx:
                idxs.append(word_to_idx[w])
          if w not in word_to_idx:
                idxs.append(word_to_idx["UNK"])


    # length = len(sentence)
    # if(length < 46):
    #   while(46-length >0 and flag == 1):
    #     idxs.append(len(word_to_idx))
    #   while(46-length >0 and flag == 0):
    #     idxs.append(len(tag_to_idx))      
        
    return torch.tensor(idxs, dtype=torch.long).to(device)

for epoch in range(15): 
    print("Epoch: ", epoch)
    print("Device type = ", device)
    for sentence, tags in train_data:
        model.zero_grad()
        model.hidden = model.init_hidden()
        input_sentence = input_seq(sentence, word_to_idx)
        targets = input_seq(tags, tag_to_idx)

        tag_scores = model(input_sentence).to(device)

        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

    print("Loss: ", loss.item())
    #calculate accuracy
    correct_counts = 0
    total_counts = 0
    with torch.no_grad():
        for sentence, tags in train_data:
            input_sentence = input_seq(sentence, word_to_idx)
            targets = input_seq(tags, tag_to_idx)
            tag_scores = model(input_sentence).to(device)
            #print("Tag scores = ", len(tag_scores))
            for i in range(len(tag_scores)):
                predicted_index = torch.argmax(tag_scores[i])
                if predicted_index == targets[i]:
                    correct_counts += 1
                total_counts += 1
        print("Training Accuracy = ", correct_counts/total_counts)
        

# torch.save(model.state_dict(), "model.pt")

"""# TEST THE MODEL"""


# torch.load("model.pt")
# model.eval()


Epoch:  0
Device type =  cpu
Loss:  0.0005204427870921791
Training Accuracy =  0.9540232247456583
Epoch:  1
Device type =  cpu
Loss:  0.00019949952547904104
Training Accuracy =  0.9632514643921488
Epoch:  2
Device type =  cpu
Loss:  0.00012994921416975558
Training Accuracy =  0.9658411262974
Epoch:  3
Device type =  cpu
Loss:  0.00011509454634506255
Training Accuracy =  0.9666015825711644
Epoch:  4
Device type =  cpu
Loss:  9.526610665488988e-05
Training Accuracy =  0.9674031445894564
Epoch:  5
Device type =  cpu
Loss:  7.904250378487632e-05
Training Accuracy =  0.9680608365019011
Epoch:  6
Device type =  cpu
Loss:  7.028233085293323e-05
Training Accuracy =  0.9686363169252903
Epoch:  7
Device type =  cpu
Loss:  6.024073081789538e-05
Training Accuracy =  0.968903504264721
Epoch:  8
Device type =  cpu
Loss:  5.0422320782672614e-05
Training Accuracy =  0.9690679272428322
Epoch:  9
Device type =  cpu
Loss:  4.225747397867963e-05
Training Accuracy =  0.9694995375603741
Epoch:  10
Device ty

'# TEST THE MODEL'