In [125]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#importing the data
f = open('/content/train.txt','r')
temp_sentance = []
temp_tags = []
traning_sentances = [] #sentance of the training data
traning_tags = []  #tab of the training data
for line in f:
    line = line.rstrip('\n')
    items  = line.split(' ')
    
    if line == '':
        traning_sentances.append(temp_sentance)
        traning_tags.append(temp_tags)
        temp_sentance = []
        temp_tags = []
        
        
    else:
        word = items[0]
        tag = items[3]
        temp_sentance.append(word)
        temp_tags.append(tag)
        
        
f.close()

In [126]:
training_data = [] #the input of training
for i in range (0,len(traning_sentances)):
    training_data.append((traning_sentances[i],traning_tags[i]))

#Vocab:
vocab = {}
tags_set = {} 

for sent, tags in training_data:
    for word in sent:
        if word not in vocab:  # word has not been assigned an index yet
            vocab[word] = len(vocab)  # Assign each word with a unique 
    for tag in tags:
        if tag not in tags_set:
            tags_set[tag] = len(tags_set)
#Assign the tag
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [144]:
fv = open('/content/valid.txt','r')
temp_sentance = []
temp_tags = []
validate_sentances = [] #sentance of the training data
validate_tags = []  #tab of the training data
for line in fv:
    line = line.rstrip('\n')
    items  = line.split(' ')
    
    if line == '':
        validate_sentances.append(temp_sentance)
        validate_tags.append(temp_tags)
        temp_sentance = []
        temp_tags = []
        
        
    else:
        word = items[0]
        tag = items[3]
        temp_sentance.append(word)
        temp_tags.append(tag)
        
        
fv.close()
validate_data = [] #the input of training
for i in range (0,len(validate_sentances)):
    validate_data.append((validate_sentances[i],validate_tags[i]))

#Vocab:
vocab_validate = {}
tags_set_validate = {} 

for sent, tags in validate_data:
    for word in sent:
        if word not in vocab_validate:  # word has not been assigned an index yet
            vocab_validate[word] = len(vocab_validate)  # Assign each word with a unique 
    for tag in tags:
        if tag not in tags_set_validate :
            tags_set_validate [tag] = len(tags_set_validate )
#Assign the tag

In [145]:
#create the model
class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [146]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(vocab), len(tags_set))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[1][0], vocab)
    tag_scores = model(inputs)


In [147]:
loss_track_lstm = []
for epoch in range(20): 
    training_loss = 0.
    
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, vocab)
        targets = prepare_sequence(tags, tags_set)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
    training_loss /= i
    loss_track_lstm.append(training_loss)
    print(f'epoch: {epoch}, training_loss {training_loss}')


# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], vocab)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

epoch: 0, training_loss 3.3972879949938317
epoch: 1, training_loss 2.76139103317728
epoch: 2, training_loss 2.3144302534441383
epoch: 3, training_loss 2.024062489461222
epoch: 4, training_loss 1.8398382056199591
epoch: 5, training_loss 1.6879057591707682
epoch: 6, training_loss 1.5260604351631544
epoch: 7, training_loss 1.4000191406803522
epoch: 8, training_loss 1.3345054451724274
epoch: 9, training_loss 1.2379102861547095
epoch: 10, training_loss 1.1722143221509087
epoch: 11, training_loss 1.1140061067655107
epoch: 12, training_loss 1.0398851627780015
epoch: 13, training_loss 0.991050560183293
epoch: 14, training_loss 0.9437430990499386
epoch: 15, training_loss 0.9228904914243808
epoch: 16, training_loss 0.891853737344878
epoch: 17, training_loss 0.8311541569180495
epoch: 18, training_loss 0.8056381019364237
epoch: 19, training_loss 0.7607659367790959
tensor([[-3.6622e-03, -7.9277e+00, -6.7947e+00, -6.6209e+00, -8.3193e+00,
         -8.3315e+00, -8.7192e+00, -8.5903e+00, -1.1597e+01]]

In [148]:
model.eval()
y_pred = []
y_label = []

for item in validate_data:
  tags = item[1]
  for tag in tags:
    y_label.append(tag)









In [149]:
tag_list = list(tags_set.keys())
for sentance , tags in  validate_data:
  inputs = prepare_sequence(sentance, vocab_validate)

  inputs = inputs
  tag_scores = model(inputs)
# print the most likely tag index, by grabbing the index with the maximum score!
# recall that these numbers correspond to tag2idx = {"DET": 0, "NN": 1, "V": 2}
  _, predicted_tags = torch.max(tag_scores, 1)
  for idx in predicted_tags:
    predict_tag = tag_list[idx]
    y_pred.append(predict_tag)

print(y_pred)


['O', 'B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'B-PER', 'I-PER', 'B-LOC', 'O', 'O', 'B-MISC', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'O', 'B-MISC', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-ORG', 'I-PER', 'O', 'B-PER', 'O', 'B-ORG', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-

# New Section

In [151]:
print(len(y_pred), len(y_label))
count = 0
for i in range(len(y_pred)):
  if (y_pred[i] == y_label[i]):
    count +=1

count/51578

51578 51578


0.6290860444375509