In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.modules import padding
from torch.optim.lr_scheduler import StepLR

## Finding the distribution of TAGS

In [2]:
with open('train', "r") as f:
  d={}
  for line in f:
    line = line.strip()
    if len(line) != 0:
      parts = line.split(" ")
      label = parts[2]
      d[label]=1+d.get(label,0)

d

{'B-ORG': 6321,
 'O': 170524,
 'B-MISC': 3438,
 'B-PER': 6600,
 'I-PER': 4528,
 'B-LOC': 7140,
 'I-ORG': 3704,
 'I-MISC': 1155,
 'I-LOC': 1157}

## Creating Custom Dataset loader for Train and Dev



In [3]:
class NERDataset(Dataset):
    def __init__(self, filename):
        self.data = []
        self.word2idx = {'<unk>':1,'<unkcap>':2}
        self.label2idx = {}
        self.max_sent_len = 0
        self.wordCounter={}

        with open(filename, "r") as f:
            sentence = []
            labels = []
            for line in f:
                line = line.strip()
                if len(line) != 0:
                    parts = line.split(" ")
                    word = parts[1]
                    self.wordCounter[word]=1+self.wordCounter.get(word,0)

        for word,count in self.wordCounter.items():
          if count>1 and word not in self.word2idx:
            self.word2idx[word]=len(self.word2idx)

        with open(filename, "r") as f:
            sentence = []
            labels = []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)

                    self.data.append((sentence, labels))
                    sentence = []
                    labels = []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    label = parts[2]
                    if word not in self.word2idx:
                      if word[0].isupper():
                        word='<unkcap>'
                      else:
                        word='<unk>'
                        
                      
                    if label not in self.label2idx:
                        self.label2idx[label] = len(self.label2idx)

                    sentence.append(self.word2idx[word])
                    labels.append(self.label2idx[label])

        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)

            self.data.append((sentence, labels))
        
    
        self.word2idx['<PAD>'] = len(self.word2idx)
        self.pad_idx = self.word2idx['<PAD>']
        
        
        # Pad sentences
        self.x = [torch.tensor(s) for s, _ in self.data]
        self.x = pad_sequence(self.x, batch_first=True, padding_value=self.pad_idx)

        # Pad labels
        self.y = [torch.tensor(l) for _, l in self.data]
        self.y = pad_sequence(self.y, batch_first=True,padding_value=self.pad_idx)

        # Calculate lengths
        self.lengths = [len(s) for s, _ in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index], self.y[index]



In [4]:
class ValidateNERDataset(Dataset):
    def __init__(self, filename,word2idx,label2idx):
        self.data = []
        self.word2idx = word2idx
        self.label2idx = label2idx
        self.max_sent_len = 0

        with open(filename, "r") as f:
            sentence = []
            labels = []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)

                    self.data.append((sentence, labels))
                    sentence = []
                    labels = []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    label = parts[2]
                    if word not in self.word2idx:
                      if word[0].isupper():
                        word = '<unkcap>'
                      else:
                        word='<unk>'

                    sentence.append(self.word2idx[word])
                    labels.append(self.label2idx[label])

        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)

            self.data.append((sentence, labels))


        self.pad_idx = self.word2idx['<PAD>']
        
        # Pad sentences
        self.x = [torch.tensor(s) for s, _ in self.data]
        self.x = pad_sequence(self.x, batch_first=True,padding_value=self.pad_idx)

        # Pad labels
        self.y = [torch.tensor(l) for _, l in self.data]
        self.y = pad_sequence(self.y, batch_first=True,padding_value=self.pad_idx)

        # Calculate lengths
        self.lengths = [len(s) for s, _ in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index], self.y[index]



## BLSTM Model

In [5]:
class BLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, label_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=train_dataset.pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True ,bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.act = nn.ELU(alpha=0.75)
        self.classifier = nn.Linear(output_dim,label_dim)

    def forward(self, x, x_lengths):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, x_lengths, batch_first=True,enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        output = self.dropout(output)
        output=self.fc(output)
        output = self.act(output)
        output=self.classifier(output)
        output = output.permute(0, 2, 1)
        return output


## Setting the hyperparameter

In [56]:
batch_size = 8
train_dataset = NERDataset('train')

# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
OUTPUT_LABEL_DIM = len(train_dataset.label2idx)
DROPOUT = 0.33
#learning rate .1 is best
LEARNING_RATE = .005
EPOCHS = 50
STEP_SIZE = 20
GAMMA = 1


In [57]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BLSTM(len(train_dataset.word2idx), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, OUTPUT_LABEL_DIM).to(device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE,momentum=.95)
scheduler = StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
#freq=list(d.values())
#sum_freq = sum(freq)
#result = [(1.5 - f/sum_freq) for f in freq]
#class_weights = torch.tensor(result).to(device)
#class_weights = torch.tensor([1.6, .5, 1.6, 1.1, 1.4, 1.1, 1.6, 2.1, 2.1]).to(device)

#class_weights = torch.tensor([1.6, .7, 1.6, 1.8, 1.7, 1.6, 1.7, 1.6, 1.6]).to(device)
class_weights = torch.tensor([1.7, .75, 1.5, 1.8, 1.7, 1.8, 1.7, 1.6, 1.5]).to(device)

#class_weights = torch.tensor([2, .5, 2, 1.1, 1.4, 1.1, 1.6, 2.1, 2.1]).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.pad_idx,weight=class_weights).to(device)


In [8]:
train_dataset.label2idx

{'B-ORG': 0,
 'O': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [9]:
dev_dataset = ValidateNERDataset('dev',train_dataset.word2idx,train_dataset.label2idx)
dev_loader = DataLoader(dev_dataset)

## Training the Model

In [58]:
min_val_loss=5
# Train the model
for epoch in range(EPOCHS):
    model.train()

    for batch_idx, (x, lengths, y) in enumerate(train_loader):
        optimizer.zero_grad()

        target_packed_embedded = nn.utils.rnn.pack_padded_sequence(y.to(device), lengths, batch_first=True, enforce_sorted=False)
        target, target_lengths = nn.utils.rnn.pad_packed_sequence(target_packed_embedded, batch_first=True)
        
        
        #output = model(x.to(device), lengths.to(device))
        output = model(x.to(device), lengths.cpu())

        loss = criterion(output, target.to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss {loss.item()}")

        # Compute and print the validation loss
        if batch_idx % 2000 == 0:
            model.eval()  # Set the model to evaluation mode
            with torch.no_grad():
                val_loss = 0
                for val_x, val_lengths, val_y in dev_loader:
                    val_target_packed_embedded = nn.utils.rnn.pack_padded_sequence(val_y.to(device), val_lengths, batch_first=True, enforce_sorted=False)
                    val_target, val_target_lengths = nn.utils.rnn.pad_packed_sequence(val_target_packed_embedded, batch_first=True)
                    
                    val_output = model(val_x.to(device), val_lengths.cpu())
                    val_loss += criterion(val_output, val_target.to(device)).item()
                
                print(f"Epoch {epoch}, Batch {batch_idx}, Validation Loss {val_loss/len(dev_loader)}")
                if min_val_loss>val_loss/len(dev_loader):
                  min_val_loss=val_loss/len(dev_loader)
                  torch.save(model.state_dict(), 'task_minloss.pt')


            model.train()  # Set the model back to training model


Epoch 0, Batch 0, Loss 2.2467987537384033
Epoch 0, Batch 0, Validation Loss 2.1530335838232024
Epoch 0, Batch 100, Loss 0.5747454166412354
Epoch 0, Batch 200, Loss 0.3972536027431488
Epoch 0, Batch 300, Loss 0.2924852967262268
Epoch 0, Batch 400, Loss 0.27404943108558655
Epoch 0, Batch 500, Loss 0.22460637986660004
Epoch 0, Batch 600, Loss 0.3476652204990387
Epoch 0, Batch 700, Loss 0.21437563002109528
Epoch 0, Batch 800, Loss 0.276003360748291
Epoch 0, Batch 900, Loss 0.2197766900062561
Epoch 0, Batch 1000, Loss 0.32000187039375305
Epoch 0, Batch 1100, Loss 0.2562519609928131
Epoch 0, Batch 1200, Loss 0.8866047263145447
Epoch 0, Batch 1300, Loss 0.19038942456245422
Epoch 0, Batch 1400, Loss 0.16637755930423737
Epoch 0, Batch 1500, Loss 0.738795816898346
Epoch 0, Batch 1600, Loss 0.7032221555709839
Epoch 0, Batch 1700, Loss 0.30590152740478516
Epoch 0, Batch 1800, Loss 0.3025318682193756
Epoch 1, Batch 0, Loss 0.2271697074174881
Epoch 1, Batch 0, Validation Loss 0.9779573119639388
Epoc

# Model Saving and Loading

In [61]:
torch.save(model.state_dict(), 'blstm1.pt')


In [62]:
model2 = BLSTM(len(train_dataset.word2idx), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, OUTPUT_LABEL_DIM).to(device)


In [63]:
model2.load_state_dict(torch.load('blstm1.pt'))
#model2.load_state_dict(torch.load('task_minloss.pt'))


<All keys matched successfully>

# Dev Prediction

In [64]:
from sklearn.metrics import precision_recall_fscore_support

model2.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for x, lengths, y in dev_loader:
        x = x.to(device)
        y = y.to(device)
        target_packed_embedded = nn.utils.rnn.pack_padded_sequence(y, lengths, batch_first=True, enforce_sorted=False)
        target, target_lengths = nn.utils.rnn.pad_packed_sequence(target_packed_embedded, batch_first=True)

        output = model2(x, lengths)

        predicted = torch.argmax(output, dim=1)
        predicted_labels.extend(predicted.cpu().numpy().tolist())
        true_labels.extend(target.cpu().numpy().tolist())


In [66]:
y_pred = [element for sub_list in predicted_labels for element in sub_list]
y_true=[element for sub_list in true_labels for element in sub_list]

list1 = y_true.copy()
list2 = y_pred.copy()
value_to_remove = train_loader.dataset.pad_idx
i = 0
while i < len(list1):
    if list1[i] == value_to_remove:
        # remove the element from list1
        list1.pop(i)
        # remove the corresponding element from list2 by using its index
        list2.pop(i)
    else:
        # only increment the loop counter if an element wasn't removed
        i += 1

print(list1)  
print(list2) 


[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 2, 7, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 

In [67]:

precision, recall, f1_score, _ = precision_recall_fscore_support(list1, list2, average='weighted')
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1_score:.4f}')


Precision: 0.9598, Recall: 0.9607, F1 score: 0.9598


## dev.out file for evaluating on PERL




In [68]:

devOutput = open("dev.out", "w")
k=0
i=0
idx2label = {value: key for key, value in train_dataset.label2idx.items()}



with open('/content/dev', 'r') as f:
  for line in f:
    
    line = line.strip().split(' ')
    #print(line)
    if len(line)>1:
      idx,word,gold  = line[0], line[1],line[2]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      devOutput.write(f"{idx} {word} {gold} {key}\n")
    else:
      devOutput.write(f"\n")
      k=k+1
      i=0    
f.close()
devOutput.close()


## dev1.out file for submission (in same format as train)

In [69]:

devOutput = open("dev1.out", "w")
k=0
i=0

with open('/content/dev', 'r') as f:
  for line in f:
    line = line.strip().split(' ')
    if len(line)>1:
      idx,word,gold  = line[0], line[1],line[2]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      devOutput.write(f"{idx} {word} {key}\n")
    else:
      devOutput.write(f"\n")
      k=k+1
      i=0    
f.close()
devOutput.close()


# Test Predictions

In [70]:
class TestNERDataset(Dataset):
    def __init__(self, filename,word2idx):
        self.data = []
        self.word2idx = word2idx
        self.max_sent_len = 0

        with open(filename, "r") as f:
            sentence = []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)

                    self.data.append(sentence)
                    sentence = []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    if word not in self.word2idx:
                      if word[0].isupper():
                        word = '<unkcap>'
                      else:
                        word='<unk>'

                    sentence.append(self.word2idx[word])

        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)

            self.data.append(sentence)


        self.pad_idx = self.word2idx['<PAD>']
        
        # Pad sentences
        self.x = [torch.tensor(s) for s in self.data]
        self.x = pad_sequence(self.x, batch_first=True,padding_value=self.pad_idx)

        # Calculate lengths
        self.lengths = [len(s) for s in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index]



In [71]:
test_dataset = TestNERDataset('test',train_dataset.word2idx)
test_loader = DataLoader(test_dataset)

In [72]:
from sklearn.metrics import precision_recall_fscore_support

model2.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for x, lengths in test_loader:
        x = x.to(device)
      
        output = model2(x, lengths)

        predicted = torch.argmax(output, dim=1)
        predicted_labels.extend(predicted.cpu().numpy().tolist())


## test1.out file for submission





In [73]:

testOutput = open("test1.out", "w")
k=0
i=0

with open('/content/test', 'r') as f:
  for line in f:
    line = line.strip().split(' ')
    if len(line)>1:
      idx,word  = line[0], line[1]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      testOutput.write(f"{idx} {word} {key}\n")
    else:
      testOutput.write(f"\n")
      k=k+1
      i=0    
f.close()
testOutput.close()
