In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.modules import padding
from torch.optim.lr_scheduler import StepLR

## Finding the distribution of TAGS

In [5]:
with open('train', "r") as f:
  d={}
  for line in f:
    line = line.strip()
    if len(line) != 0:
      parts = line.split(" ")
      label = parts[2]
      d[label]=1+d.get(label,0)
d

{'B-ORG': 6321,
 'O': 170524,
 'B-MISC': 3438,
 'B-PER': 6600,
 'I-PER': 4528,
 'B-LOC': 7140,
 'I-ORG': 3704,
 'I-MISC': 1155,
 'I-LOC': 1157}

## Loading pre-trained Glove Embeddings

In [6]:
#https://www.kaggle.com/code/fyycssx/first-try-lstm-with-glove-by-pytorch
embeddings_dictionary = dict()
glove_file = open('/content/drive/MyDrive/nlp/glove.6B.100d.txt', encoding="utf8")
word2idx={}
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
    if word not in word2idx:
      word2idx[word] = len(word2idx)
glove_file.close()


In [7]:
#adding special words to the vocabulary
word2idx['<unk>']=len(word2idx)
word2idx['<unkcap>']=len(word2idx)
word2idx['<pad>']=len(word2idx)


## Creating Custom Dataset loader for Train and Dev



In [8]:
class NERDataset(Dataset):
    def __init__(self, filename, word2idx):
        self.word2idx = word2idx
        self.label2idx = {}
        self.max_sent_len = 0
        self.data = []
        
        with open(filename, "r") as f:
            sentence, labels, boolean = [], [], []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)
                    self.data.append((sentence, labels, boolean))
                    sentence, labels, boolean = [], [], []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    label = parts[2]
                    if word[0].isupper():
                        boolean.append(1)
                    else:
                        boolean.append(0)
                    #word=word.lower() 
                    if word.lower() not in self.word2idx:
                        word = '<unkcap>' if word[0].isupper() else '<unk>'
                    if label not in self.label2idx:
                        self.label2idx[label] = len(self.label2idx)
                    sentence.append(self.word2idx[word.lower()])
                    labels.append(self.label2idx[label])
                    
        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)
            self.data.append((sentence, labels, boolean))

        self.word2idx['<pad>'] = len(self.word2idx)
        self.pad_idx = self.word2idx['<pad>']
        
        self.x, self.y, self.mask, self.lengths = [], [], [], []
        for sentence, labels, boolean in self.data:
            self.lengths.append(len(sentence))
            self.x.append(torch.tensor(sentence))
            self.y.append(torch.tensor(labels))
            self.mask.append(torch.tensor(boolean))
        
        self.x = pad_sequence(self.x, batch_first=True, padding_value=self.pad_idx)
        self.y = pad_sequence(self.y, batch_first=True, padding_value=self.pad_idx)
        self.mask = pad_sequence(self.mask, batch_first=True, padding_value=self.pad_idx)
        
        print('done')
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index], self.y[index], self.mask[index]


In [9]:
class ValidateNERDataset(Dataset):
    def __init__(self, filename,word2idx,label2idx):
        self.data = []
        self.word2idx = word2idx
        self.label2idx = label2idx
        self.max_sent_len = 0

        
        with open(filename, "r") as f:
            sentence, labels, boolean = [], [], []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)
                    self.data.append((sentence, labels, boolean))
                    sentence, labels, boolean = [], [], []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    label = parts[2]
                    if word[0].isupper():
                        boolean.append(1)
                    else:
                        boolean.append(0)
                    #word=word.lower() 
                    if word.lower() not in self.word2idx:
                        word = '<unkcap>' if word[0].isupper() else '<unk>'
                    sentence.append(self.word2idx[word.lower()])
                    labels.append(self.label2idx[label])
                    
        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)
            self.data.append((sentence, labels, boolean))

        self.word2idx['<pad>'] = len(self.word2idx)
        self.pad_idx = self.word2idx['<pad>']
        
        self.x, self.y, self.mask, self.lengths = [], [], [], []
        for sentence, labels, boolean in self.data:
            self.lengths.append(len(sentence))
            self.x.append(torch.tensor(sentence))
            self.y.append(torch.tensor(labels))
            self.mask.append(torch.tensor(boolean))
        
        self.x = pad_sequence(self.x, batch_first=True, padding_value=self.pad_idx)
        self.y = pad_sequence(self.y, batch_first=True, padding_value=self.pad_idx)
        self.mask = pad_sequence(self.mask, batch_first=True, padding_value=self.pad_idx)
        
        print('done')
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index], self.y[index], self.mask[index]


## BLSTM Modelwith Glove Embedding

In [45]:
class BLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, label_dim,embedding_mat):
        super().__init__()
        self.embedding = nn.Embedding(embedding_mat.shape[0], embedding_mat.shape[1])
        self.embedding.weight.data.copy_(embedding_mat)
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim+1, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ELU()
        self.classifier = nn.Linear(output_dim,label_dim)

    def forward(self, x, x_lengths, boolean):

        embedded = self.embedding(x)

        #Adding Boolean mask
        stacked_tensor = torch.cat((embedded, boolean.unsqueeze(-1)), dim=-1)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(stacked_tensor, x_lengths, batch_first=True,enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        output = self.dropout(output)
        output=self.fc(output)
        output = self.act(output)
        output=self.classifier(output)
        output = output.permute(0, 2, 1)
        return output


## Setting the hyperparameter

In [46]:
batch_size = 32
train_dataset = NERDataset('train',word2idx)

# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
OUTPUT_LABEL_DIM = len(train_dataset.label2idx)
DROPOUT = 0.33
LEARNING_RATE = .2
EPOCHS = 30
STEP_SIZE = 20
GAMMA = 1


done


## Building the Embedding Matrix from Glove Embedding Dictionary we formed

In [47]:
vectors = list(embeddings_dictionary.values())
unk_vector = .5*np.mean(vectors, axis=0)
unk_cap_vector = np.mean(vectors, axis=0)
#not needed cos anyway i have mebedding matrix all 0 initially
pad=np.zeros(100)

In [48]:
embedding_matrix = torch.zeros((len(train_dataset.word2idx)+1, EMBEDDING_DIM))

In [49]:
#unk tags tak mean of all vecotrs
#unkcap have .5 *unk
#pad all 0 

for word,index in word2idx.items():
  if word=='<unk>':
    embedding_matrix[index]=torch.from_numpy(unk_vector)
  elif word=='<unkcap>':
    embedding_matrix[index]=torch.from_numpy(unk_cap_vector)
  elif word=='<pad>':
    embedding_matrix[index]=torch.from_numpy(pad)
  else:
    embedding_vector = embeddings_dictionary.get(word)
    embedding_matrix[index] = torch.from_numpy(embedding_vector)


In [50]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BLSTM(len(train_dataset.word2idx), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, OUTPUT_LABEL_DIM,embedding_matrix).to(device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE,momentum=0.9)
scheduler = StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
freq=list(d.values())
sum_freq = sum(freq)
result = [(3.5 - f/sum_freq) for f in freq]
class_weights = torch.tensor(result).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.pad_idx,weight=class_weights).to(device)


In [51]:
dev_dataset = ValidateNERDataset('dev',train_dataset.word2idx,train_dataset.label2idx)
dev_loader = DataLoader(dev_dataset)


done


## Training the Model

In [52]:
# Train the model
for epoch in range(EPOCHS):
    model.train()

    for batch_idx, (x, lengths, y, boolean) in enumerate(train_loader):
        optimizer.zero_grad()

        target_packed_embedded = nn.utils.rnn.pack_padded_sequence(y.to(device), lengths, batch_first=True, enforce_sorted=False)
        target, target_lengths = nn.utils.rnn.pad_packed_sequence(target_packed_embedded, batch_first=True)
        
        
        #output = model(x.to(device), lengths.to(device))
        output = model(x.to(device), lengths.cpu(), boolean.to(device))


        loss = criterion(output, target.to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss {loss.item()}")


        # Compute and print the validation loss
        if batch_idx % 2000 == 0:
            model.eval()  # Set the model to evaluation mode
            with torch.no_grad():
                val_loss = 0
                for val_x, val_lengths, val_y, maskval in dev_loader:
                    val_target_packed_embedded = nn.utils.rnn.pack_padded_sequence(val_y.to(device), val_lengths, batch_first=True, enforce_sorted=False)
                    val_target, val_target_lengths = nn.utils.rnn.pad_packed_sequence(val_target_packed_embedded, batch_first=True)
                    
                    val_output = model(val_x.to(device), val_lengths.cpu(),maskval.to(device))
                    val_loss += criterion(val_output, val_target.to(device)).item()
                
                print(f"Epoch {epoch}, Batch {batch_idx}, Validation Loss {val_loss/len(dev_loader)}")
            model.train()  # Set the model back to training mode


Epoch 0, Batch 0, Loss 2.1771366596221924
Epoch 0, Batch 0, Validation Loss 2.0378872593633712
Epoch 0, Batch 100, Loss 0.18912072479724884
Epoch 0, Batch 200, Loss 0.09165876358747482
Epoch 0, Batch 300, Loss 0.08963826298713684
Epoch 0, Batch 400, Loss 0.06601230055093765
Epoch 1, Batch 0, Loss 0.09416676312685013
Epoch 1, Batch 0, Validation Loss 0.20208752412927436
Epoch 1, Batch 100, Loss 0.05514644458889961
Epoch 1, Batch 200, Loss 0.053480081260204315
Epoch 1, Batch 300, Loss 0.05889945477247238
Epoch 1, Batch 400, Loss 0.048128169029951096
Epoch 2, Batch 0, Loss 0.07594601064920425
Epoch 2, Batch 0, Validation Loss 0.13323234040283588
Epoch 2, Batch 100, Loss 0.04499710723757744
Epoch 2, Batch 200, Loss 0.034022632986307144
Epoch 2, Batch 300, Loss 0.053188879042863846
Epoch 2, Batch 400, Loss 0.04046478867530823
Epoch 3, Batch 0, Loss 0.06296923756599426
Epoch 3, Batch 0, Validation Loss 0.1135679487895133
Epoch 3, Batch 100, Loss 0.04381031543016434
Epoch 3, Batch 200, Loss 0

# Model Saving and Loading

In [53]:
torch.save(model.state_dict(), 'blstm2.pt')


In [54]:
model2 = BLSTM(len(word2idx), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, OUTPUT_LABEL_DIM,embedding_matrix).to(device)

model2.load_state_dict(torch.load('blstm2.pt'))


<All keys matched successfully>

## Dev Prediction

In [55]:
from sklearn.metrics import precision_recall_fscore_support

model2.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for x, lengths, y,mask in dev_loader:
        x = x.to(device)

        y = y.to(device)

        target_packed_embedded = nn.utils.rnn.pack_padded_sequence(y, lengths, batch_first=True, enforce_sorted=False)
        target, target_lengths = nn.utils.rnn.pad_packed_sequence(target_packed_embedded, batch_first=True)
        #print('y',y)
        #print('target',target)
        output = model2(x, lengths,mask.to(device))

        predicted = torch.argmax(output, dim=1)
        predicted_labels.extend(predicted.cpu().numpy().tolist())
        true_labels.extend(target.cpu().numpy().tolist())


In [56]:
y_pred = [element for sub_list in predicted_labels for element in sub_list]
y_true=[element for sub_list in true_labels for element in sub_list]

list1 = y_true.copy()
list2 = y_pred.copy()

value_to_remove = train_loader.dataset.pad_idx

i = 0
while i < len(list1):
    if list1[i] == value_to_remove:
        # remove the element from list1
        list1.pop(i)
        # remove the corresponding element from list2 by using its index
        list2.pop(i)
    else:
        # only increment the loop counter if an element wasn't removed
        i += 1

print(list1)  # [1, 2, 4, 5, 6]
print(list2) 


[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 2, 7, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 

In [57]:

precision, recall, f1_score, _ = precision_recall_fscore_support(list1, list2, average='weighted')
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1_score:.4f}')


Precision: 0.9842, Recall: 0.9845, F1 score: 0.9843


## dev.out file for evaluating on PERL




In [58]:

devOutput = open("dev.out", "w")
k=0
i=0
idx2label = {value: key for key, value in train_dataset.label2idx.items()}

with open('/content/dev', 'r') as f:
  for line in f:
    
    line = line.strip().split(' ')
    #print(line)
    if len(line)>1:
      idx,word,gold  = line[0], line[1],line[2]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      devOutput.write(f"{idx} {word} {gold} {key}\n")
    else:
      devOutput.write(f"{line[0]}\n")
      k=k+1
      i=0    
f.close()
devOutput.close()



## dev2.out file for submission (in same format as train)

In [59]:

devOutput = open("dev2.out", "w")
k=0
i=0

with open('/content/dev', 'r') as f:
  for line in f:
    line = line.strip().split(' ')
    if len(line)>1:
      idx,word,gold  = line[0], line[1],line[2]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      devOutput.write(f"{idx} {word} {key}\n")
    else:
      devOutput.write(f"\n")
      k=k+1
      i=0    
f.close()
devOutput.close()


# Test Prediction

In [60]:
class TestNERDataset(Dataset):
    def __init__(self, filename,word2idx):
        self.data = []
        self.word2idx = word2idx
        self.max_sent_len = 0

        
        with open(filename, "r") as f:
            sentence, boolean = [], []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(sentence) > self.max_sent_len:
                        self.max_sent_len = len(sentence)
                    self.data.append((sentence, boolean))
                    sentence, boolean = [], []
                else:
                    parts = line.split(" ")
                    word = parts[1]
                    if word[0].isupper():
                        boolean.append(1)
                    else:
                        boolean.append(0)
                    word=word.lower() 
                    if word not in self.word2idx:
                        word = '<unkcap>' if word[0].isupper() else '<unk>'
                    sentence.append(self.word2idx[word])
                    
        if len(sentence) > 0:
            if len(sentence) > self.max_sent_len:
                self.max_sent_len = len(sentence)
            self.data.append((sentence, boolean))

        self.pad_idx = self.word2idx['<pad>']
        
        self.x,  self.mask, self.lengths = [], [], []
        for sentence, boolean in self.data:
            self.lengths.append(len(sentence))
            self.x.append(torch.tensor(sentence))
            self.mask.append(torch.tensor(boolean))
        
        self.x = pad_sequence(self.x, batch_first=True, padding_value=self.pad_idx)
        self.mask = pad_sequence(self.mask, batch_first=True, padding_value=self.pad_idx)
        
        print('done')
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.x[index], self.lengths[index],  self.mask[index]


In [61]:
test_dataset = TestNERDataset('test',train_dataset.word2idx)
test_loader = DataLoader(test_dataset)

done


In [62]:
from sklearn.metrics import precision_recall_fscore_support

model2.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for x, lengths,mask in test_loader:
        x = x.to(device)

        output = model2(x, lengths,mask.to(device))

        predicted = torch.argmax(output, dim=1)
        predicted_labels.extend(predicted.cpu().numpy().tolist())


## test2.out file for submission





In [63]:

testOutput = open("test2.out", "w")
k=0
i=0



with open('/content/test', 'r') as f:
  for line in f:
    line = line.strip().split(' ')
    if len(line)>1:
      idx,word  = line[0], line[1]
      pred=predicted_labels[k][i]
      i=i+1
      key = idx2label[pred]
      testOutput.write(f"{idx} {word} {key}\n")
    else:
      testOutput.write(f"\n")
      k=k+1
      i=0    
f.close()
testOutput.close()
