In [1]:
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data.sampler import SubsetRandomSampler
from collections import defaultdict
import torch.optim as optim
import warnings
import sys
warnings.filterwarnings("ignore")

In [2]:
# Parse the training file into two parallel lists: one list of tokens and one
# list of tags per sentence. Each non-blank line is "<index> <word> <tag>";
# a blank line marks the end of a sentence.
word_train_sentences = []
train_labels = []
temp_sentence = []
temp_labels = []
tag_set = set()

with open("data/train") as f:
    # Iterate over the file directly: unlike str.splitlines(), this only breaks
    # lines on newline characters, never on exotic separators inside a token.
    for line in f:
        fields = line.split()
        if not fields:
            # Sentence boundary. Only flush when something was collected, so
            # repeated or trailing blank lines never create empty sentences.
            if temp_sentence:
                word_train_sentences.append(temp_sentence)
                train_labels.append(temp_labels)
            temp_sentence = []
            temp_labels = []
            continue
        _, word, label = fields
        temp_sentence.append(word)
        temp_labels.append(label)
        tag_set.add(label)

# Flush the last sentence when the file does not end with a blank line.
if temp_sentence:
    word_train_sentences.append(temp_sentence)
    train_labels.append(temp_labels)

In [128]:
# Parse the development file the same way as the training file: one list of
# tokens and one list of gold tags per sentence, split on blank lines.
word_dev_sentences = []
dev_labels = []
temp_sentence = []
temp_labels = []

with open("data/dev") as f:
    # Iterating the file object splits on newlines only (str.splitlines() would
    # also split on other separator characters that may occur inside a token).
    for line in f:
        fields = line.split()
        if not fields:
            # Sentence boundary; skip empty sentences caused by repeated or
            # trailing blank lines.
            if temp_sentence:
                word_dev_sentences.append(temp_sentence)
                dev_labels.append(temp_labels)
            temp_sentence = []
            temp_labels = []
            continue
        _, word, label = fields
        temp_sentence.append(word)
        temp_labels.append(label)

# Flush the last sentence when the file does not end with a blank line.
if temp_sentence:
    word_dev_sentences.append(temp_sentence)
    dev_labels.append(temp_labels)

In [129]:
# Parse the unlabeled test file: each non-blank line is "<index> <word>",
# and a blank line marks the end of a sentence.
word_test_sentences = []
temp_sentence = []

with open("data/test") as f:
    # Iterating the file object splits on newlines only (str.splitlines() would
    # also split on other separator characters that may occur inside a token).
    for line in f:
        fields = line.split()
        if not fields:
            # Sentence boundary; skip empty sentences caused by repeated or
            # trailing blank lines.
            if temp_sentence:
                word_test_sentences.append(temp_sentence)
            temp_sentence = []
            continue
        _, word = fields
        temp_sentence.append(word)

# Flush the last sentence when the file does not end with a blank line.
if temp_sentence:
    word_test_sentences.append(temp_sentence)

In [130]:
# Reserved vocabulary entries: padding always maps to id 0, unknown words to id 1.
pad_token, pad_id = 'PAD', 0
unk_token, unk_id = 'UNK', 1

# Both lookup tables start with the reserved entries only; real words and tags
# are added to them by later cells.
label2idx = {pad_token: pad_id}
word2idx = {pad_token: pad_id, unk_token: unk_id}

In [131]:
# Load the training file as a table (one row per token) so that word
# frequencies can be counted when building the word -> id vocabulary.
#
# * The path matches the one used by the other loading cells ("data/...").
# * sep is a raw-string regex: "\s" in a normal string is an invalid escape.
# * keep_default_na=False keeps tokens such as "NA", "null" or "nan" as text;
#   by default pandas would silently turn them into missing values.
df_train = pd.read_csv(
    "data/train",
    sep=r"\s+",
    names=["idx", "word", "tag"],
    engine="python",
    keep_default_na=False,
)

In [132]:
# Build the word and tag vocabularies.
# Only words that occur more than once get their own id; the rest fall back to
# the UNK id when sentences are encoded.
word_count = df_train['word'].value_counts()
word_count = word_count[word_count > 1]

# Sort before assigning ids: iterating a set of strings is not reproducible
# across kernel restarts, which would make the ids (and any saved model that
# depends on them) differ from run to run.
word_set = sorted(word_count.index, key=str)
tag_set = sorted(tag_set)

# Ids 0 and 1 are reserved for PAD and UNK in word2idx.
for i, vocab_word in enumerate(word_set, start=2):
    word2idx[vocab_word] = i

# Id 0 is reserved for PAD in label2idx.
for i, tag in enumerate(tag_set, start=1):
    label2idx[tag] = i


In [133]:
# Reverse lookup (tag id -> tag string), used to decode predictions.
idx2label = {tag_id: tag for tag, tag_id in label2idx.items()}

In [223]:
# Encode sentences and labels as ids and force every row to a fixed length.
def padding_sentences(final_length, sentences, labels):
    """Convert tokens and tags to id tensors of width ``final_length``.

    Rows shorter than ``final_length`` are right-padded with 0; longer rows
    are cut off after ``final_length`` positions. Tokens missing from
    ``word2idx`` and tags missing from ``label2idx`` are both encoded as 1.

    Args:
        final_length: number of positions in every output row.
        sentences: sequence of token lists.
        labels: sequence of tag lists, indexed in parallel with ``sentences``.

    Returns:
        A pair ``(word_ids, label_ids)`` of ``torch.long`` tensors.
    """
    encoded_words = []
    encoded_tags = []
    for row, tokens in enumerate(sentences):
        kept = min(len(tokens), final_length)
        filler = [0] * (final_length - kept)
        word_ids = [word2idx.get(tokens[col], 1) for col in range(kept)]
        tag_ids = [label2idx.get(labels[row][col], 1) for col in range(kept)]
        encoded_words.append(word_ids + filler)
        encoded_tags.append(tag_ids + filler)

    word_tensor = torch.tensor(encoded_words, dtype=torch.long)
    tag_tensor = torch.tensor(encoded_tags, dtype=torch.long)
    return word_tensor, tag_tensor

# Encode the training split to fixed-width id tensors.
# The label tensor is bound to `label_encoded` (the name the TensorDataset cell
# reads) and the raw `train_labels` list is left untouched, so re-running this
# cell never feeds already-encoded tensors back into padding_sentences.
# NOTE(review): 28 is a hard-coded width; longer sentences are cut off there.
train_sentences, label_encoded = padding_sentences(28, word_train_sentences, train_labels)

In [135]:
# Build the character vocabulary from every character in the training tokens.
char_set = set()
for seq in word_train_sentences:
    for word in seq:
        char_set.update(word)

# Id 0 is reserved for padding. Characters are sorted before ids are assigned
# so the mapping is identical on every run (set order is not reproducible).
char2idx = {'PAD': 0}
char_set = sorted(char_set)
for i, ch in enumerate(char_set, start=1):
    char2idx[ch] = i

# Reverse lookup (character id -> character). This must be derived from
# char2idx; building it from label2idx produced a tag table under this name.
idx2char = {char_id: ch for ch, char_id in char2idx.items()}

In [209]:
# Reverse lookup (word id -> word string) for the vocabulary as it stands now.
idx2word = {word_id: token for token, word_id in word2idx.items()}

In [215]:
max_char_len = 19


def _center_pad_char_ids(word, width):
    """Return exactly ``width`` character ids for ``word``.

    Words shorter than ``width`` are centred between zero padding (the extra
    slot goes to the right when the gap is odd); longer words keep only their
    first ``width`` characters. Characters missing from ``char2idx`` map to 0.
    """
    ids = [char2idx.get(ch, 0) for ch in word[:width]]
    free = width - len(ids)
    left = free // 2
    return [0] * left + ids + [0] * (free - left)


def gen_char_sentences(dataset, raw_sentences=None, word_pad_idx=0):
    """Build the character-id tensor that accompanies a word-id dataset.

    Args:
        dataset: iterable of rows of word ids (e.g. the tensor returned by
            ``padding_sentences``).
        raw_sentences: optional token lists parallel to ``dataset``. When
            given, the characters of position ``j`` come from the real token
            instead of from ``idx2word``. Without it, a word that was encoded
            as UNK is spelled with the letters of the 'UNK' entry, because the
            original token cannot be recovered from its id.
        word_pad_idx: word id that marks padding positions.

    Returns:
        ``torch.long`` tensor with ``max_char_len`` character ids for every
        word position of every row.
    """
    char_sentences = []
    for row, sentence in enumerate(dataset):
        raw = raw_sentences[row] if raw_sentences is not None else None
        word_rows = []
        for col, idx in enumerate(sentence):
            word_id = int(idx)
            if word_id == word_pad_idx:
                # Padding positions carry no characters; spelling out the
                # letters of the 'PAD' entry here would inject fake features.
                word_rows.append([0] * max_char_len)
                continue
            if raw is not None and col < len(raw):
                word = raw[col]
            else:
                word = idx2word[word_id]
            word_rows.append(_center_pad_char_ids(word, max_char_len))
        char_sentences.append(word_rows)
    return torch.tensor(char_sentences, dtype=torch.long)


In [216]:
# Character-level inputs for the padded training sentences.
# NOTE(review): the characters are recovered from word ids through idx2word,
# so any training word that was encoded as UNK loses its real spelling here.
train_data_chars=gen_char_sentences(train_sentences)

## Task 2

In [139]:
# Path to the pre-trained GloVe vectors, relative to the working directory.
# presumably the 100-dimensional 6B release, judging by the file name — confirm.
filepath_glove = 'glove.6B.100d.txt'

In [140]:
# Read the GloVe text file into a dict: token -> float32 vector.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_dict = {}
with open(filepath_glove, 'r', encoding="utf-8") as glove_file:
    for glove_line in glove_file:
        parts = glove_line.split()
        embeddings_dict[parts[0]] = np.asarray(parts[1:], "float32")

In [141]:
# Give every dev and test token that is not yet in the vocabulary the next
# free id (dev tokens first, then test tokens), so each can receive its own
# row in the embedding matrix.
for split_sentences in (word_dev_sentences, word_test_sentences):
    for sentence in split_sentences:
        for word in sentence:
            # len(word2idx) is evaluated before the insert, i.e. the next free id.
            word2idx.setdefault(word, len(word2idx))
            
            

In [142]:
# Build the embedding table: one 100-dimensional row per vocabulary entry.
# Rows start as N(0, 0.1) noise and are overwritten with the GloVe vector when
# the word (or its lower-cased form) is in embeddings_dict.
emb_matrix = np.random.normal(0, 0.1, (len(word2idx), 100))

for word, index in word2idx.items():
    if index in (pad_id, unk_id):
        # 'PAD' and 'UNK' are markers, not words. Lower-casing them would pick
        # up the vectors of the ordinary words "pad"/"unk" if GloVe has them.
        continue
    vector = embeddings_dict.get(word)
    if vector is None:
        vector = embeddings_dict.get(word.lower())
    if vector is not None:
        emb_matrix[index] = vector

# Padding must contribute nothing: nn.Embedding.from_pretrained uses the given
# weights as they are and does not zero the padding row itself.
emb_matrix[pad_id] = 0.0

emb_matrix = torch.tensor(emb_matrix, dtype=torch.float32)

In [143]:
class Model_Glove_LSTM_CNN(nn.Module):
    """BiLSTM tagger over word embeddings plus character-CNN features.

    Each word is represented by its word embedding concatenated with a
    max-pooled, depth-wise 1-D convolution over its character embeddings.
    The concatenation is fed to a bidirectional LSTM, then
    Linear -> ELU -> Linear produces one score per tag for every position.

    Args:
        input_dim: size of the word vocabulary.
        embedding_dim: width of the word embeddings.
        char_emb_dim: width of the character embeddings.
        char_input_dim: size of the character vocabulary.
        char_cnn_filter_num: number of CNN filters per character-embedding
            channel (the convolution is grouped by channel).
        char_cnn_kernel_size: kernel width of the character CNN.
        hidden_dim: LSTM hidden size per direction.
        linear_output_dim: width of the hidden linear layer.
        lstm_layers: number of stacked LSTM layers.
        emb_dropout: dropout applied to word and character embeddings.
        cnn_dropout: dropout applied to the pooled CNN features.
        fc_dropout: dropout applied to the LSTM output.
        word_pad_idx: id of the padding word.
        char_pad_idx: id of the padding character.
        output_dim: number of tags to score. Defaults to ``len(label2idx)``
            (the notebook-level tag table) when omitted.
    """

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 char_emb_dim,
                 char_input_dim,
                 char_cnn_filter_num,
                 char_cnn_kernel_size,
                 hidden_dim,
                 linear_output_dim,
                 lstm_layers,
                 emb_dropout,
                 cnn_dropout,
                 fc_dropout,
                 word_pad_idx,
                 char_pad_idx,
                 output_dim=None):
        super().__init__()

        # Word-level embedding.
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=word_pad_idx
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        # Character-level embedding followed by a depth-wise convolution:
        # groups=char_emb_dim gives every embedding channel its own filters.
        self.char_emb_dim = char_emb_dim
        self.char_emb = nn.Embedding(
            num_embeddings=char_input_dim,
            embedding_dim=char_emb_dim,
            padding_idx=char_pad_idx
        )
        self.char_cnn = nn.Conv1d(
            in_channels=char_emb_dim,
            out_channels=char_emb_dim * char_cnn_filter_num,
            kernel_size=char_cnn_kernel_size,
            groups=char_emb_dim
        )
        self.cnn_dropout = nn.Dropout(cnn_dropout)

        self.lstm = nn.LSTM(
            input_size=embedding_dim + (char_emb_dim * char_cnn_filter_num),
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True, batch_first=True)

        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU(alpha=1.0, inplace=False)

        if output_dim is None:
            # Backward-compatible default: size of the notebook-level tag table.
            output_dim = len(label2idx)
        self.linear_classifier = nn.Linear(linear_output_dim, output_dim)

        # Every parameter (weights and biases alike) starts from N(0, 0.1).
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    def _char_features(self, chars):
        """Return max-pooled char-CNN features, one vector per word position.

        All word positions are pushed through the convolution in one batch.
        The result is created by the layers themselves, so it lives on the
        same device and has the same dtype as the model parameters.
        """
        char_emb_out = self.emb_dropout(self.char_emb(chars))
        batch_size, sent_len, word_len, char_emb_dim = char_emb_out.shape
        # Conv1d expects (N, channels, length): fold the sentence axis into the
        # batch axis and move the embedding axis in front of the character axis.
        flat = char_emb_out.reshape(batch_size * sent_len, word_len, char_emb_dim)
        conv_out = self.char_cnn(flat.permute(0, 2, 1))
        pooled, _ = torch.max(conv_out, dim=2)
        pooled = pooled.reshape(batch_size, sent_len, self.char_cnn.out_channels)
        return self.cnn_dropout(pooled)

    def forward(self, words, chars):
        """Score every tag for every word position.

        Args:
            words: word ids, indexed (batch, position).
            chars: character ids, indexed (batch, position, character).

        Returns:
            Tensor of unnormalised tag scores, indexed (batch, position, tag).
        """
        embedding_out = self.emb_dropout(self.embedding(words))
        char_cnn = self._char_features(chars)

        word_features = torch.cat((embedding_out, char_cnn), dim=2)
        lstm_out, _ = self.lstm(word_features)

        s = self.fc(self.fc_dropout(lstm_out))
        s = self.elu(s)
        s = self.linear_classifier(s)
        return s

    def init_embeddings(self, char_pad_idx, word_pad_idx, pretrained=None, freeze=True):
        """Zero the padding rows and optionally load pre-trained word vectors.

        Args:
            char_pad_idx: id of the padding character.
            word_pad_idx: id of the padding word.
            pretrained: optional 2-D array/tensor of word vectors, one row per
                word id. It is copied; the caller's data is not modified.
            freeze: when True the loaded word vectors are not trained.
        """
        self.char_emb.weight.data[char_pad_idx] = torch.zeros(self.char_emb_dim)
        if pretrained is None:
            self.embedding.weight.data[word_pad_idx] = torch.zeros(self.embedding_dim)
            return

        weights = torch.as_tensor(pretrained, dtype=self.embedding.weight.dtype)
        weights = weights.detach().clone()
        # from_pretrained keeps the given rows as they are, so the padding row
        # has to be cleared here or padding would carry a non-zero vector.
        weights[word_pad_idx] = 0
        self.embedding = nn.Embedding.from_pretrained(
            embeddings=weights,
            padding_idx=word_pad_idx,
            freeze=freeze
        )

    def count_parameters(self):
        """Return the number of parameter elements that require gradients."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [144]:
# Hyper-parameters of the tagger, collected in one place.
model_hyperparams = dict(
    input_dim=len(word2idx),
    embedding_dim=100,
    char_emb_dim=30,
    char_input_dim=len(char2idx),
    char_cnn_filter_num=5,
    char_cnn_kernel_size=3,
    hidden_dim=256,
    linear_output_dim=128,
    lstm_layers=1,
    emb_dropout=0.33,
    cnn_dropout=0.25,
    fc_dropout=0.33,
    word_pad_idx=0,
    char_pad_idx=0,
)
model = Model_Glove_LSTM_CNN(**model_hyperparams)

# Load the pre-built embedding matrix as frozen word vectors.
model.init_embeddings(char_pad_idx=0, word_pad_idx=0, pretrained=emb_matrix, freeze=True)

print(f"The model has {model.count_parameters():,} trainable parameters.")
print(model)

The model has 1,110,488 trainable parameters.
Model_Glove_LSTM_CNN(
  (embedding): Embedding(20610, 100, padding_idx=0)
  (emb_dropout): Dropout(p=0.33, inplace=False)
  (char_emb): Embedding(85, 30, padding_idx=0)
  (char_cnn): Conv1d(30, 150, kernel_size=(3,), stride=(1,), groups=30)
  (cnn_dropout): Dropout(p=0.25, inplace=False)
  (lstm): LSTM(250, 256, batch_first=True, bidirectional=True)
  (fc_dropout): Dropout(p=0.33, inplace=False)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (linear_classifier): Linear(in_features=128, out_features=10, bias=True)
)


In [145]:
## changing this to suit my input

class Task3(object):
    """Training and evaluation driver for a tagger called as ``model(words, chars)``.

    Args:
        model: module that maps ``(words, chars)`` to per-position tag scores.
        data: object exposing ``train_iter``, ``val_iter`` and ``tag_pad_idx``.
            Each batch yields ``(words, true_tags, chars)`` in that order.
        optimizer_cls: optimizer class accepting ``lr`` and ``momentum``.
        loss_fn_cls: loss class accepting ``ignore_index``.
        LR: learning rate.
        Momentum: momentum passed to the optimizer.
    """

    def __init__(self, model, data, optimizer_cls, loss_fn_cls, LR, Momentum):
        self.model = model
        self.data = data
        self.optimizer = optimizer_cls(model.parameters(), lr=LR, momentum = Momentum)
        # Padding positions are excluded from the loss.
        self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)

    def accuracy(self, preds, y):
        """Return token accuracy over non-padding positions as a 1-element tensor.

        Args:
            preds: scores indexed (token, tag).
            y: gold tag ids, one per token.

        A batch that contains only padding yields 0.0 instead of NaN.
        """
        mask = y != self.data.tag_pad_idx
        n_real = int(mask.sum())
        if n_real == 0:
            return preds.new_zeros(1)
        correct = preds.argmax(dim=1)[mask].eq(y[mask])
        # Stay on the tensors' own device; keep the 1-element shape so callers
        # can continue to use .item().
        return correct.sum().float().reshape(1) / n_real

    def epoch(self):
        """Run one training pass; return (mean batch loss, mean batch accuracy)."""
        epoch_loss = 0
        epoch_acc = 0
        self.model.train()
        for words, true_tags, chars in self.data.train_iter:
            self.optimizer.zero_grad()
            pred_tags = self.model(words, chars)
            # Flatten to (tokens, tags) and (tokens,) for the loss.
            pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
            true_tags = true_tags.view(-1)
            batch_loss = self.loss_fn(pred_tags, true_tags)
            batch_acc = self.accuracy(pred_tags, true_tags)
            batch_loss.backward()
            self.optimizer.step()
            epoch_loss += batch_loss.item()
            epoch_acc += batch_acc.item()
        return epoch_loss / len(self.data.train_iter), epoch_acc / len(self.data.train_iter)

    def evaluate(self, iterator):
        """Score ``iterator`` without gradients; return (mean loss, mean accuracy)."""
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
            for batch in iterator:
                # Batch layout: (words, true_tags, chars).
                words = batch[0]
                chars = batch[2]
                true_tags = batch[1]
                pred_tags = self.model(words, chars)
                pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
                true_tags = true_tags.view(-1)
                batch_loss = self.loss_fn(pred_tags, true_tags)
                batch_acc = self.accuracy(pred_tags, true_tags)
                epoch_loss += batch_loss.item()
                epoch_acc += batch_acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def train(self, n_epochs):
        """Train for ``n_epochs`` epochs, printing train and validation loss."""
        for epoch in range(n_epochs):
            train_loss, train_acc = self.epoch()

            print(f"Epoch: {epoch + 1:02} ")
            print(f"\tTrn Loss: {train_loss:.3f} ")

            val_loss, val_acc = self.evaluate(self.data.val_iter)
            print(f"\tVal Loss: {val_loss:.3f} ")

In [146]:
# Hold out the first 10% of the training sentences for validation and build
# one DataLoader per subset over a single shared TensorDataset.
num_workers = 0
batch_size = 16

n_sentences = len(word_train_sentences)
indices = list(range(n_sentences))
split = int(n_sentences * 0.1)
valid_idx = indices[:split]
train_idx = indices[split:]

# Each sampler draws (in random order) only from its own subset of indices.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Every item is (word ids, label ids, character ids). `label_encoded` must hold
# the padded label-id tensor that belongs to `train_sentences`.
input_dataset = torch.utils.data.TensorDataset(train_sentences, label_encoded, train_data_chars)

loader_options = dict(batch_size=batch_size, num_workers=num_workers)
train_loader = torch.utils.data.DataLoader(input_dataset, sampler=train_sampler, **loader_options)
valid_loader = torch.utils.data.DataLoader(input_dataset, sampler=valid_sampler, **loader_options)

In [147]:
class train_valid(object):
    """Bundle of the data loaders and the vocabulary settings used for training.

    Attributes are filled from the two loaders passed in and from the
    notebook-level ``pad_id``, ``unk_id``, ``word2idx`` and ``label2idx``.
    """

    def __init__(self, train_loader, valid_loader):
        # Vocabulary settings, read from the notebook-level names.
        self.tag2idx = label2idx
        self.word2idx_dev_test = word2idx
        self.unk_id = unk_id
        self.tag_pad_idx = pad_id
        # Batch iterators for the training and validation subsets.
        self.val_iter = valid_loader
        self.train_iter = train_loader


data1 = train_valid(train_loader, valid_loader)

In [148]:
# Train the tagger for 15 epochs with SGD (lr=0.1, momentum=0.9) and a
# cross-entropy loss; train and validation loss are printed after every epoch.
t3 = Task3(model=model, data=data1, optimizer_cls=optim.SGD, loss_fn_cls=nn.CrossEntropyLoss, LR=0.1, Momentum=0.9)
t3.train(15)

Epoch: 01 
	Trn Loss: 0.059 
	Val Loss: 0.079 
Epoch: 02 
	Trn Loss: 0.055 
	Val Loss: 0.077 
Epoch: 03 
	Trn Loss: 0.054 
	Val Loss: 0.074 
Epoch: 04 
	Trn Loss: 0.051 
	Val Loss: 0.084 
Epoch: 05 
	Trn Loss: 0.047 
	Val Loss: 0.074 
Epoch: 06 
	Trn Loss: 0.047 
	Val Loss: 0.081 
Epoch: 07 
	Trn Loss: 0.045 
	Val Loss: 0.078 
Epoch: 08 
	Trn Loss: 0.043 
	Val Loss: 0.078 
Epoch: 09 
	Trn Loss: 0.041 
	Val Loss: 0.072 
Epoch: 10 
	Trn Loss: 0.040 
	Val Loss: 0.084 
Epoch: 11 
	Trn Loss: 0.039 
	Val Loss: 0.080 
Epoch: 12 
	Trn Loss: 0.037 
	Val Loss: 0.079 
Epoch: 13 
	Trn Loss: 0.035 
	Val Loss: 0.082 
Epoch: 14 
	Trn Loss: 0.034 
	Val Loss: 0.082 
Epoch: 15 
	Trn Loss: 0.033 
	Val Loss: 0.072 


In [149]:
# save the model to disk
# NOTE: torch.save(model, ...) pickles the whole module object rather than a
# state_dict, so loading the file needs the Model_Glove_LSTM_CNN class to be
# importable under the same name.
filename = 'blstm3.pt'
torch.save(model, filename)
 

# Reload the file that was just written.
# NOTE(review): torch.load unpickles its input — only load files you trust.
# Recent PyTorch releases may need weights_only=False to load a fully pickled
# module; confirm against the installed version.
# NOTE(review): the infer() function in this notebook reads the global `model`,
# not `task3_model`.
task3_model = torch.load(filename)

In [263]:
def _char_ids_for_inference(word):
    """Return ``max_char_len`` character ids for ``word``.

    Short words are centred between zero padding (the extra slot goes to the
    right when the gap is odd); long words keep their first ``max_char_len``
    characters. Characters missing from ``char2idx`` map to 0.
    """
    ids = [char2idx.get(ch, 0) for ch in word[:max_char_len]]
    free = max_char_len - len(ids)
    left = free // 2
    return [0] * left + ids + [0] * (free - left)


def infer(sentence, true_tags=None):
    """Predict a tag id for every token of one sentence with the global ``model``.

    Args:
        sentence: iterable of token strings.
        true_tags: accepted for call compatibility; not used.

    Returns:
        ``(word_list, pred)`` where ``word_list`` is the list of tokens and
        ``pred`` is a long tensor of predicted tag ids indexed
        ``[0][position]``. An empty sentence yields an empty prediction
        instead of an error inside the model.
    """
    model.eval()
    word_list = list(sentence)
    if not word_list:
        return word_list, torch.zeros((1, 0), dtype=torch.long)

    # Tokens missing from the vocabulary fall back to id 1 (UNK).
    encoded_word_list = [word2idx.get(word, 1) for word in word_list]
    word_temp = [_char_ids_for_inference(word) for word in word_list]

    words_batch = torch.tensor(encoded_word_list, dtype=torch.long).unsqueeze(0)
    chars_batch = torch.tensor(word_temp, dtype=torch.long).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        pred = model(words_batch, chars_batch).argmax(-1)

    return word_list, pred

### Dev Output

In [264]:
# Write dev predictions, one token per line as "<index> <word> <gold> <pred>",
# with a blank line after every sentence.
out_filename = "dev3.out"
# Mode "w" truncates any previous file; the context manager closes the handle
# even if inference fails part-way through.
with open(out_filename, "w") as f1:
    for i in range(len(word_dev_sentences)):
        word_list, pred = infer(word_dev_sentences[i], dev_labels[i])
        # One conversion per sentence instead of one per token.
        pred_ids = pred[0].tolist()
        for j in range(len(word_list)):
            f1.write(f'{j+1} {word_list[j]} {dev_labels[i][j]} {idx2label[pred_ids[j]]}\n')
        f1.write("\n")

### Test Output

In [265]:
# Write test predictions, one token per line as "<index> <word> <pred>",
# with a blank line after every sentence.
out_filename = "test3.out"
# Mode "w" truncates any previous file; the context manager closes the handle
# even if inference fails part-way through.
with open(out_filename, "w") as f1:
    for i in range(len(word_test_sentences)):
        word_list, pred = infer(word_test_sentences[i])
        # One conversion per sentence instead of one per token.
        pred_ids = pred[0].tolist()
        for j in range(len(word_list)):
            f1.write(f'{j+1} {word_list[j]} {idx2label[pred_ids[j]]}\n')
        f1.write("\n")