In [69]:
import datasets
import itertools
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import gzip
import shutil
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


dataset = datasets.load_dataset("conll2003")

train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Token counts come from the training split only.
word_frequency = Counter(itertools.chain.from_iterable(train_data['tokens']))

# Drop rare words (seen fewer than 3 times); they are encoded as [UNK] later.
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

# The reserved entries are inserted FIRST so that the dict's iteration order
# matches the ids (0, 1, 2, ...). Any code that walks the mapping to build an
# id-indexed table then sees its rows in id order.
word2idx = {'[PAD]': 0, '[UNK]': 1}
for index, word in enumerate(word_frequency, start=2):
    # setdefault keeps ids 0/1 reserved even if the corpus happens to contain
    # the literal strings '[PAD]' or '[UNK]'.
    word2idx.setdefault(word, index)

def convert_word_to_id(sample):
    """Encode one example for the model.

    Every token in ``sample['tokens']`` is replaced by its ``word2idx`` id;
    tokens missing from the vocabulary get the ``'[UNK]'`` id. The values of
    ``sample['ner_tags']`` are passed through unchanged as ``'labels'``.
    """
    input_ids = []
    for token in sample['tokens']:
        input_ids.append(word2idx.get(token, word2idx['[UNK]']))

    encoded = {'input_ids': input_ids}
    encoded['labels'] = sample['ner_tags']
    return encoded

dataset = dataset.map(convert_word_to_id)

# Strip every column except the model inputs and targets from each split.
for split in dataset.keys():
    columns_to_remove = {
        column
        for column in dataset[split].column_names
        if column not in ('input_ids', 'labels')
    }
    dataset[split] = dataset[split].remove_columns(list(columns_to_remove))


def _split_to_tensors(split_name):
    """Return (input tensors, label tensors, sequence lengths) for one split."""
    inputs, targets, sizes = [], [], []
    for sample in dataset[split_name]:
        inputs.append(torch.tensor(sample['input_ids']))
        targets.append(torch.tensor(sample['labels']))
        sizes.append(len(sample['input_ids']))
    return inputs, targets, sizes


def _pad_batch_first(sequences, fill_value):
    """Right-pad variable-length sequences into one (num_sequences, max_len) tensor."""
    return pad_sequence(sequences, batch_first=True, padding_value=fill_value)


X_train, y_train, lengths_train = _split_to_tensors('train')
X_valid, y_valid, lengths_valid = _split_to_tensors('validation')
X_test, y_test, lengths_test = _split_to_tensors('test')

# Inputs are padded with the [PAD] id; labels are padded with 9.
X_train_padded = _pad_batch_first(X_train, word2idx['[PAD]'])
y_train_padded = _pad_batch_first(y_train, 9)

X_valid_padded = _pad_batch_first(X_valid, word2idx['[PAD]'])
y_valid_padded = _pad_batch_first(y_valid, 9)

X_test_padded = _pad_batch_first(X_test, word2idx['[PAD]'])
y_test_padded = _pad_batch_first(y_test, 9)

lengths_train = torch.tensor(lengths_train)
lengths_valid = torch.tensor(lengths_valid)
lengths_test = torch.tensor(lengths_test)

# Each example is a (padded inputs, padded labels, true length) triple.
train_dataset = TensorDataset(X_train_padded, y_train_padded, lengths_train)
valid_dataset = TensorDataset(X_valid_padded, y_valid_padded, lengths_valid)
test_dataset = TensorDataset(X_test_padded, y_test_padded, lengths_test)

# Only the training batches are shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [70]:
# Vocabulary size, counting the '[PAD]' and '[UNK]' entries.
len(word2idx)

8128

In [71]:
class BLSTM(nn.Module):
    """Bidirectional-LSTM token tagger with embeddings learned from scratch.

    Pipeline: embedding -> single-layer bidirectional LSTM -> dropout (p=0.33)
    -> linear -> ELU -> linear, producing one score per class for each token.

    Args:
        num_embeddings: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        lstm_hidden_dim: Hidden size of the LSTM, per direction.
        lstm_out_neurons: Width of the hidden linear layer after the LSTM.
        num_classes: Number of output scores per token.
    """

    def __init__(self, num_embeddings, embedding_dim, lstm_hidden_dim, lstm_out_neurons, num_classes):
        super().__init__()
        self.lstm_hidden_dim = lstm_hidden_dim
        # Index 0 is the padding id: its embedding receives no gradient.
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        # The LSTM output concatenates both directions, hence 2 * hidden size.
        self.fc1 = nn.Linear(lstm_hidden_dim * 2, lstm_out_neurons)
        self.elu = nn.ELU()
        self.fc2 = nn.Linear(lstm_out_neurons, num_classes)

    def forward(self, x, lengths=None):
        """Score every token of a right-padded batch.

        Args:
            x: Integer id tensor laid out batch-first, (batch, seq_len).
            lengths: True length of each sequence (tensor or list). When given,
                the batch is packed so the LSTM never reads padding; otherwise
                the backward direction starts inside the padding and the scores
                of real tokens depend on how much padding the batch has. With
                ``None`` the LSTM runs over the full padded sequence.

        Returns:
            Tensor of shape (batch, seq_len, num_classes).
        """
        embed = self.embedding(x)

        if lengths is None:
            lstm_out, _ = self.lstm(embed)
        else:
            if torch.is_tensor(lengths):
                # pack_padded_sequence requires the lengths on the CPU.
                lengths = lengths.cpu()
            packed = pack_padded_sequence(embed, lengths, batch_first=True, enforce_sorted=False)
            packed_out, _ = self.lstm(packed)
            # total_length keeps the output aligned with x even if the longest
            # sequence in the batch is shorter than the padded width.
            lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True, total_length=x.size(1))

        drop_out = self.dropout(lstm_out)
        fc1_out = self.fc1(drop_out)
        elu_out = self.elu(fc1_out)
        fc2_out = self.fc2(elu_out)

        return fc2_out

# Baseline tagger: 100-d embeddings, 256 LSTM units per direction,
# a 128-unit hidden layer and 9 output classes.
model1 = BLSTM(
    num_embeddings=len(word2idx),
    embedding_dim=100,
    lstm_hidden_dim=256,
    lstm_out_neurons=128,
    num_classes=9,
)
# Target value 9 is excluded from the loss.
criterion1 = nn.CrossEntropyLoss(ignore_index=9)
optimizer1 = Adam(model1.parameters(), lr=0.01)
# The learning rate is reduced once the monitored value stops decreasing for
# more than 3 scheduler steps.
scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer1,
    mode='min',
    patience=3,
)

In [72]:
def _trim_batch(X, y, length, ignore_index):
    """Cut a right-padded batch to its longest sequence and mark the padding.

    Padding positions become 0 in the inputs and ``ignore_index`` in the
    targets; targets are returned as int64.
    """
    max_len = int(length.max())
    positions = torch.arange(max_len, device=X.device)[None, :]
    is_padding = positions >= length.to(X.device)[:, None]
    X = X[:, :max_len].masked_fill(is_padding, 0)
    y = y[:, :max_len].masked_fill(is_padding.to(y.device), ignore_index).long()
    return X, y


def train_model(model, train_dataloader, dev_dataloader, dev_len, num_epochs, criterion, optimizer, scheduler, saved_model):
    """Train ``model`` and return it with the best validation weights loaded.

    After every epoch the token-weighted mean validation loss is computed. It
    is passed to ``scheduler.step`` and, whenever it is no worse than the best
    value so far, the model's state dict is saved to ``saved_model``.

    Args:
        model: Module called as ``model(X, length)`` returning scores of shape
            (batch, seq_len, num_classes).
        train_dataloader: Yields ``(X, y, length)`` batches for training.
        dev_dataloader: Yields ``(X, y, length)`` batches for validation.
        dev_len: Total number of real (non-padding) validation tokens; must be
            positive.
        num_epochs: Number of passes over the training data.
        criterion: Loss taking (batch, num_classes, seq_len) scores and
            (batch, seq_len) targets. Its ``ignore_index`` (9 if it has none)
            is written into the padded target positions.
        optimizer: Optimizer over the model parameters.
        scheduler: Object whose ``step(value)`` receives the mean validation loss.
        saved_model: Path of the checkpoint file.

    Returns:
        ``model``, with the best checkpoint loaded if one was written.
    """
    ignore_index = getattr(criterion, 'ignore_index', 9)
    dev_tokens = float(dev_len)
    # float('inf') rather than np.Inf: that alias was removed in NumPy 2.0.
    min_loss = float('inf')
    checkpoint_written = False

    for epoch in range(num_epochs):
        model.train()
        for X, y, length in train_dataloader:
            optimizer.zero_grad()

            X, y = _trim_batch(X, y, length, ignore_index)
            output = model(X, length)

            # The criterion expects the class dimension second.
            loss = criterion(torch.permute(output, (0, 2, 1)), y)
            loss.backward()
            optimizer.step()

        model.eval()
        dev_loss = 0.0
        with torch.no_grad():
            for X, y, length in dev_dataloader:
                X, y = _trim_batch(X, y, length, ignore_index)
                output = model(X, length)

                # Accumulate the VALIDATION batch loss, weighted by its token
                # count, so the division below gives a per-token mean.
                batch_loss = criterion(torch.permute(output, (0, 2, 1)), y)
                dev_loss += batch_loss.item() * int(length.sum())

        dev_loss /= dev_tokens
        scheduler.step(dev_loss)

        if dev_loss <= min_loss:
            torch.save(model.state_dict(), saved_model)
            min_loss = dev_loss
            checkpoint_written = True

    # Only reload when this run produced a checkpoint; otherwise a missing or
    # stale file at ``saved_model`` would crash or silently replace the weights.
    if checkpoint_written:
        model.load_state_dict(torch.load(saved_model))
    return model

In [None]:
# Train the baseline for 30 epochs; the loss is normalised by the total
# number of validation tokens and the best weights are kept in 'model1.pt'.
model1 = train_model(
    model=model1,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    dev_len=sum(lengths_valid),
    num_epochs=30,
    criterion=criterion1,
    optimizer=optimizer1,
    scheduler=scheduler1,
    saved_model='model1.pt',
)

In [76]:
# Decompress the GloVe archive next to it as plain text.
with gzip.open('glove.6B.100d.gz', 'rb') as f_in:
    with open('glove.6B.100d.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


# Each line is "<word> <v1> <v2> ...". The file is streamed line by line
# instead of being read whole, and decoded explicitly as UTF-8 so the result
# does not depend on the platform's default encoding.
vocab_glove, embeddings_glove = [], []
with open('glove.6B.100d.txt', 'rt', encoding='utf-8') as fi:
    for line in fi:
        if not line.strip():
            continue  # tolerate blank lines
        fields = line.rstrip('\n').split(' ')
        vocab_glove.append(fields[0])
        embeddings_glove.append([float(val) for val in fields[1:]])

vocab_npa = np.array(vocab_glove)
embs_npa = np.array(embeddings_glove)

# Prepend the two special tokens (rows 0 and 1). Concatenating lets NumPy
# widen the string dtype if needed, whereas np.insert would cast the new
# entries to the existing width.
vocab_npa = np.concatenate((np.array(['[PAD]', '[UNK]']), vocab_npa))

# [PAD] is the zero vector; [UNK] is the mean of all GloVe vectors.
pad_emb_npa = np.zeros((1, embs_npa.shape[1]))
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)

embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))

In [77]:
vocab_list = vocab_npa.tolist()

# Word -> row lookup for the GloVe table. A dict replaces the linear
# `in` / `.index` scans over the whole vocabulary; setdefault keeps the first
# occurrence, which is what list.index would return.
glove_row_by_word = {}
for row, token in enumerate(vocab_list):
    glove_row_by_word.setdefault(token, row)

special_tokens = ('[PAD]', '[UNK]')
glove_dim = embs_npa.shape[1]

# Row i of the matrix must describe the word whose id is i, because the
# embedding layer is indexed by id. Rows are therefore written by id rather
# than appended in dict-iteration order. Words without a GloVe vector keep a
# zero embedding.
glove_embedding_features = np.zeros((max(word2idx.values()) + 1, glove_dim + 3))

for word, i in word2idx.items():
    if word in special_tokens:
        # Special tokens are matched verbatim (lower-casing them would miss
        # their vocabulary entries) and get no casing flags.
        row = glove_row_by_word.get(word)
        if row is not None:
            glove_embedding_features[i, :glove_dim] = embs_npa[row]
        continue

    row = glove_row_by_word.get(word.lower())
    if row is not None:
        glove_embedding_features[i, :glove_dim] = embs_npa[row]

    # Three extra features restore the casing lost by the lower-cased lookup.
    is_title = 1.0 if word.istitle() else 0.0
    is_upper = 1.0 if word.isupper() else 0.0
    is_lower = 1.0 if word.islower() else 0.0
    glove_embedding_features[i, glove_dim:] = [is_title, is_upper, is_lower]

In [79]:
class BLSTM_Glove(nn.Module):
    """Bidirectional-LSTM token tagger initialised from pretrained embeddings.

    Pipeline: pretrained (fine-tuned) embedding -> single-layer bidirectional
    LSTM -> dropout (p=0.33) -> linear -> ELU -> linear, producing one score
    per class for each token.

    Args:
        num_embeddings: Kept for interface compatibility; the table size is
            taken from the pretrained matrix.
        embedding_dim: Width of the pretrained vectors (LSTM input size).
        lstm_hidden_dim: Hidden size of the LSTM, per direction.
        lstm_out_neurons: Width of the hidden linear layer after the LSTM.
        num_classes: Number of output scores per token.
        pretrained_embeddings: Optional (rows, embedding_dim) array or tensor.
            Defaults to the module-level ``glove_embedding_features``.

    Raises:
        ValueError: If the pretrained matrix width differs from ``embedding_dim``.
    """

    def __init__(self, num_embeddings, embedding_dim, lstm_hidden_dim, lstm_out_neurons, num_classes,
                 pretrained_embeddings=None):
        super().__init__()
        self.lstm_hidden_dim = lstm_hidden_dim

        if pretrained_embeddings is None:
            pretrained_embeddings = glove_embedding_features
        # Copy so that fine-tuning never writes into the caller's array.
        weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32).detach().clone()
        if weights.dim() != 2 or weights.size(1) != embedding_dim:
            raise ValueError(
                f"pretrained embeddings have shape {tuple(weights.shape)}, expected (*, {embedding_dim})"
            )

        # from_pretrained is a classmethod: padding_idx has to be passed to it
        # directly, since options given to a throwaway nn.Embedding(...)
        # instance are discarded.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        # The LSTM output concatenates both directions, hence 2 * hidden size.
        self.fc1 = nn.Linear(lstm_hidden_dim * 2, lstm_out_neurons)
        self.elu = nn.ELU()
        self.fc2 = nn.Linear(lstm_out_neurons, num_classes)

    def forward(self, x, lengths=None):
        """Score every token of a right-padded batch.

        Args:
            x: Integer id tensor laid out batch-first, (batch, seq_len).
            lengths: True length of each sequence (tensor or list). When given,
                the batch is packed so the LSTM never reads padding; with
                ``None`` the LSTM runs over the full padded sequence.

        Returns:
            Tensor of shape (batch, seq_len, num_classes).
        """
        embed = self.embedding(x)

        if lengths is None:
            lstm_out, _ = self.lstm(embed)
        else:
            if torch.is_tensor(lengths):
                # pack_padded_sequence requires the lengths on the CPU.
                lengths = lengths.cpu()
            packed = pack_padded_sequence(embed, lengths, batch_first=True, enforce_sorted=False)
            packed_out, _ = self.lstm(packed)
            # total_length keeps the output aligned with x even if the longest
            # sequence in the batch is shorter than the padded width.
            lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True, total_length=x.size(1))

        drop_out = self.dropout(lstm_out)
        fc1_out = self.fc1(drop_out)
        elu_out = self.elu(fc1_out)
        fc2_out = self.fc2(elu_out)

        return fc2_out

# GloVe-initialised tagger: 103 input features per token, 256 LSTM units per
# direction, a 128-unit hidden layer and 9 output classes.
model2 = BLSTM_Glove(
    num_embeddings=len(word2idx),
    embedding_dim=103,
    lstm_hidden_dim=256,
    lstm_out_neurons=128,
    num_classes=9,
)
# Target value 9 is excluded from the loss.
criterion2 = nn.CrossEntropyLoss(ignore_index=9)
optimizer2 = Adam(model2.parameters(), lr=0.01)
# The learning rate is reduced once the monitored value stops decreasing for
# more than 3 scheduler steps.
scheduler2 = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer2,
    mode='min',
    patience=3,
)

In [None]:
# Train the GloVe model for 30 epochs; the loss is normalised by the total
# number of validation tokens and the best weights are kept in 'model2.pt'.
model2 = train_model(
    model=model2,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    dev_len=sum(lengths_valid),
    num_epochs=30,
    criterion=criterion2,
    optimizer=optimizer2,
    scheduler=scheduler2,
    saved_model='model2.pt',
)