In [828]:
import pandas as pd 
import numpy as np 
import datasets
import torch
import math
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from torch.utils.data import TensorDataset
import time
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import copy
from torch.optim import lr_scheduler

In [48]:
import torch
import math
# this ensures that the current MacOS version is at least 12.3+
print(torch.backends.mps.is_available())
# this ensures that the current current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

True
True


In [49]:
dtype = torch.float
device = torch.device("mps")

In [244]:
dataset = datasets.load_dataset("conll2003")

### Convert words/tokens to indices

In [683]:
import itertools
from collections import Counter

word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))  # type: ignore

# Remove words below threshold 3
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

In [1004]:
sample_tokens = dataset['train'][0]['tokens']
sample_tokens

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [684]:
# the vocab size
vocab_size = max(word2idx.values())+1
vocab_size

8128

In [689]:
def convert_word_to_id(sample):
#Code to convert all tokens to their respective indexes
#If the token is unknown, we set index of 1
    input_ids = [ word2idx.get(token, 1) for token in sample['tokens'] ]

    sample['input_ids'] = input_ids
    return sample

dataset = dataset.map(convert_word_to_id)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [692]:
df_train = pd.DataFrame(dataset['train']).drop(columns=['pos_tags', 'chunk_tags', 'id', 'tokens'])
df_train.columns = ['label','input_ids']

df_test = pd.DataFrame(dataset['test']).drop(columns=['pos_tags', 'chunk_tags', 'id', 'tokens'])
df_test.columns = ['label','input_ids']

df_val = pd.DataFrame(dataset['validation']).drop(columns=['pos_tags', 'chunk_tags', 'id', 'tokens'])
df_val.columns = ['label','input_ids']

### Padding

In [1145]:
import pandas as pd
import torch
from torch.utils.data import Dataset

# Create a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = torch.tensor(self.data.loc[idx, "label"], dtype=torch.long)
        input_ids = torch.tensor(self.data.loc[idx, "input_ids"], dtype=torch.long)

        return input_ids, label

# Create an instance of the CustomDataset
dataset_train = CustomDataset(df_train)

# Example: Accessing a single sample
print(dataset_train[2])


(tensor([12, 13]), tensor([5, 0]))


In [1146]:
def custom_collate(batch):
    # Separate input sequences and labels
    input_seqs, labels = zip(*batch)
    
    # Calculate the sequence lengths based on input sequences (assuming they have the same length as labels)
    sequence_lengths = [len(seq) for seq in input_seqs]

    # Sort input sequences and labels by sequence length (descending)
    sorted_seqs_and_labels = sorted(zip(input_seqs, labels), key=lambda x: len(x[0]), reverse=True)
    sorted_input_seqs, sorted_labels = zip(*sorted_seqs_and_labels)

    # Pad input sequences to the maximum length within the batch
    padded_input_seqs = pad_sequence(sorted_input_seqs, batch_first=True, padding_value=0)  # Use 0 as the padding value
    padded_labels = pad_sequence(sorted_labels, batch_first=True, padding_value=0)  # Use 0 as the padding value

    return padded_input_seqs, padded_labels


### Create dataloaders

In [1147]:
def dataloader_generator(df,shuffle):
    dataset_from_df = CustomDataset(df)
    batch_size = 64
    dataloader = DataLoader(dataset_from_df, batch_size=batch_size, collate_fn=custom_collate, shuffle=shuffle)
    return dataloader

train_loader  = dataloader_generator(df_train,shuffle=True)
test_loader  = dataloader_generator(df_test,shuffle=False)
val_loader  = dataloader_generator(df_val,shuffle=False)

for batch in val_loader:
    input_val, target_val = batch
    break

### Building the model

In [340]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2023-11-06 16:54:57--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
正在解析主机 raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
正在连接 raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：7502 (7.3K) [text/plain]
正在保存至: “conlleval.py.1”


2023-11-06 16:54:57 (14.7 MB/s) - 已保存 “conlleval.py.1” [7502/7502])



In [341]:
from conlleval import evaluate 

In [1150]:
class BiLSTMNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout):
        super(BiLSTMNER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                              batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, num_tags)  # num_tags is the number of unique NER tags

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bilstm(x)
        x = self.dropout(x)
        x = self.linear(x)
        x = self.elu(x)
        x = self.classifier(x)
        return x

#initialize 
num_tags = 9
vocab_size = max(word2idx.values())+1

model = BiLSTMNER(vocab_size, 100, 256, 128, 1, 0.33) 
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()

#training
num_epochs = 20
print('start training')
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, targets = batch
        outputs = model(inputs)
        batch_size = inputs.size()[-1]    
        #From the instruction of CrossEntropy, we need to change the format of outputs 
        loss = loss_function(outputs.permute(0,2,1), targets) 
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    end_time = time.time()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, time: {end_time-start_time}s')
    print('validation error: ')
    precision, recall, f1 = eval(model, val_loader)



start training
Epoch 1, Loss: 0.24208735396916217, time: 52.45967507362366s
validation error: 
processed 152266 tokens with 5942 phrases; found: 2102 phrases; correct: 1080.
accuracy:  20.82%; (non-O)
accuracy:  95.34%; precision:  51.38%; recall:  18.18%; FB1:  26.85
              LOC: precision:  59.36%; recall:  26.08%; FB1:  36.23  807
             MISC: precision:  33.33%; recall:   0.33%; FB1:   0.64  9
              ORG: precision:  30.00%; recall:   3.36%; FB1:   6.04  150
              PER: precision:  48.68%; recall:  30.02%; FB1:  37.14  1136
Epoch 2, Loss: 0.11488994293930856, time: 47.83418798446655s
validation error: 
processed 152266 tokens with 5942 phrases; found: 4661 phrases; correct: 2929.
accuracy:  52.38%; (non-O)
accuracy:  97.05%; precision:  62.84%; recall:  49.29%; FB1:  55.25
              LOC: precision:  72.27%; recall:  65.11%; FB1:  68.50  1655
             MISC: precision:  62.36%; recall:  37.20%; FB1:  46.60  550
              ORG: precision:  45.43%; 

In [1162]:
print(f"Validation: precision = {precision}, recall = {recall}, f1 = {f1}")

precision = 80.42824278888693, recall = 76.48939750925614, f1 = 78.40938497369102


In [1151]:
# SAVE THE MODEL
torch.save(model.state_dict(), 'task1.pth')

In [444]:
# Example reversed_ner_tags dictionary
reversed_ner_tags = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
    7: 'B-MISC',
    8: 'I-MISC'
}

# Example tensor with shape (32, 36)
tensor = torch.randint(0, 9, (32, 36))  # Random integers between 0 and 8

# Map tensor elements using reversed_ner_tags
mapped_tensor = [[reversed_ner_tags[item.item()] for item in row] for row in tensor]

In [398]:
ner_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

reversed_ner_tags = {value: key for key, value in ner_tags.items()}
reversed_ner_tags

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [1149]:
#evaluation
def eval(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs, targets = batch
            outputs = model(inputs)
            _, preds = torch.max(outputs, -1)
            preds_converted = [[reversed_ner_tags[item.item()] for item in row] for row in preds]
            targets_converted = [[reversed_ner_tags[item.item()] for item in row] for row in targets]
            all_preds.extend(preds_converted)
            all_labels.extend(targets_converted)
    # all_preds = list(chain.from_iterable(all_preds))
    # all_labels = list(chain.from_iterable(all_labels))
    # all_labels = torch.cat(all_labels)
    all_preds = itertools.chain(*all_preds)    
    all_labels =itertools.chain(*all_labels)
    result = evaluate(all_labels, all_preds,verbose=True)
    precision, recall, f1 = result[0], result[1],result[2]
    return precision, recall, f1



In [1163]:
print('Test: ')
precision, recall, f1 = eval(model, test_loader)

Test: 
processed 146937 tokens with 5648 phrases; found: 5146 phrases; correct: 3710.
accuracy:  70.02%; (non-O)
accuracy:  97.95%; precision:  72.09%; recall:  65.69%; FB1:  68.74
              LOC: precision:  84.52%; recall:  75.30%; FB1:  79.64  1486
             MISC: precision:  64.47%; recall:  62.82%; FB1:  63.64  684
              ORG: precision:  67.13%; recall:  57.80%; FB1:  62.12  1430
              PER: precision:  68.11%; recall:  65.12%; FB1:  66.58  1546


In [1164]:
print(f"Test: precision = {precision}, recall = {recall}, f1 = {f1}")

Test: precision = 72.09483093664983, recall = 65.68696883852692, f1 = 68.74189364461739


### Solution for the task 1

1. Hyperparameters:
- vocab_size = 8128
- embedding_dim = 100
- hidden_dim = 256 
- output_dim = 128
- num_layers = 1
- dropout = 0.33
- optimizer learning rate= 0.001
- batch_size = 64
2. Solution:
  At first, I created a vocab that maps all the tokens from the training set to a number, and I gave up the tokens that appeared less than 3 times. Secondly, I custimized a dataset class so that each batch will conatin (input_ids, ner_tags). Next, I used padding_sequence to customize the padding value of 0 in input and 9 in ner_tags. Why do I pad here? I need to make sure for each batch, which contains 32 samples, will have the max_length within one batch. Thirdly, I designed my bilstm model. The model will firstly embed all the inputs to 100-dim vectors and then throw the vectors to the lstm layer. Through elu, dropout, and one more linear layer, it model will predict the name entity for each token in samples.
3. Questions and answers:
- What are the precision, recall, and F1 score on the validation data?
- precision = 80.42824278888693, recall = 76.48939750925614, f1 = 78.40938497369102- What are the precision, recall, and F1 score on the test data?
- precision = 72.09483093664983, recall = 65.68696883852692, f1 = 68.74189364461739

## Task 2: Glove Embedding

### Load Glove Embedding

In [31]:
# Define a function to load GloVe embeddings from a file
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    return embeddings_index

# Specify the path to your downloaded "glove.6B.100d.txt" file
glove_file_path = "glove.6B.100d"

# Load GloVe embeddings into memory
glove_embeddings = load_glove_embeddings(glove_file_path)

### Create Glove Idx

In [1027]:
def convert_word_to_glove_ids(sample):
    tokens = sample['tokens']
    glove_ids =[]
    for token in tokens:
        token = token.lower()
        indices = np.where(vocab_npa == token)
        if indices[0].size > 0:
            index = indices[0][0]
        else:
            index = 1
        glove_ids.append(index)
    sample['glove_ids'] = glove_ids
    return sample
dataset = dataset.map(convert_word_to_glove_ids)


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

### Customize the layer

In [831]:
#convert glove into a layer
vocab,embeddings = [],[]
with open('glove.6B.100d',encoding="utf-8") as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)


In [832]:
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

#insert '<pad>' and '<unk>' tokens at start of vocab_npa.
vocab_npa = np.insert(vocab_npa, 0, '<pad>')
vocab_npa = np.insert(vocab_npa, 1, '<unk>')
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 100)


In [877]:
import torch
my_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float(),freeze=True)

assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 100])


### Make Glove case-sensitive -- creating another feature

In [858]:
#add features to the dataloader 
#case 0: lower case - no uppercase
#case 1: first word is uppercase
#case 2: whole word is uppeercase
#case 3: others: e.g. ","
def capital_case(word):
    if word.islower():
        return 0
    elif word.isupper():
        return 2
    elif word.istitle():
        return 1
    else: return 3

def convert_word_to_capital_case(sample):
    capitals = [capital_case(word) for word in sample['tokens'] ]
    sample['capitals'] =capitals
    return sample 

dataset = dataset.map(convert_word_to_capital_case)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [1023]:
dataset['train'][2]

{'id': '2',
 'tokens': ['BRUSSELS', '1996-08-22'],
 'pos_tags': [22, 11],
 'chunk_tags': [11, 12],
 'ner_tags': [5, 0],
 'input_ids': [12, 13],
 'capitals': [2, 3],
 'glove_ids': [1, 1]}

### Padding -- glove embedding

In [1165]:
import pandas as pd
import torch
from torch.utils.data import Dataset

# Create a custom Dataset class
class CustomDataset(Dataset):
    
    def __init__(self,data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        label = torch.tensor(self.data[index]['ner_tags'], dtype=torch.long ) 
        glove_ids = torch.tensor(self.data[index]['glove_ids'], dtype=torch.long)
        capital = torch.tensor(self.data[index]['capitals'], dtype=torch.long)
        
        return label, glove_ids, capital

# Create an instance of the CustomDataset
dataset_train = CustomDataset(dataset['train'])
dataset_test = CustomDataset(dataset['test'])
dataset_val = CustomDataset(dataset['validation'])

# Example: Accessing a single sample
print(dataset_train[0])


(tensor([3, 0, 7, 0, 0, 0, 7, 0, 0]), tensor([  646,  7580,   516,   582,     6,  5262,   299, 10240,     4]), tensor([2, 0, 1, 0, 0, 0, 1, 0, 3]))


In [1166]:
def custom_collate(batch):
    label, glove_ids, capital = zip(*batch)
    padded_label = pad_sequence(label, batch_first=True, padding_value=9 )
    padded_glove_ids = pad_sequence(glove_ids, batch_first=True, padding_value=0 )
    padded_capital = pad_sequence(capital, batch_first=True, padding_value=4 )
    return padded_glove_ids, padded_capital, padded_label
    

In [1167]:
batch_size = 64
train_loader = DataLoader(dataset_train, batch_size=batch_size, collate_fn= custom_collate, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=batch_size, collate_fn= custom_collate, shuffle=False)
val_loader = DataLoader(dataset_val, batch_size=batch_size, collate_fn= custom_collate, shuffle=False)

In [1174]:
class BiLSTMNER(nn.Module):
    def __init__(self,hidden_dim, output_dim, num_layers, dropout):
        super(BiLSTMNER, self).__init__()
        self.embedding = my_embedding_layer
        self.capital_layer = nn.Embedding(num_embeddings=5,embedding_dim=20,padding_idx=4)
        self.bilstm = nn.LSTM(input_size=120, hidden_size=hidden_dim, num_layers=num_layers, 
                              batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, output_dim,dtype=torch.float32)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, num_tags,dtype=torch.float32)  # num_tags is the number of unique NER tags

    def forward(self, x, capital):
        x = self.embedding(x.int())
        capital = self.capital_layer(capital.int())
        x = torch.cat([x, capital], dim=2)
        x, _ = self.bilstm(x)
        x = self.dropout(x)
        x = self.linear(x)
        x = self.elu(x)
        x = self.classifier(x)
        return x


#initialize 
num_tags = 9
model = BiLSTMNER(256,128, 1, 0.33) 
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss(ignore_index=9)


#training
num_epochs = 20
print('start training')
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, capitals ,targets = batch
        outputs = model(inputs, capitals)
        batch_size = inputs.size()[-1]    
        #From the instruction of CrossEntropy, we need to change the format of outputs 
        loss = loss_function(outputs.permute(0,2,1), targets) 
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    end_time = time.time()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, time: {end_time-start_time}s')
    print('validation error: ')
    precision, recall, f1 = eval(model, val_loader)



start training
Epoch 1, Loss: 0.2787948575378819, time: 59.194642066955566s
validation error: 
processed 51362 tokens with 5942 phrases; found: 6183 phrases; correct: 5010.
accuracy:  85.81%; (non-O)
accuracy:  97.16%; precision:  81.03%; recall:  84.32%; FB1:  82.64
              LOC: precision:  83.66%; recall:  90.85%; FB1:  87.11  1995
             MISC: precision:  68.96%; recall:  74.95%; FB1:  71.83  1002
              ORG: precision:  72.70%; recall:  70.69%; FB1:  71.68  1304
              PER: precision:  90.44%; recall:  92.40%; FB1:  91.41  1882
Epoch 2, Loss: 0.0857482789422978, time: 59.39733099937439s
validation error: 
processed 51362 tokens with 5942 phrases; found: 6033 phrases; correct: 5285.
accuracy:  89.63%; (non-O)
accuracy:  97.99%; precision:  87.60%; recall:  88.94%; FB1:  88.27
              LOC: precision:  92.77%; recall:  92.16%; FB1:  92.46  1825
             MISC: precision:  79.46%; recall:  79.28%; FB1:  79.37  920
              ORG: precision:  78.05%

In [1179]:
# SAVE THE MODEL
torch.save(model.state_dict(), 'task2.pth')

In [1176]:
print(f"Validation: precision = {precision}, recall = {recall}, f1 = {f1}")

Validation: precision = 91.67769688947716, recall = 93.25143049478291, f1 = 92.45786751209747


In [1172]:
ner_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8, '<PAD>':9}

reversed_ner_tags = {value: key for key, value in ner_tags.items()}
reversed_ner_tags

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC',
 9: '<PAD>'}

In [1177]:
#evaluation
def eval(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs, capitals ,targets = batch
            #get rid of paddings on targets
            label_unpad = targets
            mask = label_unpad != 9
            label_unpad = label_unpad[mask]
            
            outputs = model(inputs,capitals)
            _, preds = torch.max(outputs, -1)
            #get rid of paddings on pred
            preds = preds[mask]
            
            preds_converted = [reversed_ner_tags[elem.item()] for elem in preds]
            targets_converted = [reversed_ner_tags[elem.item()] for elem in label_unpad]
            all_preds.extend(preds_converted)
            all_labels.extend(targets_converted)
    # all_preds = list(chain.from_iterable(all_preds))
    # all_labels = list(chain.from_iterable(all_labels))
    # all_labels = torch.cat(all_labels)
    # all_preds = itertools.chain(*all_preds)    
    # all_labels =itertools.chain(*all_labels)
    result = evaluate(all_labels, all_preds,verbose=True)
    precision, recall, f1 = result[0], result[1],result[2]
    return precision, recall, f1

# print('Test: ')
# precision, recall, f1 = eval(model, test_loader)

In [1178]:
print(f"Test: precision = {precision}, recall = {recall}, f1 = {f1}")

Test: precision = 91.67769688947716, recall = 93.25143049478291, f1 = 92.45786751209747


### Solution for task2 

1. Hyperparameters:

- embedding_dim = 100
- hidden_dim = 256 
- output_dim = 128
- num_layers = 1
- dropout = 0.33
- optimizer learning rate= 0.001
- ignore_index = 9
- batch_size = 64
2. Solution:
    At first, I loaded the glove embedding and convert it into two arrays. One records all the indices and the other one records the 100-d embeddings for all the tokens. Secondly, since the glove is not case-sensitive, I tried to divide tokens into 4 cases (0: lowercase 1: some uppercases 2: all uppercases 3: lowercase and uppercase are the same). So, I added a new list to the dataset. Thirdly, I mapped all the tokens into indices in the glove embedding. So, I added one more list to the dataset. Forthly, I created a new customized dataset that each batch contains (glove_ids, capitalize, ner_tag). And similiar to the task, I padded all of them while creating the dataloaders. To be notified, I padded 9 to the ner_tag since it is a number that has not been used. I padded the capitalize with 4, which is not used either. Fifthly, I threw the batches into the model, which has the similar structure to the task 1. However, I added one more embedding layer such that the feature capitalize will be converted into 20-d vector and be added to the original 100-d layer. So, the input will become a 120-d vector. Through elu, dropout, and one more linear layer, it model will predict the name entity for each token in samples.
3. Questions and answers:
- What is the precision, recall, and F1 score on the validation data?
- precision = 91.67769688947716, recall = 93.25143049478291, f1 = 92.45786751209747- What are the precision, recall, and F1 score on the test data?
- precision = 91.67769688947716, recall = 93.25143049478291, f1 = 92.45786751209747- BiLSTM with Glove Embeddings outperforms the model without. Can you provide a rationale for this?
- At first, the glove is a bigger vocab than the word2idx, so it will map less unknown words. Secondly, since I added a new embedding layer, the model can better capture whether the word has been capitalized. 

## Task3: Transformer

In [1180]:
import pandas as pd
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    
    def __init__(self,data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input = torch.tensor(self.data[index]['input_ids'], dtype=torch.long ) 
        target = torch.tensor(self.data[index]['ner_tags'], dtype=torch.long)
        
        return input, target

# Create an instance of the CustomDataset
dataset_train = CustomDataset(dataset['train'])
dataset_test = CustomDataset(dataset['test'])
dataset_val = CustomDataset(dataset['validation'])

# Example: Accessing a single sample
print(dataset_train[2])

(tensor([12, 13]), tensor([5, 0]))


In [1181]:
def custom_collate(batch):
    input_ids, label = zip(*batch)
    padded_label = pad_sequence(label, batch_first=True, padding_value=9 )
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0 )
    return padded_input_ids, padded_label
    

In [1182]:
batch_size = 32
train_loader = DataLoader(dataset_train, batch_size=batch_size, collate_fn= custom_collate, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=batch_size, collate_fn= custom_collate, shuffle=False)
val_loader = DataLoader(dataset_val, batch_size=batch_size, collate_fn= custom_collate, shuffle=False)

In [1183]:
# inspect the trainloader 
for batch in train_loader:
    inputs, labels = batch
    break

In [1184]:
import torch
import torch.nn as nn

class TransformerNERModel(nn.Module):
    def __init__(self, vocab_size, tag_vocab_size, embed_size=128, num_heads=8, max_seq_length=128, ff_dim=128, num_encoder_layers=6,
                dropout=0.33):
        super(TransformerNERModel, self).__init__()
        
        # Token embedding layer
        self.embedding = TokenEmbedding(vocab_size, embed_size)
        
        # Positional encoding
        self.positional_encoder = PositionalEncoding(emb_size= embed_size, maxlen=max_seq_length)
        
        # Transformer Encoder
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_size,
                nhead=num_heads,
                dim_feedforward=ff_dim,
                batch_first=True
            ),
            num_layers=num_encoder_layers,
        )
        
        # Linear layer for classification
        self.fc = nn.Linear(embed_size, tag_vocab_size)
    
    def forward(self, src, src_padding_mask):
        # Token embedding
        x = self.embedding(src)
        
        # Add positional encoding
        x = self.positional_encoder(x)
        
        # Transformer encoder
        x = self.transformer_encoder(x, src_key_padding_mask=src_padding_mask)
        
        # Final linear layer for classification
        x = self.fc(x)
        
        return x

class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float =0.33,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Initialize the model
vocab_size = max(word2idx.values())+1# Your vocabulary size
tag_vocab_size = 9 # Your tag vocabulary size
model = TransformerNERModel(vocab_size, tag_vocab_size)

criterion = nn.CrossEntropyLoss(ignore_index=9)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [1187]:
# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    # Iterate over your training data in batches
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass + src_padding_mask
        src_padding_mask = (inputs == 0).float()
        outputs = model(inputs, src_padding_mask= src_padding_mask)  
        
        # Flatten the outputs and targets for the loss calculation
        outputs = outputs.view(-1, 9)
        targets = targets.view(-1)
        
        # Calculate the loss
        loss = criterion(outputs, targets)
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print the average loss for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {avg_loss:.4f}")

    print('validation error: ')
    precision, recall, f1 = eval(model, val_loader)

Epoch [1/20] - Loss: 0.5256
validation error: 
processed 51362 tokens with 5942 phrases; found: 3827 phrases; correct: 1857.
accuracy:  29.40%; (non-O)
accuracy:  87.05%; precision:  48.52%; recall:  31.25%; FB1:  38.02
              LOC: precision:  58.47%; recall:  52.04%; FB1:  55.07  1635
             MISC: precision:  64.33%; recall:  23.86%; FB1:  34.81  342
              ORG: precision:  53.95%; recall:  17.30%; FB1:  26.20  430
              PER: precision:  31.62%; recall:  24.38%; FB1:  27.53  1420
Epoch [2/20] - Loss: 0.4477
validation error: 
processed 51362 tokens with 5942 phrases; found: 3723 phrases; correct: 2158.
accuracy:  34.28%; (non-O)
accuracy:  88.53%; precision:  57.96%; recall:  36.32%; FB1:  44.66
              LOC: precision:  75.40%; recall:  54.38%; FB1:  63.19  1325
             MISC: precision:  72.88%; recall:  41.97%; FB1:  53.27  531
              ORG: precision:  48.84%; recall:  29.83%; FB1:  37.04  819
              PER: precision:  35.50%; recall:

In [1191]:
# 3 more epochs
for epoch in range(3):
    model.train()
    total_loss = 0.0
    
    # Iterate over your training data in batches
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass + src_padding_mask
        src_padding_mask = (inputs == 0).float()
        outputs = model(inputs, src_padding_mask= src_padding_mask)  
        
        # Flatten the outputs and targets for the loss calculation
        outputs = outputs.view(-1, 9)
        targets = targets.view(-1)
        
        # Calculate the loss
        loss = criterion(outputs, targets)
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print the average loss for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+22}/{25}] - Loss: {avg_loss:.4f}")

    print('validation error: ')
    precision, recall, f1 = eval(model, val_loader)

Epoch [22/25] - Loss: 0.1733
validation error: 
processed 51362 tokens with 5942 phrases; found: 5506 phrases; correct: 3568.
accuracy:  59.12%; (non-O)
accuracy:  92.24%; precision:  64.80%; recall:  60.05%; FB1:  62.33
              LOC: precision:  83.40%; recall:  69.73%; FB1:  75.96  1536
             MISC: precision:  76.79%; recall:  71.04%; FB1:  73.80  853
              ORG: precision:  57.69%; recall:  53.69%; FB1:  55.62  1248
              PER: precision:  48.80%; recall:  49.51%; FB1:  49.15  1869
Epoch [23/25] - Loss: 0.1700
validation error: 
processed 51362 tokens with 5942 phrases; found: 5337 phrases; correct: 3540.
accuracy:  56.79%; (non-O)
accuracy:  92.03%; precision:  66.33%; recall:  59.58%; FB1:  62.77
              LOC: precision:  84.71%; recall:  69.95%; FB1:  76.62  1517
             MISC: precision:  76.77%; recall:  70.61%; FB1:  73.56  848
              ORG: precision:  55.55%; recall:  57.87%; FB1:  56.68  1397
              PER: precision:  52.57%; rec

In [1192]:
print(f"Validation: precision = {precision}, recall = {recall}, f1 = {f1}")

Validation: precision = 61.58337626652928, recall = 60.35005048805117, f1 = 60.96047598810029


In [1193]:
# SAVE THE MODEL
torch.save(model.state_dict(), 'task3.pth')

In [1186]:
#evaluation
def eval(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs, targets = batch
            #get rid of paddings on targets
            label_unpad = targets
            mask = label_unpad != 9
            label_unpad = label_unpad[mask]
            src_padding_mask = (inputs == 0).float()
                # print('size match:', src_padding_mask.size() == inputs.size())
            outputs = model(inputs,src_padding_mask=src_padding_mask)
            _, preds = torch.max(outputs, -1)
            #get rid of paddings on pred
            preds = preds[mask]
            
            preds_converted = [reversed_ner_tags[elem.item()] for elem in preds]
            targets_converted = [reversed_ner_tags[elem.item()] for elem in label_unpad]
            all_preds.extend(preds_converted)
            all_labels.extend(targets_converted)
    # all_preds = list(chain.from_iterable(all_preds))
    # all_labels = list(chain.from_iterable(all_labels))
    # all_labels = torch.cat(all_labels)
    # all_preds = itertools.chain(*all_preds)    
    # all_labels =itertools.chain(*all_labels)
    result = evaluate(all_labels, all_preds,verbose=True)
    precision, recall, f1 = result[0], result[1],result[2]
    return precision, recall, f1



In [1194]:
print('Test: ')
precision, recall, f1 = eval(model, test_loader)

Test: 
processed 46435 tokens with 5648 phrases; found: 5360 phrases; correct: 2830.
accuracy:  50.48%; (non-O)
accuracy:  89.56%; precision:  52.80%; recall:  50.11%; FB1:  51.42
              LOC: precision:  79.85%; recall:  64.15%; FB1:  71.14  1340
             MISC: precision:  67.76%; recall:  61.97%; FB1:  64.73  642
              ORG: precision:  49.84%; recall:  45.88%; FB1:  47.77  1529
              PER: precision:  30.45%; recall:  34.82%; FB1:  32.49  1849


In [1195]:
print(f"Test: precision = {precision}, recall = {recall}, f1 = {f1}")

Test: precision = 52.79850746268657, recall = 50.106232294617556, f1 = 51.417151162790695


### Solution to task 3

1. Hyperparameters:
- embedding_dim = 100
- hidden_dim = 256 
- output_dim = 128
- num_layers = 1
- dropout = 0.33
- optimizer learning rate= 0.001
- ignore_index = 9
- batch_size = 32
2. Solution:
    Same as task, we still use input_ids as the input. The dataloader will pad 0 to input and 9 to ner_tags. Next, the first layer of the model is an embedding layer, which convert each token into 128-d vector. The positional encoder is a self-attention layer which will generate a sequence as output. For src_padding_mask, it will identify all the padded values and get rid of their impact. Next, the bacthes will be thrown to the transformer encoder and a FFN to predict the results.
3. Questions and answers:
- What is the precision, recall, and F1 score on the validation data?
- precision = 61.58337626652928, recall = 60.35005048805117, f1 = 60.96047598810029
- What are the precision, recall, and F1 score on the test data?
- precision = 52.79850746268657, recall = 50.106232294617556, f1 = 51.417151162790695- What is the reason behind the poor performance of the transformer?
- At first, the transformer typically require big amout of data. Since the word2idx is too small, it cannot generalize well onto the new data. Secondly, the other problem of the small dataset is that the model will probably overfit.