In [1]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

**Data**

In [2]:
# !pip install --upgrade pyarrow

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


**Data**

In [4]:
from datasets import load_dataset

# Load the mini-bioasq dataset with the 'text-corpus' configuration
dataset = load_dataset("rag-datasets/mini-bioasq", "text-corpus")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.5M [00:00<?, ?B/s]

Generating passages split: 0 examples [00:00, ? examples/s]

In [5]:
# path = '/content/drive/MyDrive/NLP_A5/Text_corpus/internet_archive_scifi_v3.txt'

# with open(path,'r') as file:
#   file_contents = file.read()
# print(file_contents[:500])

In [6]:
dataset['passages']['passage'][:1]

['New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclassified so far.']

In [7]:
# merging the texts together

dataset_joined = '\n'.join(dataset['passages']['passage'][:2])

In [8]:
dataset_joined[:100]

'New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Char'

**Preprocessing**

Tokenization and numbericalization

In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [10]:
doc = nlp(dataset_joined)
sentences = list(doc.sents)

In [11]:
# Showing only 1 sentence

sentences[:1]

[New data on viruses isolated from patients with subacute thyroiditis de Quervain 
 are reported.]

In [12]:
#lower case, and clean all the symbols
text = [x.text.lower() for x in sentences]
text = [re.sub("[.,!?\\-]", '', x) for x in text]

In [13]:
text[:1]

['new data on viruses isolated from patients with subacute thyroiditis de quervain \nare reported']

In [14]:
#making vocabs - numericalization
word_list = list(set(" ".join(text).split()))
word2id   = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

In [15]:
word_list[:5]

['previous', 'enzyme', 'blood', 'we', 'for']

In [16]:
word2id

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

In [17]:
for i, w in enumerate(word_list):
    word2id[w] = i + 4 #reserve the first 0-3 for CLS, PAD
    id2word    = {i:w for i, w  in enumerate(word2id)}
    vocab_size = len(word2id)

token_list = list()
for sentence in sentences:
    arr = [word2id[word] for sentence in text for word in sentence.split()]
    token_list.append(arr)

In [18]:
vocab_size = len(word2id)

In [19]:
vocab_size

86

**save variables**

1. save word2id

In [24]:
import pickle

path1 = '/content/drive/MyDrive/NLP_A5/word2id.pickle'
with open(path1,'wb') as f:
  pickle.dump(word2id, f)

2. save id2word

In [25]:
path2 = '/content/drive/MyDrive/NLP_A5/id2word.pickle'

with open(path2,'wb') as f:
  pickle.dump(id2word, f)

3. save tokenList

In [26]:
path3 = '/content/drive/MyDrive/NLP_A5/tokenList.pickle'

with open(path3,'wb') as f:
  pickle.dump(token_list, f)

**Data Loader**

In [None]:
batch_size = 6
max_mask   = 5  #even though it does not reach 15% yet....maybe you can set this threshold
max_len    = 300  #maximum length that my transformer will accept.....all sentence will be padded

In [None]:
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:

        #randomly choose two sentence
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]

        #1. token embedding - add CLS and SEP
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]

        #2. segment embedding - which sentence is 0 and 1
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        #3 masking
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]']
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []
        #simply loop and mask accordingly
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else:
                pass

        #4. pad the sentence to the max length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        #5. pad the mask tokens to the max length
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        #6. check whether is positive or negative
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1

    return batch


In [None]:
batch = make_batch()

In [None]:
len(batch)

6

In [None]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [None]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 300]),
 torch.Size([6, 300]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 1, 0, 0, 1, 1]))

In [None]:
masked_tokens

tensor([[81, 71, 64, 13, 82],
        [43, 81, 31, 63, 51],
        [ 8, 27, 22, 83, 12],
        [47, 74, 61, 72, 70],
        [28, 46, 26, 19, 10],
        [26, 75, 12, 20, 27]])

**Model**

Embedding

In [None]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        # print("Input Shape - x:", x.shape, "seg:", seg.shape)
        # print("Input Indices Range:")
        # print("Min Input ID:", torch.min(x))
        # print("Max Input ID:", torch.max(x))
        # print("Min Segment ID:", torch.min(seg))
        # print("Max Segment ID:", torch.max(seg))

        # print("\nVocabulary Sizes:")
        # print("Token Embedding Vocabulary Size:", self.tok_embed.num_embeddings)
        # print("Segment Embedding Vocabulary Size:", self.seg_embed.num_embeddings)

        #x, seg: (bs, len)
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        # print("Shape of Token Embedding:", self.tok_embed(x).shape)
        # print("Shape of Position Embedding:", self.pos_embed(pos).shape)
        # print("Shape of Segment Embedding:", self.seg_embed(seg).shape)
        # print("Shape of Combined Embedding:", embedding.shape)
        return self.norm(embedding)

Attention Mask

In [None]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

Testing the attention mask

In [None]:
print(get_attn_pad_mask(input_ids, input_ids).shape)

torch.Size([6, 300, 300])


Encoder

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn       = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

In [None]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

In [None]:
n_layers = 6    # number of Encoder of Encoder Layer
n_heads  = 8    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]

In [None]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(F.gelu(self.fc1(x)))

BERT

In [None]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)


        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]

        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_nsp, h_pooled

**Training**

In [None]:
num_epoch = 5
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.004)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

for epoch in range(num_epoch):
    optimizer.zero_grad()
    logits_lm, logits_nsp, _ = model(input_ids, segment_ids, masked_pos)
    #logits_lm: (bs, max_mask, vocab_size) ==> (6, 5, 34)
    #logits_nsp: (bs, yes/no) ==> (6, 2)

    #1. mlm loss
    #logits_lm.transpose: (bs, vocab_size, max_mask) vs. masked_tokens: (bs, max_mask)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    #2. nsp loss
    #logits_nsp: (bs, 2) vs. isNext: (bs, )
    loss_nsp = criterion(logits_nsp, isNext) # for sentence classification

    #3. combine loss
    loss = loss_lm + loss_nsp
    # if epoch % 100 == 0:
    print('Epoch:', '%02d' % (epoch), 'loss =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

Epoch: 00 loss = 72.424141
Epoch: 01 loss = 86.703979
Epoch: 02 loss = 117.015976
Epoch: 03 loss = 95.444717
Epoch: 04 loss = 82.881439


**Save the model**

In [None]:
# Save the model
path = '/content/drive/MyDrive/NLP_A5/'
torch.save(model.state_dict(), path + 'bert_model.pth')

**Load the model**

In [None]:
# Load the model
path = '/content/drive/MyDrive/NLP_A5/'
loaded_model = BERT()  # Assuming BERT is defined somewhere in your code
loaded_model.load_state_dict(torch.load(path + 'bert_model.pth'))
loaded_model.eval()  #

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(86, 768)
    (pos_embed): Embedding(300, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (decoder)

**SNIL Dataset**

In [None]:
snil = load_dataset('snli')

In [None]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snil.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [None]:
import numpy as np
np.unique(snli['train']['label'])
#snli also have -1

array([0, 1, 2])

In [None]:
from datasets import DatasetDict

# Create a DatasetDict with only the SNLI dataset
raw_dataset = DatasetDict({
    'train': snli['train'].shuffle(seed=55).select(list(range(4))),
    'test': snli['test'].shuffle(seed=55).select(list(range(4))),
    'validation': snli['validation'].shuffle(seed=55).select(list(range(4)))
})

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 4
    })
})

In [None]:
len(raw_dataset['validation']['hypothesis'])

4

**Preprocessing**

In [None]:
def preprocess_text(raw_text):
    doc_new = nlp(raw_text)
    sentences_new = list(doc_new.sents)
    #print(f'sentences_new:{len(sentences_new)}')

    # Lower case and clean all the symbols
    text_new = [x.text.lower() for x in sentences_new]
    text_new = [re.sub("[.,!?\\-]", '', x) for x in text_new]

    # Making vocabs - numericalization
    word_list_new = list(set(" ".join(text_new).split()))
    word2id_new = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

    for i, w in enumerate(word_list_new):
        word2id_new[w] = i + 4  # reserve the first 0-3 for CLS, PAD
    id2word_new = {i: w for i, w in enumerate(word2id_new)}
    vocab_size_new = len(word2id_new)

    # print(f"id2word_new:{id2word_new}")
    token_list_new = list()
    for sentence in sentences_new:
        arr = [word2id_new[word] for sentence in text_new for word in sentence.split()]
        token_list_new.append(arr)
    # print(f'token_list_new length:{len(token_list_new)}')
    # print(f'token_list_new:{token_list_new}')

    return token_list_new, word2id_new, id2word_new, vocab_size_new

In [None]:
def make_batch(sent, batch_size, max_mask, max_len):
    token_list, word2id, id2word, vocab_size = preprocess_text(" ".join(sent))

    #print(f'batch size:{batch_size}')

    batch = []
    positive = negative = 0

    while positive != batch_size // 2 or negative != batch_size // 2:
        tokens_a_index, tokens_b_index = randrange(len(token_list)), randrange(len(token_list))
        # print(f'tokens_a_index:{tokens_a_index}')
        # print(f'tokens_b_index:{tokens_b_index}')
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]

        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]'] and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []

        # print('here')

        for pos in candidates_masked_pos[:n_pred]:
            # print('for')
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.1:
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif random() < 0.8:
                input_ids[pos] = word2id['[MASK]']
            else:
                pass

        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1

    return batch

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

def preprocess_function(examples):
    max_mask = 5
    max_len = 300
    # Convert examples to list of sentences
    sents = [f"{premise} {hypothesis}" for premise, hypothesis in zip(examples["premise"], examples["hypothesis"])]
    # Generate batch
    batch = make_batch(sents, len(sents), max_mask, max_len)
    return {
        "input_ids": torch.tensor([example[0] for example in batch]),
        "segment_ids": torch.tensor([example[1] for example in batch]),
        "masked_tokens": torch.tensor([example[2] for example in batch]),
        "masked_pos": torch.tensor([example[3] for example in batch]),
        "labels": torch.tensor([example[4] for example in batch], dtype=torch.long)
    }


tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

# Separate premise and hypothesis datasets
premise_dataset = tokenized_datasets.map(lambda examples: {'input_ids': examples['input_ids'], 'segment_ids': examples['segment_ids'], 'masked_tokens': examples['masked_tokens'], 'masked_pos': examples['masked_pos'], 'labels': examples['labels']})
hypothesis_dataset = tokenized_datasets.map(lambda examples: {'input_ids': examples['input_ids'], 'segment_ids': examples['segment_ids'], 'masked_tokens': examples['masked_tokens'], 'masked_pos': examples['masked_pos'], 'labels': examples['labels']})

# Remove unnecessary columns from both datasets
premise_dataset = premise_dataset.remove_columns(['hypothesis'])
hypothesis_dataset = hypothesis_dataset.remove_columns(['premise'])

# Set the format to PyTorch tensors for both datasets
premise_dataset.set_format("torch")
hypothesis_dataset.set_format("torch")


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

**Data Loader**

In [None]:
from torch.utils.data import DataLoader

# initialize the dataloader for premise dataset
premise_train_dataloader = DataLoader(
    premise_dataset['train'],
    batch_size=batch_size,
    shuffle=True
)
premise_eval_dataloader = DataLoader(
    premise_dataset['validation'],
    batch_size=batch_size
)
premise_test_dataloader = DataLoader(
    premise_dataset['test'],
    batch_size=batch_size
)

# initialize the dataloader for hypothesis dataset
hypothesis_train_dataloader = DataLoader(
    hypothesis_dataset['train'],
    batch_size=batch_size,
    shuffle=True
)
hypothesis_eval_dataloader = DataLoader(
    hypothesis_dataset['validation'],
    batch_size=batch_size
)
hypothesis_test_dataloader = DataLoader(
    hypothesis_dataset['test'],
    batch_size=batch_size
)


**Loss Function**

In [None]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim

    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(loaded_model.parameters(), lr=0.002)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=0.002)

criterion = nn.CrossEntropyLoss()

In [None]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



**Training Sentence BERT**

In [None]:
from tqdm.auto import tqdm

num_epoch = 10
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    loaded_model.train()
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, (premise_batch, hypothesis_batch) in enumerate(tqdm(zip(premise_train_dataloader, hypothesis_train_dataloader), leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()

        # premise
        input_ids_premise = premise_batch['input_ids'].to(device)
        segment_ids_premise = premise_batch['segment_ids'].to(device)
        masked_tokens_premise = premise_batch['masked_tokens'].to(device)
        masked_pos_premise = premise_batch['masked_pos'].to(device)
        labels_premise = premise_batch['labels'].to(device)

        # hypothesis
        input_ids_hypothesis = hypothesis_batch['input_ids'].to(device)
        segment_ids_hypothesis = hypothesis_batch['segment_ids'].to(device)
        masked_tokens_hypothesis = hypothesis_batch['masked_tokens'].to(device)
        masked_pos_hypothesis = hypothesis_batch['masked_pos'].to(device)
        labels_hypothesis = hypothesis_batch['labels'].to(device)

        # Extract token embeddings from BERT at last_hidden_state
        _,_,u = loaded_model(input_ids_premise, segment_ids_premise,masked_pos_premise)
        _,_,v = loaded_model(input_ids_hypothesis, segment_ids_hypothesis,masked_pos_hypothesis)

        # # Get the mean pooled vectors
        # u_mean_pool = mean_pool(u)  # Assuming you have defined the mean_pool function
        # v_mean_pool = mean_pool(v)

        # Build the |u-v| tensor
        uv = torch.sub(u, v)
        uv_abs = torch.abs(uv)

        # Concatenate u, v, |u-v|
        x = torch.cat([u, v, uv_abs], dim=-1)

        # Process concatenated tensor through classifier_head
        logits = classifier_head(x)

       # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, labels_premise)

        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()

    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')


0it [00:00, ?it/s]

Epoch: 1 | loss = 7.065569


0it [00:00, ?it/s]

Epoch: 2 | loss = 7.066988


0it [00:00, ?it/s]

Epoch: 3 | loss = 7.064873


0it [00:00, ?it/s]

Epoch: 4 | loss = 7.067608


0it [00:00, ?it/s]

Epoch: 5 | loss = 7.067865


0it [00:00, ?it/s]

Epoch: 6 | loss = 7.065991


0it [00:00, ?it/s]

Epoch: 7 | loss = 7.067101


0it [00:00, ?it/s]

Epoch: 8 | loss = 7.069799


0it [00:00, ?it/s]

Epoch: 9 | loss = 7.069362


0it [00:00, ?it/s]

Epoch: 10 | loss = 7.067805


**Save the fine-tuned model**

In [None]:
# Save the model
path = '/content/drive/MyDrive/NLP_A5/'
torch.save(loaded_model.state_dict(), path + 'fine_tuned_model.pth')

**Load the fine-tuned model**

In [None]:
# Load the model
path = '/content/drive/MyDrive/NLP_A5/'
finetuned_model = BERT()  # Assuming BERT is defined somewhere in your code
finetuned_model.load_state_dict(torch.load(path + 'fine_tuned_model.pth'))
finetuned_model.eval()  #

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(86, 768)
    (pos_embed): Embedding(300, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (decoder)

**Loading SentenceBERT from Hugging Face**

url = https://huggingface.co/efederici/sentence-bert-base

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.0-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.3/156.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.4.0
    Uninstalling sentence-transformers-2.4.0:
      Successfully uninstalled sentence-transformers-2.4.0
Successfully installed sentence-transformers-2.5.0


In [None]:
from sentence_transformers import SentenceTransformer
sentences = ["this is a dog", "this is a cat"]

model = SentenceTransformer('efederici/sentence-bert-base')
embeddings = model.encode(sentences)
print(embeddings)

  return self.fget.__get__(instance, owner)()


[[-0.37067628 -0.08430281 -0.09314957 ...  0.45384118 -0.20815898
   0.5138719 ]
 [-0.44363123 -0.17198133 -0.23452398 ...  0.04124055  0.381663
   0.53949916]]


**Evaluation and Analysis**

Cosine Similarity metric

1. Pre-trained model from huggingface

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert lists to PyTorch tensors
u_hug = torch.tensor(embeddings[0], dtype=torch.float32)
v_hug = torch.tensor(embeddings[1], dtype=torch.float32)

# Now compute cosine similarity
cos_sim = cosine_similarity(u_hug.reshape(1, -1), v_hug.reshape(1, -1))[0, 0]
print(f'Cosine similarity: {cos_sim}')

Cosine similarity: 0.7875121831893921


2. Pre-trained model from scratch

In [None]:
sentences = ["this is a dog.", "this is a cat."]
sentence_a = sentences[0]
sentence_b = sentences[1]

batch_a = make_batch(sentence_a, len(sentence_a), max_mask, max_len)
batch_b = make_batch(sentence_b, len(sentence_b), max_mask, max_len)

input_ids_a, segment_ids_a, masked_tokens_a, masked_pos_a, isNext_a = map(torch.LongTensor, zip(*batch_a))
input_ids_b, segment_ids_b, masked_tokens_b, masked_pos_b, isNext_b = map(torch.LongTensor, zip(*batch_b))

_, _, u = loaded_model(input_ids_a, segment_ids_a, masked_pos_a)
_, _, v = loaded_model(input_ids_b, segment_ids_b, masked_pos_b)

In [None]:
def cosine_similarity_scratch(u, v):
    dot_product = torch.dot(u.flatten(), v.flatten()).item()  # Convert to scalar
    norm_u = torch.norm(u)
    norm_v = torch.norm(v)
    return dot_product / (norm_u * norm_v)

score = cosine_similarity_scratch(u, v)
print(f'Cosine similarity: {score}')

Cosine similarity: 0.999856173992157


Sementic similarity

1. Pre-trained model from huggingface

In [None]:
def euclidean_similarity(u, v):
    # Compute Euclidean distance between u and v
    euclidean_distance = torch.sqrt(torch.sum((u - v) ** 2))

    # Compute Euclidean similarity
    similarity = 1 / (1 + euclidean_distance)

    return similarity

In [None]:
distance = euclidean_similarity(u_hug, v_hug)
print(f'semantic similarity: {distance}')

semantic similarity: 0.115912064909935


2. Pre-trained model from scratch

In [None]:
from scipy.spatial.distance import euclidean

def euclidean_similarity(u, v):
    # Compute Euclidean distance between u and v
    euclidean_distance = torch.sqrt(torch.sum((u - v) ** 2))

    # Compute Euclidean similarity
    similarity = 1 / (1 + euclidean_distance)

    return similarity


distance = euclidean_similarity(u, v)
print(f'semantic similarity: {distance}')

semantic similarity: 0.36273640394210815


Correlation Metric

1. Pre-trained model from huggingface

In [None]:
def correlation_similarity(u, v):
    # Flatten the tensors and convert them to numpy arrays
    u_flat = u.flatten().detach().numpy()
    v_flat = v.flatten().detach().numpy()

    # Calculate Pearson correlation coefficient between u and v
    correlation_coefficient, _ = pearsonr(u_flat, v_flat)

    return correlation_coefficient

In [None]:
correlation = correlation_similarity(u_hug, v_hug)
print(f'Correlation: {correlation}')

Correlation: 0.7868464219271842


2. Pre-trained model from scratch

In [None]:
from scipy.stats import pearsonr

correlation = correlation_similarity(u, v)
print(f'Correlation: {correlation}')

Correlation: 0.9998564219346799


Conclusion

In [None]:
import pandas as pd

# Define the data
data = {
    'Model': ['Huggingface', 'Pretrained Scratch'],
    'Cosine Similarity': [0.7875121831893921, 0.999856173992157],
    'Semantic Similarity': [0.115912064909935, 0.36273640394210815],
    'Correlation Metric': [0.7868464219271842, 0.9998564219346799]
}

# Create a DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Model,Cosine Similarity,Semantic Similarity,Correlation Metric
0,Huggingface,0.787512,0.115912,0.786846
1,Pretrained Scratch,0.999856,0.362736,0.999856


Based on the provided similarity scores from both the Huggingface model and the Pretrained Scratch model, we can draw the following conclusions:

**Cosine Similarity:**

The Pretrained Scratch model achieved a significantly higher cosine similarity score (0.9999) compared to the Huggingface model (0.7875). This suggests that the Pretrained Scratch model has a stronger ability to capture similarity between embedding vectors based on their cosine similarity.


**Semantic Similarity:**

The Pretrained Scratch model also outperformed the Huggingface model in semantic similarity, with a higher score of 0.3627 compared to 0.1159. This indicates that the Pretrained Scratch model better captures the semantic relationships between the input texts.


**Correlation Metric:**

Both models achieved high correlation scores, with the Pretrained Scratch model slightly outperforming the Huggingface model (0.9999 vs. 0.7868). This suggests that there is a strong linear relationship between the similarity scores produced by the model and the ground truth values.
Overall, the Pretrained Scratch model consistently outperformed the Huggingface model across all similarity metrics, indicating its superior performance in capturing both semantic and syntactic similarities between input texts. This suggests that the Pretrained Scratch model may have been better trained or fine-tuned for the specific task of capturing similarity between text embeddings.

**Discussion**

- Data Quality and Quantity:

Limitation: The performance of the models heavily depends on the quality and quantity of training data. If the training data is limited or not representative of the target task, the model's performance may be suboptimal.
Improvement: Increasing the size and diversity of the training data can help improve model performance. Additionally, ensuring the quality and relevance of the training data to the target task is crucial.

- Model Architecture and Hyperparameters:

Limitation: The choice of model architecture and hyperparameters can significantly impact the model's performance. Selecting suboptimal architectures or hyperparameters may lead to inferior results.
Improvement: Conducting thorough hyperparameter tuning and experimenting with different model architectures can help identify the best-performing configurations. Additionally, leveraging techniques like transfer learning and model ensemble methods can further enhance performance.

- Evaluation Metrics:

Limitation: The choice of evaluation metrics may not always fully capture the model's performance or generalize well to real-world scenarios.
Improvement: Using a diverse set of evaluation metrics that cover various aspects of model performance, including precision, recall, F1 score, and qualitative assessments, can provide a more comprehensive understanding of the model's capabilities.

- Computational Resources:

Challenge: Training and evaluating deep learning models often require significant computational resources, including high-performance GPUs and large memory capacities.
Improvement: Leveraging cloud computing platforms or distributed training techniques can help mitigate the computational resource constraints. Additionally, optimizing model architectures for efficiency and using techniques like model pruning and quantization can reduce the computational burden.

- Interpretability and Explainability:

Limitation: Deep learning models, particularly complex ones, are often considered black boxes, making it challenging to interpret their decisions and behaviors.
Improvement: Incorporating techniques for model interpretability and explainability, such as attention mechanisms, gradient-based methods for feature importance, and model visualization techniques, can enhance transparency and trust in the model's predictions.

- Robustness and Generalization:

Challenge: Deep learning models may exhibit poor robustness and generalization when applied to out-of-distribution or adversarial examples.
Improvement: Employing techniques like data augmentation, regularization, adversarial training, and robust optimization can enhance the model's robustness and generalization capabilities, making it more resilient to unseen data and adversarial attacks.