In [1]:
!pip install sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util, models, evaluation
from transformers import AutoTokenizer, AutoModel
import torch

In [4]:
import csv
from post_parser_record import PostParserRecord

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test

dic_similar_questions, lst_all_test = read_tsv_test_data("duplicate_questions.tsv")
post_reader = PostParserRecord("Posts_law.xml")

In [4]:
## https://huggingface.co/nlpaueb/legal-bert-base-uncased
## https://www.sbert.net/examples/applications/computing-embeddings/README.html?highlight=autotokenizer


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model1_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask



#Sentences we want sentence embeddings for
sentences = ['If a company agrees to pay travel cost for a job interview, is the promise binding and enforceable?',
             'Is a job offer letter sent and accepted by email, legally binding?']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
cos = torch.nn.CosineSimilarity(dim=0)
output = cos(sentence_embeddings[0], sentence_embeddings[1])
print(output)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(0.8943)


In [5]:
from sentence_transformers import SentenceTransformer, util
import torch

# in question one, we are using the pre-trained model on quora with no further fine-tuning
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

lst_test_question_ids = list(dic_similar_questions.keys())

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
  question = post_reader.map_questions[question_id]
  text = question.title
  q_id = question.post_id
  corpus.append(text)
  index_to_question_id[idx] = question_id
  idx += 1

# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

top_k = 100

p1 = 0

for question_id in lst_test_question_ids:

  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  # We use cosine-similarity and torch.topk to find the highest scores
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=2)
  isPrinted = False;
  for score, idx in zip(top_results[0], top_results[1]):
    index = int(idx)
    # printing question id and similarity score
    if question_id != index_to_question_id[index] and not isPrinted:
      # print(str(question_id), "->", index_to_question_id[index], "(Score: {:.4f})".format(score))
      if index_to_question_id[index] in dic_similar_questions[question_id]:
        p1 = p1 + 1
      isPrinted = True

print("P@1 ignoring question of itself: " + str(round(p1 / len(lst_test_question_ids), 3)))
p1 = 0

for question_id in lst_test_question_ids:

  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  # We use cosine-similarity and torch.topk to find the highest scores
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=1)
  isPrinted = False;
  for score, idx in zip(top_results[0], top_results[1]):
    index = int(idx)
    if index_to_question_id[index] in dic_similar_questions[question_id]:
      p1 = p1 + 1

print("P@1 keep same question: " + str(round(p1 / len(lst_test_question_ids), 3)))

mrrSum = 0

for question_id in lst_test_question_ids:
  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  # We use cosine-similarity and torch.topk to find the highest 5 scores
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=top_k)
  totalCount = 0
  for score, idx in zip(top_results[0], top_results[1]):
    totalCount = totalCount + 1
    if index_to_question_id[index] in dic_similar_questions[question_id]:
      mrrSum = mrrSum + (1 / totalCount)
      break
    index = int(idx)
    # printing question id and similarity score
    # print(index_to_question_id[index], "(Score: {:.4f})".format(score))
print("Mean Reciprocal Rank from 100 queries: " + str(round(mrrSum / len(lst_test_question_ids), 3)))



Batches:   0%|          | 0/756 [00:00<?, ?it/s]

P@1 ignoring question of itself: 0.156
P@1 keep same question: 0.011
Mean Reciprocal Rank from 100 queries: 0.095


Used for testing P@1 value and MRR for fine-tuning

In [6]:
def modelTest(model1, dic_similar_question_test):

  # Indexing (embedding) the 
  corpus_embeddings1 = model1.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

  top_k = 100

  p1 = 0

  lst_test_question_ids = list(dic_similar_question_test.keys())

  for question_id in lst_test_question_ids:

    query_text = post_reader.map_questions[question_id].title
    query_embedding = model1.encode(query_text, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings1)[0]
    top_results = torch.topk(cos_scores, k=2)
    isPrinted = False;
    for score, idx in zip(top_results[0], top_results[1]):
      index = int(idx)
      # printing question id and similarity score
      if question_id != index_to_question_id[index] and not isPrinted:
        # print(str(question_id), "->", index_to_question_id[index], "(Score: {:.4f})".format(score))
        if index_to_question_id[index] in dic_similar_question_test[question_id]:
          p1 = p1 + 1
        isPrinted = True

  print("P@1 ignoring question of itself: " + str(round(p1 / len(lst_test_question_ids), 4)))
  p1 = 0

  for question_id in lst_test_question_ids:

    query_text = post_reader.map_questions[question_id].title
    query_embedding = model1.encode(query_text, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings1)[0]
    top_results = torch.topk(cos_scores, k=1)
    isPrinted = False;
    for score, idx in zip(top_results[0], top_results[1]):
      index = int(idx)
      if index_to_question_id[index] in dic_similar_question_test[question_id]:
        p1 = p1 + 1

  print("P@1 keep same question: " + str(round(p1 / len(lst_test_question_ids), 4)))

  mrrSum = 0
  mrrLst = []

  for question_id in lst_test_question_ids:
    query_text = post_reader.map_questions[question_id].title
    query_embedding = model1.encode(query_text, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings1)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    totalCount = 0
    found = False
    for score, idx in zip(top_results[0], top_results[1]):
      totalCount = totalCount + 1
      if index_to_question_id[index] in dic_similar_question_test[question_id]:
        mrrSum = mrrSum + (1 / totalCount)
        mrrLst.append(1 / totalCount)
        found = True
        break
      index = int(idx)
    if not found:
        mrrLst.append(0)
        
  print("Mean Reciprocal Rank from 100 queries: " + str(round(mrrSum / len(lst_test_question_ids), 4)))

  return mrrLst


Fine-tuning distilbert-base-nli-stsb-quora-ranking with lost function

In [8]:
from torch.utils.data import DataLoader
import random
import statistics
import math

train_samples_MultipleNegativesRankingLoss = []
train_batch_size = 128
num_epochs = 5
model_save_path = "owenBeanModel.bin"
margin = 2
distance_metric = 2

model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

count = 0

dic_similar_question_test = {}

questionMap = list(post_reader.map_questions.keys())

for question_id in lst_test_question_ids:

  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  if count > len(lst_test_question_ids) * 0.9:
    for duplicate_id in dic_similar_questions[question_id]:
      train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[post_reader.map_questions[question_id].title, post_reader.map_questions[duplicate_id].title], label=1))
    
    index = random.randint(0, len(post_reader.map_questions) - 1)

    if questionMap[index] not in dic_similar_questions[question_id]:
      train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[post_reader.map_questions[question_id].title, post_reader.map_questions[questionMap[index]].title], label=0))
  else:
    dic_similar_question_test[question_id] = dic_similar_questions[question_id]

  count = count + 1


# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(train_dataset_MultipleNegativesRankingLoss, shuffle=True, batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

# Train the model
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss)],
          epochs=num_epochs,
          warmup_steps=1000,
          output_path=model_save_path
          )

print("Fine-tune model loss function:")
fine_tune_mrr_list = modelTest(model, dic_similar_question_test)

model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

print("Pre-trained model:")
pretrained_mrr_list = modelTest(model, dic_similar_question_test)

fine_tune_mrr = 0
for mrr in fine_tune_mrr_list:
  fine_tune_mrr = fine_tune_mrr + mrr
fine_tune_mrr = fine_tune_mrr / len(fine_tune_mrr_list)

pretrained_mrr = 0
for mrr in pretrained_mrr_list:
  pretrained_mrr = pretrained_mrr + mrr
pretrained_mrr = pretrained_mrr / len(pretrained_mrr_list)


fine_tune_deviation = statistics.stdev(fine_tune_mrr_list)
pretrained_deviation = statistics.stdev(pretrained_mrr_list)

b_a = fine_tune_mrr - pretrained_mrr
b_a_standard = fine_tune_deviation - pretrained_deviation
square_rt_n = math.sqrt(len(pretrained_mrr_list))
print("B - A: " + str(b_a))
print("standard dev: " + str(b_a_standard))
print("Square root n: " + str(square_rt_n))
t = (b_a / b_a_standard) * square_rt_n
print("t: " + str(round(t, 4)))





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Fine-tune model loss function:


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

P@1 ignoring question of itself: 0.1417
P@1 keep same question: 0.0118
Mean Reciprocal Rank from 100 queries: 0.0931
Pre-trained model:


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

P@1 ignoring question of itself: 0.1417
P@1 keep same question: 0.0118
Mean Reciprocal Rank from 100 queries: 0.0925
B - A 0.0006575965014532781
standard dev: 0.001447521727046236
Square root n: 15.937377450509228
t: 7.2402
