# Bert and Hybrid Approaches

## Before You Start

### Libraries

In [None]:
# install the essential libraries that are not available on google colab
!pip install transformers
!pip install -qU sentence-transformers

In [3]:
# import libraries 
import codecs
import re
from IPython import display
from tqdm import tqdm
import numpy as np
import pandas as pd

import requests
import time
import yaml
import numpy as np
import torch, gc
from sentence_transformers import models, SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# create the utils folder and upload evaluation.ipynb and preprocess.ipynb if you are using google colab
# ! mkdir utils
%run "./utils/evaluation.ipynb"
%run "./utils/preprocess.ipynb"

### Download Data

In [None]:
# download data an unzip
! wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml -P ./data
! wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip -P ./data
! unzip ./data/doc_collection.zip -d ./data

## Preprocess

### Read and Clean Data
set path of data in `PATH` variable and ground truth in `YAML_PATH` variable

In [6]:
# set path of data
PATH = './data/IR_dataset/'
YAML_PATH = './data/evaluation_IR.yml/'

PATH = PATH.rstrip('/')
YAML_PATH = YAML_PATH.rstrip('/')

# read and clean data
docs = []
for index in range(0, 3258):
    with open(f"{PATH}/{index}.txt", 'r', encoding='utf8') as file_reader:
        doc = file_reader.read()
        doc = clean_text(doc, sentence=True, only_persian=False)
        if doc:
            docs.append(doc)

In [7]:
# sample cleaned text
docs[0]

'برخی از هواداران مصدق یا اعضای جبهه ملی که در زمان نخست وزیری مصدق از جبهه ملی یا از هییت وزیران کنار گذاشته شده یا کنار رفتند پس از جدایی از مصدق به انتقاد از کارنامه وی پرداختند و حتی برای سرنگونی اش تلاش کردند. برخی از این افراد عبارت اند از فضل الله زاهدی نخست وزیر کودتا علی امینی حسین مکی که در آغاز سرباز فداکار وطن نامیده شد ولی در پایان به دلیل مخالفت با مصدق از سوی هواداران جبهه ملی سرباز خطاکار وطن خطاب می شد مظفر بقایی به دلیل اتهام مشارکت در قتل سرتیپ افشار طوس و سپس اتهام شرکت در کودتای مرداد به دستور فرماندار نظامی تهران بازداشت شد و تا روز مرداد در زندان بود. آیت الله کاشانی در تیر در اعتراض به استعفای مصدق تهدید به حکم جهاد کرد ولی در ماه های پایانی ضمن انتقاد شدید از کارنامه دولت از زاهدی پشتیبانی می کرد. او در اعلامیه ای به عنوان تحریم رفراندوم دولت در هشتم مرداد نوشت ابوالحسن حایری زاده در سال در مجلس ضمن مخالفت با لایحه اختیارات ویژه دولت آن را خطر بازگشت دیکتاتوری ساله دانست و سپس در سال بعد با نوشتن نامه به رییس سازمان ملل متحد مصدق را دیکتاتور خواند. این افراد عل

## Semantic Search

### Load Segmented Documents, Queries, Address models

In [8]:
# a function to segment doucumts by number of words
def segment_documents(docs, max_length=1e6):
  
    segmented_docs = []
    # a list indicate what is the original number of the segmented doc
    indices_docs = []

    for index in range(0, len(docs)):
        # split documents bt space
        splitted_words = docs[index].split(" ")
        # if the document length is longer than max_length then split
        if len(splitted_words) > max_length:
            for doc_segment in range(0, len(splitted_words), max_length):
                segmented_docs.append( " ".join(splitted_words[doc_segment:doc_segment + max_length]))
                indices_docs.append(index)
        # if the document is shorter than max_length, then dont split
        else:
            segmented_docs.append(docs[index])
            indices_docs.append(index)

    return segmented_docs, indices_docs

In [9]:
# a function to load model with pooling from hugingface
def load_st_model(model_name_or_path):
    word_embedding_model = models.Transformer(model_name_or_path)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    return model

In [10]:
# segment documents
# set the max words to segment documents
corpus, ind_s = segment_documents(docs, 100000)
print(f"number of segmented documents: {len(corpus)}")

# load evaluation data
evaluation_data = {}
with open(YAML_PATH, "r") as stream:
    try:
        evaluation_data = dict(yaml.safe_load(stream))
    except yaml.YAMLError as exc:
        print(exc)

# query sentences:
queries = list(evaluation_data.keys())

# Distilbert
# BERT FarsTail
# BERT WikiTriplet
# BERT WikiNLI
model_paths= ['HooshvareLab/distilbert-fa-zwnj-base',
              'm3hrdadfi/bert-fa-base-uncased-wikinli-mean-tokens',
              'm3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens']


number of segmented documents: 3258


### Load Model 

In [11]:
# check if gpu is on
torch.cuda.is_available()

True

In [12]:
# Load the Sentence-Transformer
# model_paths[0] -> HooshvareLab/distilbert-fa-zwnj-base
# model_paths[1] -> m3hrdadfi/bert-fa-base-uncased-wikinli-mean-tokens
model = load_st_model(model_paths[0])

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/289M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [13]:
# release the memory of GPU

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Thu Dec 30 18:44:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Compute Embeddings

In [14]:
# compute embeddings by the selected model
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

### Evaluation 

In [15]:
# a function to sort top result based on the bert model 
# it also has some functionality to sort segmented document from an original document based on their score and index 
def bert_top_k(model, corpus_embeddings, query, ind_s, top_k, alpha=0, beta=1):

    # compute query embedings
    query_embedding = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    # find scores between query and emedings
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # extract top results and save in top_relative_indices
    top_results = torch.topk(cos_scores, k=top_k)
    top_relative_indices = list(top_results.indices.numpy())

    # FOR SEGMENTED DOUCMNENTS:
    ## a map between segmented documents and the original ones
    ## if the document is not segmented then the result will be unchanged 

    top_real_indices = list(np.array(ind_s)[top_relative_indices])
    indices_info = {}

    for key in list(set(top_real_indices)):
        indices_info[key] = {'number': 0, 'mean': 0, 'sum': 0, 'level': 0}

    idx = 1
    for score, key in zip(top_results[0], top_real_indices):
        indices_info[key]['level'] = indices_info[key]['level'] + 1/idx
        indices_info[key]['number'] = indices_info[key]['number'] + 1
        indices_info[key]['sum'] = indices_info[key]['sum'] + score.numpy()
        indices_info[key]['mean'] = indices_info[key]['sum'] / indices_info[key]['number']
        idx +=1

    # compute joint score of level and mean of scores
    joint_score = []    
    for key in indices_info.keys():
        score_combined = alpha * indices_info[key]['level'] + beta * indices_info[key]['mean']
        joint_score.append((key, score_combined)) 

    # sort final scores
    joint_score_sorted = sorted(joint_score, key=lambda key: key[1], reverse=True)
  
    return np.array([index for index, score in joint_score_sorted])

In [16]:
# Compute MRR metric
def MMR_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k, alpha, beta):
    relevent_match_list = []

    for query_i in tqdm(queries):
        
        top_results_indices_i = bert_top_k(model, corpus_embeddings, query_i, ind_s, top_k=top_k, alpha=alpha, beta=beta)
        boolian_match = (np.array(top_results_indices_i) == evaluation_data[query_i]['relevant'][0]).astype(int)
        relevent_match_list.append(list(boolian_match))
    
    return mean_reciprocal_rank(relevent_match_list)

In [17]:
# Compute PATK metric
def PATK_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, k, top_k, alpha, beta):
    patk = []

    for query_i in tqdm(queries):
        
        top_results_indices_i = bert_top_k(model, corpus_embeddings, query_i, ind_s, top_k=top_k, alpha=alpha, beta=beta)
        boolian_match = np.isin(top_results_indices_i, (evaluation_data[query_i]['relevant'] + evaluation_data[query_i]['similar_high'])).astype(int)
        patk.append(precision_at_k(boolian_match, k))
    
    return sum(patk)/len(patk)

In [18]:
# Compute MAP metric
def MAP_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k, alpha, beta):
    map_i = []

    for query_i in tqdm(queries):
        
        top_results_indices_i = bert_top_k(model, corpus_embeddings, query_i, ind_s, top_k=top_k, alpha=alpha, beta=beta)
        boolian_match = np.isin(top_results_indices_i, (evaluation_data[query_i]['relevant'] + evaluation_data[query_i]['similar_high'])).astype(int)
        map_i.append(list(boolian_match))
    
    return mean_average_precision(map_i)

In [19]:
mrr_score = MMR_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k=100, alpha=0, beta=1)
print(f"\n {mrr_score}")

100%|██████████| 150/150 [00:02<00:00, 64.38it/s]


 0.2997236628197085





In [20]:
pat10_score = PATK_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, k=10, top_k=100, alpha=0, beta=1)
print(f"\n {pat10_score}")

100%|██████████| 150/150 [00:01<00:00, 88.38it/s]


 0.2560000000000002





In [21]:
map_score = MAP_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k=100, alpha=0, beta=1)
print(f"\n {map_score}")

100%|██████████| 150/150 [00:01<00:00, 90.94it/s]


 0.3961095761108741





## Hybrid Approach

### Compute TFIDF Modified

In [22]:
## compute TFIDF of documents (segments) 
min_df = 1
max_df=0.5
max_features=10000
ngram_range = (1,2)
model_tfidf = TfidfVectorizer(analyzer="word", min_df=min_df, max_df=max_df, max_features=max_features, ngram_range=ngram_range)
matrix = model_tfidf.fit_transform(corpus)

In [23]:
# a tfidf function to get top k documnet and their score, this score can ba modified with 2 method named l1 and l2
def tfidf_top_k(query, k=2, method='l2', base_line = 0.75):
    
    query_tfidf = model_tfidf.transform([query])

    # Stores cosine similarity scores
    doc_scores = []

    # Compute the cosine similarity scores
    for doc in matrix:
        doc_scores.append(cosine_similarity(query_tfidf, doc)[0][0])

    # Sort list of doc_scores and return the top k indices of highest scores
    sorted_scores = sorted(enumerate(doc_scores), key=lambda ind_score: ind_score[1], reverse=True)

    # if k=-1: then return all
    if k!=-1:
        sorted_scores = sorted_scores[:k]

    min = sorted_scores[-1][1]
    max = sorted_scores[0][1]
    coef_ = 1/max

    # 2 methods for scaling the scores
    ## method l1: scale all scores by first score, first score will be 1
    ## method l2: scale with baseline + (score - min)/(max - min)[in range 0:1-baseline]
    if method == 'l1':
        modified_scores = [(index, score * coef_) for index, score in sorted_scores]
    elif method == 'l2':
        modified_scores = [(index, (score - min) * (1-base_line) /(max - min)  + base_line) for index, score in sorted_scores]
    else:
        modified_scores = sorted_scores


    return modified_scores

In [24]:
tfidf_top_k(queries[0], 10, method=None)

[(484, 0.575588808337776),
 (482, 0.4444740458546642),
 (345, 0.40059722144781185),
 (343, 0.3997860835205268),
 (344, 0.3996144398900456),
 (349, 0.3800357048669676),
 (481, 0.37554481818650987),
 (466, 0.35928067057214924),
 (341, 0.3549778912545992),
 (347, 0.34986815215968536)]

In [25]:
tfidf_top_k(queries[0], 10, method='l1')

[(484, 1.0),
 (482, 0.7722075888484458),
 (345, 0.6959781282139299),
 (343, 0.6945688966313571),
 (344, 0.694270691336197),
 (349, 0.6602555493816152),
 (481, 0.6524533012916519),
 (466, 0.6241967622853961),
 (341, 0.6167213227785442),
 (347, 0.6078439106035749)]

In [26]:
tfidf_top_k(queries[0], 10, method='l2')

[(484, 1.0),
 (482, 0.8547820515154094),
 (345, 0.8061856745269492),
 (343, 0.805287287621405),
 (344, 0.8050971813708434),
 (349, 0.7834124856117293),
 (481, 0.7784385426455676),
 (466, 0.7604249635055969),
 (341, 0.755659361422025),
 (347, 0.75)]

### Fine Tune

In [27]:
# release the memory of GPU

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Thu Dec 30 18:46:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    65W / 149W |    963MiB / 11441MiB |     86%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
# https://www.sbert.net/docs/training/overview.html
# train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
#     InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]

train_examples = []

for key in tqdm(evaluation_data.keys()):
    # Most Relevent Document 
    doc_tfidf_scores = tfidf_top_k(key, 10)
    # make train examples joint query and top documents returned by tidf
    for doc_number, score in doc_tfidf_scores:
        train_examples.append(InputExample(texts=[corpus[doc_number], key], label=float(score)))



100%|██████████| 150/150 [08:00<00:00,  3.21s/it]


In [29]:
# to solve the problem of out of memory gpu, we create batches of train_example
train_batch = [train_examples[index:index+5] for index in range(0, len(train_examples), 5)]

In [30]:
#Define your train dataset, the dataloader and the train loss
for batch in tqdm(train_batch):
    train_dataloader = DataLoader(batch, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    #Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100, show_progress_bar=False)

100%|██████████| 300/300 [27:30<00:00,  5.50s/it]


In [31]:
# compute corpus embeddings with fine tuned model
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

In [32]:
# how to save the model:

# model.save('10_k_tfidf_l2_bert_tuned')
# ! zip -r 10_k_tfidf_l2_bert_tuned.zip 10_k_tfidf_l2_bert_tuned

# how to save and load the embedings:

# torch.save(corpus_embeddings, 'corpus_embeddings.pt')
# corpus_embeddings = torch.load('corpus_embeddings.pt')

### Evaluation

In [33]:
mrr_score = MMR_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k=100, alpha=0, beta=1)
print(f"\n {mrr_score}")

100%|██████████| 150/150 [00:02<00:00, 60.01it/s]


 0.5329212480199563





In [34]:
pat10_score = PATK_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, k=10, top_k=100, alpha=0, beta=1)
print(f"\n {pat10_score}")

100%|██████████| 150/150 [00:01<00:00, 81.04it/s]


 0.42





In [35]:
map_score = MAP_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k=100, alpha=0, beta=1)
print(f"\n {map_score}")

100%|██████████| 150/150 [00:01<00:00, 79.44it/s]


 0.6919006367351903



