# BERT and Creative Approach

### Libraries

In [1]:
# Install the libraries this notebook needs (required on a fresh Colab runtime).
# NOTE(review): no versions are pinned, so a later run may install different
# releases than the ones this notebook was originally executed with.
!pip install transformers
!pip install -qU sentence-transformers
!pip install -qU hazm

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 457 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
import codecs
import re
from IPython import display
from tqdm import tqdm
import numpy as np
import pandas as pd

import hazm
import requests
import time
import yaml
import numpy as np
import torch
from sentence_transformers import models, SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader


In [25]:
# If you are using Google Colab, create the utils folder and upload both
# evaluation.ipynb and preprocess.ipynb into it before running this cell.
# ! mkdir utils
# Running the helper notebooks defines their names in this kernel. Later cells
# call clean_text, mean_reciprocal_rank, precision_at_k and
# mean_average_precision, none of which are defined in this notebook
# (presumably they come from these two files -- confirm).
%run "./utils/evaluation.ipynb"
%run "./utils/preprocess.ipynb"

### Download Data

In [None]:
# Fetch the evaluation queries (YAML) and the zipped document collection into
# ./data, then extract the archive there.
! wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml -P ./data
! wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip -P ./data
! unzip ./data/doc_collection.zip -d ./data

In [6]:
# Locations of the extracted document collection and of the evaluation file.
PATH = './data/IR_dataset/'
YAML_PATH = './data/evaluation_IR.yml/'

# Strip trailing slashes so the values can be used directly as path prefixes.
PATH = PATH.rstrip('/')
YAML_PATH = YAML_PATH.rstrip('/')

# The collection is read from the files 0.txt ... 3257.txt.
NUM_DOCS = 3258

# docs[i] must hold the text of file i: the evaluation cells compare the
# document numbers stored in the evaluation data against positions in this
# list. Every document is therefore kept, even when cleaning leaves it empty;
# dropping one would shift the position of every later document and silently
# corrupt the evaluation.
docs = []
for index in range(NUM_DOCS):
    with open(f"{PATH}/{index}.txt", 'r', encoding='utf8') as file_reader:
        doc = file_reader.read()
    doc = clean_text(doc, sentence=True, only_persian=False)
    docs.append(doc or "")

In [7]:
# sample text
docs[0]

'برخی از هواداران مصدق یا اعضای جبهه ملی که در زمان نخست وزیری مصدق از جبهه ملی یا از هییت وزیران کنار گذاشته شده یا کنار رفتند پس از جدایی از مصدق به انتقاد از کارنامه وی پرداختند و حتی برای سرنگونی اش تلاش کردند. برخی از این افراد عبارت اند از فضل الله زاهدی نخست وزیر کودتا علی امینی حسین مکی که در آغاز سرباز فداکار وطن نامیده شد ولی در پایان به دلیل مخالفت با مصدق از سوی هواداران جبهه ملی سرباز خطاکار وطن خطاب می شد مظفر بقایی به دلیل اتهام مشارکت در قتل سرتیپ افشار طوس و سپس اتهام شرکت در کودتای مرداد به دستور فرماندار نظامی تهران بازداشت شد و تا روز مرداد در زندان بود. آیت الله کاشانی در تیر در اعتراض به استعفای مصدق تهدید به حکم جهاد کرد ولی در ماه های پایانی ضمن انتقاد شدید از کارنامه دولت از زاهدی پشتیبانی می کرد. او در اعلامیه ای به عنوان تحریم رفراندوم دولت در هشتم مرداد نوشت ابوالحسن حایری زاده در سال در مجلس ضمن مخالفت با لایحه اختیارات ویژه دولت آن را خطر بازگشت دیکتاتوری ساله دانست و سپس در سال بعد با نوشتن نامه به رییس سازمان ملل متحد مصدق را دیکتاتور خواند. این افراد عل

In [8]:
# Split documents into segments containing a bounded number of words.
def segment_documents(docs, max_doc_length=100000):
    """Cut every document into pieces of at most ``max_doc_length`` words.

    Words are obtained by splitting on single spaces. A document that does
    not exceed the limit is kept unchanged as one segment.

    Args:
        docs: Sequence of document strings.
        max_doc_length: Maximum number of space-separated words per segment.

    Returns:
        A tuple ``(segmented_docs, indices_docs)``: the list of segments and,
        for each segment, the index in ``docs`` of the document it came from.
    """
    segmented_docs, indices_docs = [], []

    for doc_index, text in enumerate(docs):
        words = text.split(" ")

        # Short enough: keep the document as a single segment.
        if len(words) <= max_doc_length:
            segmented_docs.append(text)
            indices_docs.append(doc_index)
            continue

        # Too long: emit consecutive windows of max_doc_length words.
        for start in range(0, len(words), max_doc_length):
            window = words[start:start + max_doc_length]
            segmented_docs.append(" ".join(window))
            indices_docs.append(doc_index)

    return segmented_docs, indices_docs

In [9]:
# Build a SentenceTransformer from a transformer checkpoint plus mean pooling.
def load_st_model(model_name_or_path):
    """Wrap a transformer checkpoint as a SentenceTransformer with mean pooling.

    Args:
        model_name_or_path: Name or path passed to ``models.Transformer``.

    Returns:
        A ``SentenceTransformer`` made of the transformer module followed by a
        pooling module with only mean-token pooling enabled.
    """
    transformer = models.Transformer(model_name_or_path)
    embedding_dim = transformer.get_word_embedding_dimension()

    # Only mean pooling over tokens is switched on; CLS and max pooling are off.
    mean_pooling = models.Pooling(
        embedding_dim,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        pooling_mode_mean_tokens=True,
    )

    return SentenceTransformer(modules=[transformer, mean_pooling])

### Load Segmented Documents, Queries, and Model Paths

In [10]:
# Segment the documents; the second argument is the maximum number of words
# allowed per segment. ind_s maps every segment back to its document index.
corpus, ind_s = segment_documents(docs, 100000)
print(f"number of segmented documents: {len(corpus)}")

# Load the evaluation data: a mapping whose keys are the query strings.
# The file is read explicitly as UTF-8 (like the documents) instead of relying
# on the platform default encoding. A malformed file is allowed to raise:
# carrying on with an empty mapping would make every later cell meaningless.
with open(YAML_PATH, "r", encoding="utf8") as stream:
    evaluation_data = dict(yaml.safe_load(stream))

# query sentences:
queries = list(evaluation_data.keys())

# Candidate checkpoints, in list order:
#   0: DistilBERT
#   1: BERT WikiNLI
#   2: BERT WikiTriplet
#   3: BERT FarsTail
model_paths= ['HooshvareLab/distilbert-fa-zwnj-base',
              'm3hrdadfi/bert-fa-base-uncased-wikinli-mean-tokens',
              'm3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens',
              'm3hrdadfi/bert-fa-base-uncased-farstail-mean-tokens']


number of segmented documents: 3258


### Load Model 

In [11]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [77]:
# Load the Sentence-Transformer
# The first entry of model_paths (the distilbert-fa-zwnj-base checkpoint) is
# used as the model that gets fine-tuned and evaluated below.
model = load_st_model(model_paths[0])

Some weights of the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [78]:
# release the memory of GPU
import torch, gc

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Wed Dec 29 21:35:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    82W / 149W |   1581MiB / 11441MiB |     59%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [79]:
### New Idea, Fine tune


In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF settings: word-level unigrams and bigrams, ignoring terms that occur
# in more than half of the documents, with the vocabulary limited to 10000 features.
min_df, max_df = 1, 0.5
max_features = 10000
ngram_range = (1, 2)

model_tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
)

# Fit the vectorizer on the corpus and keep the resulting document-term matrix.
matrix = model_tfidf.fit_transform(corpus)


In [81]:
def tfidf_top_k(query, k=2, method='l1', base_line = 0.75):
    """Rank the corpus documents against ``query`` by TF-IDF cosine similarity.

    Relies on the notebook globals ``model_tfidf`` (the fitted vectorizer) and
    ``matrix`` (the TF-IDF matrix of the corpus).

    Args:
        query: Query text.
        k: Number of best-scoring documents to keep; ``-1`` keeps all of them.
        method: How the kept scores are rescaled.
            ``'l1'`` divides every score by the best score, so the best
            document gets 1.0. If the best score is 0 the scores are returned
            unchanged.
            ``'l2'`` min-max scales the kept scores into ``[base_line, 1.0]``.
            If all kept scores are equal, each becomes 1.0 when that score is
            positive and 0.0 otherwise.
            Any other value returns the raw cosine similarities.
        base_line: Value assigned by ``'l2'`` to the lowest kept score.

    Returns:
        A list of ``(document_index, score)`` tuples, best first. The list is
        empty when no document is kept (empty corpus or ``k == 0``).
    """
    query_tfidf = model_tfidf.transform([query])

    # Score the query against every document in a single call instead of one
    # cosine_similarity call per row of the matrix.
    doc_scores = cosine_similarity(query_tfidf, matrix)[0]

    # Sort by score, best first; enumerate() carries each document's index along.
    sorted_scores = sorted(enumerate(doc_scores), key=lambda ind_score: ind_score[1], reverse=True)

    if k != -1:
        sorted_scores = sorted_scores[:k]

    # Nothing was kept, so there is nothing to rescale.
    if not sorted_scores:
        return []

    # Named so that the built-ins min() and max() are not shadowed.
    highest = sorted_scores[0][1]
    lowest = sorted_scores[-1][1]

    if method == 'l1':
        if highest == 0:
            # Every kept score is 0; dividing by the best score would not
            # produce finite values.
            return sorted_scores
        coef_ = 1 / highest
        return [(index, score * coef_) for index, score in sorted_scores]

    if method == 'l2':
        # normalize with baseline + (score - min)/(max - min)[in range 0:1-baseline]
        spread = highest - lowest
        if spread == 0:
            # All kept scores are equal, so min-max scaling is undefined (0/0).
            flat_score = 1.0 if highest > 0 else 0.0
            return [(index, flat_score) for index, _ in sorted_scores]
        return [(index, (score - lowest) * (1 - base_line) / spread + base_line)
                for index, score in sorted_scores]

    return sorted_scores

# Example: the 10 best TF-IDF matches for the first query, rescaled with method 'l2'.
tfidf_top_k(queries[0], 10, method='l2')

[(484, 1.0),
 (482, 0.8547820515154094),
 (345, 0.8061856745269492),
 (343, 0.805287287621405),
 (344, 0.8050971813708434),
 (349, 0.7834124856117293),
 (481, 0.7784385426455676),
 (466, 0.7604249635055969),
 (341, 0.755659361422025),
 (347, 0.75)]

### Fine Tune

In [82]:

# Build (document, query, label) training pairs from TF-IDF rankings.
# InputExample usage, for reference:
#   InputExample(texts=['My first sentence', 'My second sentence'], label=0.8)
#   InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)
#
# Disabled alternative: one pair per query, using the first entry of the
# query's 'relevant' list as the document and 1.0 as the label.
#
# NOTE(review): the queries are the keys of evaluation_data, the same mapping
# the metric cells evaluate on, so the reported scores are not held-out.
# NOTE(review): tfidf_top_k is called with its default method ('l1') although
# the model is later saved under a name containing '_l2_' -- confirm which
# rescaling was intended.
train_examples = []
for key in tqdm(evaluation_data.keys()):
    # Pair the query with each of its 10 best TF-IDF documents; the rescaled
    # TF-IDF score is the similarity label.
    train_examples.extend(
        InputExample(texts=[corpus[doc_number], key], label=float(score))
        for doc_number, score in tfidf_top_k(key, 10)
    )



100%|██████████| 150/150 [08:35<00:00,  3.43s/it]


In [83]:
# Cut the training pairs into consecutive groups of 5 examples.
train_batch = [train_examples[index:index+5] for index in range(0, len(train_examples), 5)]

In [84]:
# Fine-tune the model on one group of training pairs at a time.
# The loss object only wraps `model`, so it is created once and shared by all
# fit calls.
train_loss = losses.CosineSimilarityLoss(model)

# NOTE(review): every group holds at most 5 pairs, so with batch_size=16 each
# epoch is a single step and each fit call runs 10 steps in total -- fewer
# than warmup_steps=100. Confirm this schedule is intended; fitting once on a
# DataLoader over all of train_examples would be the more common setup.
for batch in tqdm(train_batch):
    train_dataloader = DataLoader(batch, shuffle=True, batch_size=16)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=10,
        warmup_steps=100,
        show_progress_bar=False,
    )

100%|██████████| 300/300 [28:56<00:00,  5.79s/it]


In [85]:
# Embed every corpus segment with the model; the result is kept as a tensor
# so it can be compared against query embeddings in the cells below.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

In [86]:
# Old Codes:

In [87]:
# Number of best-scoring corpus entries kept per query by the following cells.
top_k = 100

# For every query: embed it, compare it with all corpus embeddings and keep
# the resulting similarity vector on the CPU.
result = [
    util.pytorch_cos_sim(
        model.encode(query, convert_to_tensor=True), corpus_embeddings
    )[0].cpu()
    for query in queries
]

In [88]:
# For every query keep the top_k highest similarity scores together with the
# positions of the corresponding corpus entries. torch.topk is evaluated once
# per query and both of its outputs are reused.
top_results_indices = []
top_results_values = []
for cos_score in result:
    top_hits = torch.topk(cos_score, k=top_k)
    top_results_indices.append(top_hits.indices.numpy())
    top_results_values.append(top_hits.values.numpy())

In [89]:
# One 0/1 vector per query over its ranked results, with 1 wherever the
# retrieved index equals the first entry of the query's 'relevant' list.
mrr = [
    list((top_results_indices[i] == evaluation_data[key]['relevant'][0]).astype(int))
    for i, key in enumerate(evaluation_data.keys())
]

print(mean_reciprocal_rank(mrr))

0.5329212480199563


In [90]:
# precision_at_k with k=10 per query; a retrieved index counts as a hit when
# it appears in the query's 'relevant' or 'similar_high' list.
patk = []
for i, key in enumerate(evaluation_data.keys()):
    judged = evaluation_data[key]['relevant'] + evaluation_data[key]['similar_high']
    hits = np.isin(top_results_indices[i], judged).astype(int)
    patk.append(precision_at_k(hits, 10))

# Average over all queries.
sum(patk)/len(patk)

0.42

In [91]:
# Hit vectors over the first p results of every query; a retrieved index is a
# hit when it appears in the query's 'relevant' or 'similar_high' list.
p = 20
map_i = []
for i, key in enumerate(evaluation_data.keys()):
    judged = evaluation_data[key]['relevant'] + evaluation_data[key]['similar_high']
    first_p_hits = np.isin(top_results_indices[i][:p], judged).astype(int)
    map_i.append(list(first_p_hits))

print(mean_average_precision(map_i))

0.6919006367351903


In [92]:
# Write the fine-tuned model to a local folder.
model.save('10_k_tfidf_l2_bert_tuned')

In [93]:
# Pack the saved model folder into a single archive.
! zip -r 10_k_tfidf_l2_bert_tuned.zip 10_k_tfidf_l2_bert_tuned

  adding: 10_k_tfidf_l2_bert_tuned/ (stored 0%)
  adding: 10_k_tfidf_l2_bert_tuned/special_tokens_map.json (deflated 40%)
  adding: 10_k_tfidf_l2_bert_tuned/config.json (deflated 43%)
  adding: 10_k_tfidf_l2_bert_tuned/README.md (deflated 58%)
  adding: 10_k_tfidf_l2_bert_tuned/1_Pooling/ (stored 0%)
  adding: 10_k_tfidf_l2_bert_tuned/1_Pooling/config.json (deflated 47%)
  adding: 10_k_tfidf_l2_bert_tuned/pytorch_model.bin (deflated 7%)
  adding: 10_k_tfidf_l2_bert_tuned/tokenizer_config.json (deflated 38%)
  adding: 10_k_tfidf_l2_bert_tuned/sentence_bert_config.json (deflated 4%)
  adding: 10_k_tfidf_l2_bert_tuned/vocab.txt (deflated 60%)
  adding: 10_k_tfidf_l2_bert_tuned/modules.json (deflated 53%)
  adding: 10_k_tfidf_l2_bert_tuned/tokenizer.json (deflated 63%)
  adding: 10_k_tfidf_l2_bert_tuned/config_sentence_transformers.json (deflated 27%)


In [53]:
# List the working directory, including file sizes.
! ls -ahl

total 267M
drwxr-xr-x 1 root root 4.0K Dec 29 19:42 .
drwxr-xr-x 1 root root 4.0K Dec 29 18:21 ..
drwxr-xr-x 3 root root 4.0K Dec 29 19:38 10_k_tfidf_bert_tuned
-rw-r--r-- 1 root root 267M Dec 29 19:42 10_k_tfidf_bert_tuned.zip
drwxr-xr-x 4 root root 4.0K Dec  3 14:33 .config
drwxr-xr-x 3 root root 4.0K Dec 29 18:23 data
drwxr-xr-x 1 root root 4.0K Dec  3 14:33 sample_data
drwxr-xr-x 2 root root 4.0K Dec 29 18:34 utils


In [54]:
# Mount Google Drive at /content/drive (only available on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [55]:
! mv /content/10_k_tfidf_bert_tuned.zip /content/drive/

mv: cannot create regular file '/content/drive/10_k_tfidf_bert_tuned.zip': Operation not supported


In [None]:
# to save the embeddings
# torch.save(corpus_embeddings, 'file.pt')
# corpus_embeddings = torch.load('file.pt')

In [30]:
def bert_top_k(model, corpus_embeddings, query, ind_s, top_k, alpha=0, beta=1):
    """Rank documents for ``query`` using the embeddings of their segments.

    The ``top_k`` segments most similar to the query are retrieved and grouped
    by the document they belong to. Each document is scored as
    ``alpha * level + beta * mean``, where ``level`` is the sum of ``1 / rank``
    over its retrieved segments (rank 1 is the most similar segment) and
    ``mean`` is the average similarity of those segments.

    Args:
        model: Object whose ``encode`` method embeds the query as a tensor.
        corpus_embeddings: Segment embeddings, compared with the query
            embedding through ``util.pytorch_cos_sim``.
        query: Query text.
        ind_s: For every segment position, the index of its source document.
        top_k: Number of segments to retrieve; capped at the number of
            segments available.
        alpha: Weight of the rank-based ``level`` term.
        beta: Weight of the mean-similarity term.

    Returns:
        A numpy array of document indices, highest combined score first.
        Documents with equal scores keep the order of their best segment.
    """
    query_embedding = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0].cpu()

    # torch.topk raises when k exceeds the number of scores, so cap it.
    k = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=k)
    top_scores = top_results.values.tolist()

    # Translate segment positions into the indices of their source documents.
    top_real_indices = np.array(ind_s)[top_results.indices.numpy()]

    # Accumulate per-document statistics in rank order (best segment first).
    indices_info = {}
    for rank, (score, doc_index) in enumerate(zip(top_scores, top_real_indices), start=1):
        info = indices_info.setdefault(doc_index, {'number': 0, 'sum': 0.0, 'level': 0.0})
        info['number'] += 1
        info['sum'] += score
        info['level'] += 1 / rank

    joint_score = [
        (doc_index, alpha * info['level'] + beta * (info['sum'] / info['number']))
        for doc_index, info in indices_info.items()
    ]

    # sorted() is stable, so ties keep the insertion (best-segment-first) order.
    joint_score_sorted = sorted(joint_score, key=lambda pair: pair[1], reverse=True)

    return np.array([doc_index for doc_index, _ in joint_score_sorted])

In [94]:
def MMR_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k, alpha, beta):
    """Score the document ranking produced by ``bert_top_k`` over all queries.

    For every query the ranked document indices are turned into a 0/1 vector
    that marks where the first entry of the query's ``'relevant'`` list
    appears. The vectors are passed to ``mean_reciprocal_rank``.

    Args:
        model, corpus_embeddings, ind_s, top_k, alpha, beta: Forwarded
            unchanged to ``bert_top_k``.
        queries: Iterable of query strings; each must be a key of
            ``evaluation_data``.
        evaluation_data: Mapping from query to a dict with a ``'relevant'`` list.

    Returns:
        The value returned by ``mean_reciprocal_rank`` for the collected vectors.
    """
    hit_vectors = []

    for query in tqdm(queries):
        ranked_docs = bert_top_k(
            model, corpus_embeddings, query, ind_s,
            top_k=top_k, alpha=alpha, beta=beta,
        )
        target_doc = evaluation_data[query]['relevant'][0]
        is_target = (np.array(ranked_docs) == target_doc).astype(int)
        hit_vectors.append(list(is_target))

    return mean_reciprocal_rank(hit_vectors)

In [95]:
# Evaluate the fine-tuned model; with alpha=0 and beta=1 documents are ranked
# purely by the mean similarity of their retrieved segments.
mrr_score = MMR_bert(model, corpus_embeddings, queries, ind_s, evaluation_data, top_k=100, alpha=0, beta=1)
mrr_score

100%|██████████| 150/150 [00:03<00:00, 38.22it/s]


0.5329212480199563