In [1]:
import glob
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import models, SentenceTransformer, util, CrossEncoder

# **Download all models**

In [2]:
# Instantiate every model once up front so that its weights are fetched into
# the local cache before the experiments below need them. The instances
# themselves are throwaway; later cells build their own.
for loader, model_name in (
    (SentenceTransformer, 'HooshvareLab/bert-fa-zwnj-base'),
    (CrossEncoder, 'm3hrdadfi/bert-fa-base-uncased-wikinli'),
):
    _ = loader(model_name)
    print('{} downloaded.'.format(model_name))


Some weights of the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transformers\HooshvareLab_bert-fa-zwnj-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transfo

HooshvareLab/bert-fa-zwnj-base downloaded.
m3hrdadfi/bert-fa-base-uncased-wikinli downloaded.


# **Reading data**

In [4]:
# Build the search corpus: every sufficiently long line of every text file
# under ./corpus, truncated to the first MAX_LINES entries to keep the
# experiments fast.
MAX_LINES = 100

data = []

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort
# them so the truncated corpus is the same on every machine and every run.
paths = sorted(glob.glob('./corpus/*.txt'))
for path in paths:
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    # The file is fully read at this point, so filtering happens after it
    # has been closed.
    for line in tqdm(lines):
        # readlines() keeps the trailing newline, so it counts towards the
        # length here; near-empty lines are dropped.
        if len(line) < 5:
            continue
        data.append(line)

data = data[:MAX_LINES]

100%|██████████| 10228/10228 [00:00<00:00, 244422.59it/s]


# **Vanilla BERT**

In [49]:
# Bi-encoder used to embed both the corpus and the queries
encoder = SentenceTransformer('HooshvareLab/bert-fa-zwnj-base')

# Embed the whole corpus once; the query cell below reuses these tensors
corpus_embeddings = encoder.encode(
    data,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Some weights of the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transformers\HooshvareLab_bert-fa-zwnj-base were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transfo

In [40]:
# Rank every corpus line by cosine similarity to the query embedding and
# print the best matches.
query = 'شراب'
top_k = 10

query_embedding = encoder.encode(query, convert_to_tensor=True)
# pytorch_cos_sim returns a (queries x corpus) matrix; row 0 is our single query
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0, :]
cos_scores = cos_scores.cpu().detach().numpy()
# argsort is ascending, so reverse it to get the most similar lines first
most_relevant = np.argsort(cos_scores)[::-1]

# Look results up in `data`, the list the embeddings were computed from
# (the previous `hems` name is not defined anywhere in this notebook).
# Slicing instead of range(10) keeps this safe for corpora with < top_k lines.
for rank, corpus_idx in enumerate(most_relevant[:top_k], start=1):
    print('Result {}:'.format(rank))
    print('\t', data[corpus_idx])
    print('-------------------------------------------')

Result 1:
	 ده روزه مهر گردون افسانه است و افسون

-------------------------------------------
Result 2:
	 ای صاحب کرامت شکرانه سلامت

-------------------------------------------
Result 3:
	 فغان کاین لولیان شوخ شیرین کار شهرآشوب

-------------------------------------------
Result 4:
	 کجاست دیر مغان و شراب ناب کجا

-------------------------------------------
Result 5:
	 نصیحت گوش کن جانا که از جان دوست تر دارند

-------------------------------------------
Result 6:
	 جوانان سعادتمند پند پیر دانا را

-------------------------------------------
Result 7:
	 ز عشق ناتمام ما جمال یار مستغنی است

-------------------------------------------
Result 8:
	 بشد که یاد خوشش باد روزگار وصال

-------------------------------------------
Result 9:
	 ای شیخ پاکدامن معذور دار ما را

-------------------------------------------
Result 10:
	 جواب تلخ می زیبد لب لعل شکرخا را

-------------------------------------------


# **BERT with CrossEncoder**

In [41]:
# Bi-encoder for the first-stage retrieval (same checkpoint as the
# Vanilla BERT section, rebuilt here so this section runs on its own)
encoder = SentenceTransformer('HooshvareLab/bert-fa-zwnj-base')

# Embed every corpus line once for the retrieval step below
corpus_embeddings = encoder.encode(
    data,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Some weights of the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transformers\HooshvareLab_bert-fa-zwnj-base were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\mozaf/.cache\torch\sentence_transfo

In [54]:
# Two-stage search: retrieve candidates with the bi-encoder, then re-rank
# them with a cross-encoder that scores each (query, line) pair jointly.
query = 'شراب'
top_k = 10

# Stage 1: nearest neighbours of the query in embedding space
query_embedding = encoder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
hits = hits[0]  # one result list per query; we only have a single query

# Stage 2: score every retrieved candidate against the query
cross_encoder = CrossEncoder('m3hrdadfi/bert-fa-base-uncased-wikinli')
cross_inp = [[query, data[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)
for hit, scores in zip(hits, cross_scores):
    # NOTE(review): column 1 is used as the relevance score — presumably the
    # "similar"/entailment output of this NLI model; confirm against the
    # model's label mapping.
    hit['cross-score'] = scores[1]
re_ranked = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

# Iterate over the hits that actually exist: semantic_search can return fewer
# than top_k entries, in which case indexing with range(10) would fail.
for rank, hit in enumerate(re_ranked, start=1):
    print('Result {}:'.format(rank))
    print('\t', data[hit['corpus_id']])
    print('-------------------------------------------')

[{'corpus_id': 34, 'score': 0.5476308465003967, 'cross-score': 0.9781284}, {'corpus_id': 17, 'score': 0.546454668045044, 'cross-score': 0.9260603}, {'corpus_id': 26, 'score': 0.5148657560348511, 'cross-score': 0.89803964}, {'corpus_id': 68, 'score': 0.5676670074462891, 'cross-score': -3.099187}, {'corpus_id': 89, 'score': 0.5120704174041748, 'cross-score': -3.2056913}, {'corpus_id': 72, 'score': 0.5576296448707581, 'cross-score': -3.528982}, {'corpus_id': 36, 'score': 0.5159090161323547, 'cross-score': -3.5747998}, {'corpus_id': 41, 'score': 0.5111278295516968, 'cross-score': -4.1192136}, {'corpus_id': 42, 'score': 0.5260371565818787, 'cross-score': -4.1353583}, {'corpus_id': 43, 'score': 0.521490216255188, 'cross-score': -4.1780953}]
Result 1:
	 فغان کاین لولیان شوخ شیرین کار شهرآشوب

-------------------------------------------
Result 2:
	 کجاست دیر مغان و شراب ناب کجا

-------------------------------------------
Result 3:
	 بشد که یاد خوشش باد روزگار وصال

---------------------------

# **BERT Weighted Sum**