# Preparation

In [1]:
!pip install -qU sentence-transformers
!pip install -qU wikipedia-api
!pip install -qU hazm
!pip install -qU clean-text[gpl]
!pip install -qU emoji

[K     |████████████████████████████████| 71kB 3.2MB/s 
[K     |████████████████████████████████| 1.3MB 7.9MB/s 
[K     |████████████████████████████████| 1.1MB 48.8MB/s 
[K     |████████████████████████████████| 890kB 38.4MB/s 
[K     |████████████████████████████████| 2.9MB 50.4MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 317kB 4.2MB/s 
[K     |████████████████████████████████| 235kB 11.4MB/s 
[K     |████████████████████████████████| 1.4MB 18.6MB/s 
[?25h  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 71kB 3.6MB/s 
[K     |████████████████████████████████| 51kB 5.9MB/s 
[K     |████████████████████████████████| 245kB 18.1MB/s 
[?25h  Building whe

In [2]:
!mkdir resources
!wget -q "https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip" -P resources
!unzip -qq resources/resources-0.5.zip -d resources

In [3]:
!rm -rf /content/4ccae468eb73bf6c4f4de3075ddb5336
!rm -rf /content/preproc
!rm preprocessing.py utils.py
!mkdir -p /content/preproc
!git clone https://gist.github.com/4ccae468eb73bf6c4f4de3075ddb5336.git /content/preproc/
!mv /content/preproc/* /content/
!rm -rf /content/preproc

rm: cannot remove 'preprocessing.py': No such file or directory
rm: cannot remove 'utils.py': No such file or directory
Cloning into '/content/preproc'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 7 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (7/7), done.


In [4]:
from preprocessing import cleaning
from IPython import display

import numpy as np
import pandas as pd

import hazm
import requests
import time

import torch
from sentence_transformers import models, SentenceTransformer, util

In [5]:
def rtl_print(outputs, font_size="15px", n_to_br=False):
    """Render one or more values as right-aligned, right-to-left HTML paragraphs.

    Args:
        outputs: A single value or a list of values. Anything that is not a
            ``list`` is treated as one item.
        font_size: CSS ``font-size`` applied to every paragraph.
        n_to_br: When True, newline characters in each item are replaced by
            ``<br/>`` so line breaks survive HTML rendering.

    Note:
        Items are inserted into the markup as-is (no HTML escaping), so any
        tags they contain are rendered.
    """
    items = outputs if isinstance(outputs, list) else [outputs]
    if n_to_br:
        # Coerce to str first: non-string items (numbers, None, ...) have no
        # .replace and would otherwise raise AttributeError here, even though
        # they format fine when n_to_br is False.
        items = [str(item).replace('\n', '<br/>') for item in items]

    style = f'text-align: right; direction: rtl; margin-right: 10px; font-size: {font_size};'
    paragraphs = [f'<p style="{style}">{item}</p>' for item in items]
    display.display(display.HTML(' '.join(paragraphs)))

    
def load_st_model(model_name_or_path):
    """Assemble a SentenceTransformer that mean-pools a transformer's token embeddings.

    Args:
        model_name_or_path: Identifier or path handed to ``models.Transformer``.

    Returns:
        A ``SentenceTransformer`` made of two modules: the transformer and a
        pooling layer configured for mean pooling.
    """
    transformer = models.Transformer(model_name_or_path)
    embedding_dim = transformer.get_word_embedding_dimension()

    # Only mean pooling over tokens is enabled; CLS and max pooling are off.
    pooling_options = {
        'pooling_mode_mean_tokens': True,
        'pooling_mode_cls_token': False,
        'pooling_mode_max_tokens': False,
    }
    mean_pooling = models.Pooling(embedding_dim, **pooling_options)

    return SentenceTransformer(modules=[transformer, mean_pooling])

# Semantic Search

In [7]:
# Corpus with example sentences (Persian). These are the candidates that every
# query is ranked against in the model sections below.
corpus = [
    'مردی در حال خوردن خوراک است.',
    'مردی در حال خوردن یک تکه نان است.',
    'دختری بچه ای را حمل می کند.',
    'یک مرد سوار بر اسب است.',
    'زنی در حال نواختن پیانو است.',
    'دو مرد گاری ها را به داخل جنگل هل دادند.',
    'مردی در حال سواری بر اسب سفید در مزرعه است.',
    'میمونی در حال نواختن طبل است.',
    'یوزپلنگ به دنبال شکار خود در حال دویدن است.',
]

# Query sentences: each one is embedded and compared with the whole corpus.
queries = [
    "مردی در حال خوردن پاستا است.",
    "کسی با لباس گوریل مشغول نواختن مجموعه ای از طبل است.",
    # NOTE(review): the first word of the next query is spelled with 'ب' while
    # the matching corpus sentence uses 'پ' -- possibly a typo; confirm whether
    # the misspelling is intentional before changing it (it affects the scores).
    "یوزبلنگ شکار خود را در یک مزرعه تعقیب می کند.",
]

# Number of highest-scoring corpus sentences requested per query (passed as k
# to torch.topk in the search cells).
top_k = 5

## BERT WikiNLI

In [8]:
# Load the Sentence-Transformer and embed the whole corpus once up front.
embedder = load_st_model('m3hrdadfi/bert-fa-base-uncased-wikinli-mean-tokens')
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

# torch.topk raises a RuntimeError when k exceeds the number of candidates,
# so never request more hits than the corpus contains.
n_hits = min(top_k, len(corpus))

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=True)
    # Cosine similarity between the query and every corpus sentence; row 0 is
    # the only row because a single query is encoded at a time.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns (values, indices) for the n_hits highest scores.
    top_scores, top_indices = torch.topk(cos_scores, k=n_hits)

    rtl_print(f'Query: {query}', '18px')

    for score, idx in zip(top_scores, top_indices):
        rtl_print(f'{corpus[idx]} --- (Score: {score:.4f})')

    rtl_print('- - ' * 50)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




## BERT WikiTriplet

In [9]:
# Load the Sentence-Transformer and embed the whole corpus once up front.
embedder = load_st_model('m3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens')
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

# torch.topk raises a RuntimeError when k exceeds the number of candidates,
# so never request more hits than the corpus contains.
n_hits = min(top_k, len(corpus))

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=True)
    # Cosine similarity between the query and every corpus sentence; row 0 is
    # the only row because a single query is encoded at a time.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns (values, indices) for the n_hits highest scores.
    top_scores, top_indices = torch.topk(cos_scores, k=n_hits)

    rtl_print(f'Query: {query}', '18px')

    for score, idx in zip(top_scores, top_indices):
        rtl_print(f'{corpus[idx]} --- (Score: {score:.4f})')

    rtl_print('- - ' * 50)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=519.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651449935.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=348.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




## BERT FarsTail

In [10]:
# Load the Sentence-Transformer and embed the whole corpus once up front.
embedder = load_st_model('m3hrdadfi/bert-fa-base-uncased-farstail-mean-tokens')
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

# torch.topk raises a RuntimeError when k exceeds the number of candidates,
# so never request more hits than the corpus contains.
n_hits = min(top_k, len(corpus))

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=True)
    # Cosine similarity between the query and every corpus sentence; row 0 is
    # the only row because a single query is encoded at a time.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns (values, indices) for the n_hits highest scores.
    top_scores, top_indices = torch.topk(cos_scores, k=n_hits)

    rtl_print(f'Query: {query}', '18px')

    for score, idx in zip(top_scores, top_indices):
        rtl_print(f'{corpus[idx]} --- (Score: {score:.4f})')

    rtl_print('- - ' * 50)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=519.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651450094.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=222.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…


