# Preparation

In [1]:
!pip install -qU sentence-transformers
!pip install -qU wikipedia-api
!pip install -qU hazm
!pip install -qU clean-text[gpl]
!pip install -qU emoji

[K     |████████████████████████████████| 71kB 3.8MB/s 
[K     |████████████████████████████████| 1.3MB 7.6MB/s 
[K     |████████████████████████████████| 2.9MB 17.9MB/s 
[K     |████████████████████████████████| 890kB 44.2MB/s 
[K     |████████████████████████████████| 1.1MB 43.4MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 317kB 6.3MB/s 
[K     |████████████████████████████████| 235kB 15.1MB/s 
[K     |████████████████████████████████| 1.4MB 17.2MB/s 
[?25h  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 71kB 4.3MB/s 
[K     |████████████████████████████████| 51kB 6.3MB/s 
[K     |████████████████████████████████| 245kB 26.8MB/s 
[?25h  Building whe

In [2]:
!mkdir resources
!wget -q "https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip" -P resources
!unzip -qq resources/resources-0.5.zip -d resources

In [3]:
!rm -rf /content/4ccae468eb73bf6c4f4de3075ddb5336
!rm -rf /content/preproc
!rm preprocessing.py utils.py
!mkdir -p /content/preproc
!git clone https://gist.github.com/4ccae468eb73bf6c4f4de3075ddb5336.git /content/preproc/
!mv /content/preproc/* /content/
!rm -rf /content/preproc

rm: cannot remove 'preprocessing.py': No such file or directory
rm: cannot remove 'utils.py': No such file or directory
Cloning into '/content/preproc'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 7 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (7/7), done.


In [4]:
from preprocessing import cleaning
from IPython import display

import nltk
import wikipediaapi

import numpy as np
import pandas as pd

import hazm
import requests
import time

import torch
from sentence_transformers import models, SentenceTransformer, util
from sentence_transformers import CrossEncoder

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
wiki = wikipediaapi.Wikipedia('fa')

In [7]:
def rtl_print(outputs, font_size="15px", n_to_br=False):
    outputs = outputs if isinstance(outputs, list) else [outputs] 
    if n_to_br:
        outputs = [output.replace('\n', '<br/>') for output in outputs]
        
    outputs = [f'<p style="text-align: right; direction: rtl; margin-right: 10px; font-size: {font_size};">{output}</p>' for output in outputs]
    display.display(display.HTML(' '.join(outputs)))


FILTERD_SECTIONS = [
    # 'تاریخچه',
    'محتویات',
    'پانویس',
    'منابع',
    'منابع و پانویس',
    'جستارهای وابسته',
    'پیوند به بیرون',
    'یادداشت‌ها',
    'یادداشت‌ ها',
    'جوایز',
    'نگارخانه',
    'روابط‌خارجی',
    'روابط خارجی',
    'کتاب‌شناسی',
    'کتاب‌ شناسی',
    'فیلم‌شناسی',
    'فیلم ‌شناسی',
    'دست‌اندرکاران',
    'دست‌اندر کاران',
    'دست‌ اندر کاران',
    'فروش‌های برگزیدهٔ آلبوم',
    'فروش‌ های برگزیدهٔ آلبوم',
    'فروش‌ های برگزیده آلبوم',
    'نمودارهای فروش',
    'نمودار های فروش',
    'فهرست آهنگ‌ها',
    'فهرست آهنگ‌ ها',
    'اعضا',
    'ترانه‌شناسی',
    'ترانه‌ شناسی',
    'نگارخانه',
    'بازیگران',
    'پروژه‌های مشابه',
    'پروژه‌ های مشابه'
]
FILTERD_SECTIONS = [t.strip() for t in list(set(FILTERD_SECTIONS))]

def get_sections(sections, level=0, retrieved=None, keep_title=False):
    retrieved = retrieved if isinstance(retrieved, list) else []

    for s in sections:
        if s.title not in FILTERD_SECTIONS:
            if keep_title:
                text = s.title + '\n' + s.text
            else:
                text = s.text
            
            retrieved.append(text.replace('\n', '[n]'))
            get_sections(s.sections, level=level + 1, retrieved=retrieved, keep_title=keep_title)
    
    return '[nn]'.join(retrieved)

def wiki_document(page_name, make_clean=True, verbose=False, keep_title=True, keep_new_lines=True, n1_rp='\n', n2_rp='\n\n'):
    page_py = wiki.page(page_name)
    document = ''

    if page_py.exists():
        document = []
        document = get_sections(page_py.sections, level=0, retrieved=document, keep_title=keep_title)
        document = cleaning(document) if make_clean else document
        document = document.replace('[nn]', n2_rp).replace('[n]', n1_rp) if keep_new_lines else docuemnt 

        if verbose:
            rtl_print("Page - Title: %s" % page_py.title)
            rtl_print("%s" % requests.utils.unquote(page_py.fullurl))
            rtl_print("Page - Article [1000]: %s ..." % cleaning(document[:1000]))
    
    return document


def load_st_model(model_name_or_path):
    word_embedding_model = models.Transformer(model_name_or_path)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    return model

# Information Retrieval

In [8]:
document = wiki_document(
    'کوروش_بزرگ', 
    n1_rp=' ',
    n2_rp='\n\n',
    make_clean=True, 
    verbose=True, 
    keep_title=False, 
    keep_new_lines=True)

In [9]:
paragraphs = []
for paragraph in document.replace("\r\n", "\n").split("\n\n"):
    if len(paragraph.strip()) > 0:
        sentences = hazm.sent_tokenize(paragraph.strip())
        paragraphs.append(sentences)

window_size = 3
passages = []
for paragraph in paragraphs:
    for start_idx in range(0, len(paragraph), window_size):
        end_idx = min(start_idx + window_size, len(paragraph))
        passages.append(" ".join(paragraph[start_idx:end_idx]))


rtl_print(f"Paragraphs: {len(paragraphs)}")
rtl_print(f"Sentences: {sum([len(p) for p in paragraphs])}")
rtl_print(f"Passages: {len(passages)}")

queries = [
    'اولین حکمران شهر بابل کی بود؟',
    'در فصل زمستان چه اتفاقی افتاد؟',
    'کوروش چگونه شخصیتی بود؟'
]

## BERT WikiNLI

In [10]:
model = CrossEncoder('m3hrdadfi/bert-fa-base-uncased-wikinli')

for query in queries:
    start_time = time.time()

    # Concatenate the query and all passages and predict the scores for the pairs [query, passage]
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(
        model_inputs, 
        show_progress_bar=True)

    # Sort the scores in decreasing order
    results = [{'input': inp, 'score': score} for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'][-1], reverse=True)

    rtl_print(f"Query: {query}")
    rtl_print("Search took {:.2f} seconds".format(time.time() - start_time))
    for hit in results[:3]:
        rtl_print(f"+‌ {hit['input'][-1]} (Score {hit['score'][-1]:.2f})")


    rtl_print(' - - ' * 50)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=705.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=324.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651458839.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.





HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…




## BERT FarsTail

In [11]:
model = CrossEncoder('m3hrdadfi/bert-fa-base-uncased-farstail')

for query in queries:
    start_time = time.time()

    # Concatenate the query and all passages and predict the scores for the pairs [query, passage]
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(
        model_inputs, 
        show_progress_bar=True)

    # Sort the scores in decreasing order
    results = [{'input': inp, 'score': score} for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'][-1], reverse=True)

    rtl_print(f"Query: {query}")
    rtl_print("Search took {:.2f} seconds".format(time.time() - start_time))
    for hit in results[:3]:
        rtl_print(f"+‌ {hit['input'][-1]} (Score {hit['score'][-1]:.2f})")


    rtl_print(' - - ' * 50)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=744.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=324.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651461911.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.





HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=6.0, style=ProgressStyle(description_width=…


