# Article Retrieval Project

In [None]:
!pip install -U sentence-transformers



In [None]:
import json,glob,nltk,copy,torch,time,sentence_transformers,pickle
import numpy as np
from scipy import spatial
from queue import PriorityQueue
from sentence_transformers import SentenceTransformer,util
from transformers import AutoModelForMaskedLM
from sklearn.metrics.pairwise import cosine_similarity
from IPython import display
nltk.download('punkt')
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


 Hazm is a natural language processing library for the Persian language

In [None]:
!pip install hazm



### GPU Cuda

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device available for running: {device}")

Device available for running: cpu


In [None]:
# Load the article dataset and normalise its text fields.
DATASET_PATH = '/content/drive/MyDrive/nasi_mashi/IR Final Project/dataset.json'

# Open explicitly as UTF-8 so the Persian text is decoded the same way on every platform,
# instead of depending on the interpreter's default encoding.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    jsonData = json.load(f)

# One dict per article with lower-case keys. Every zero-width non-joiner (U+200C) is replaced
# by a plain space in all four text fields.
data = [
    {
        "title": line['Title'].replace("\u200c", " "),
        "abstract": line['Abstract'].replace("\u200c", " "),
        "keywords": line['Keyword'].replace("\u200c", " "),
        "text": line['Text'].replace("\u200c", " "),
    }
    for line in jsonData
]

In [None]:
# Split each article's abstract and body into sentences, and its keyword string into keywords.
# The isinstance guards make this cell safe to re-run: a field that has already been turned
# into a list is left untouched instead of being handed to the tokenizer a second time.
for article in data:
    if isinstance(article['abstract'], str):
        article['abstract'] = nltk.sent_tokenize(article['abstract'])
    if isinstance(article['text'], str):
        article['text'] = nltk.sent_tokenize(article['text'])
    if isinstance(article['keywords'], str):
        # Keywords are separated by the Persian comma. Splitting on the bare comma and
        # stripping tolerates missing or extra spaces around it; empty entries are dropped
        # so that no blank "sentence" reaches the later stages.
        article['keywords'] = [
            keyword.strip()
            for keyword in article['keywords'].split('،')
            if keyword.strip()
        ]

In [None]:
# Build one list per article (same order as `data`).
# flatten_data[i] holds every text unit of article i in a fixed order:
# abstract sentences first, then body sentences, then keywords.
flatten_data = [
    [*article['abstract'], *article['text'], *article['keywords']]
    for article in data
]

In [None]:
# For instance, this shows every entry collected for the first article in the JSON file.
flatten_data[0]

['هدف متناسب سازی نظریه های علوم اجتماعی با فضای بسترهای رسانه ای اجتماعی، موجب ارائه تحلیل های دقیق تری برای درک چرایی و چگونگی تغییر رفتار کاربران بسترهای رسانه ای می شود.',
 'هدف پژوهش حاضر، استخراج الگوهای موجود در محتوای تولید شده توسط کاربران توییتر بود که برای تحقق آن از نظریه تمرکز نظارتی بهره برده شده است.',
 'روش: این پژوهش از روش های متن کاوی استفاده کرده و جامعه آماری آن، شامل متن های انتشار یافته در توییتر طی بازه ی زمانی تیرماه ۱۳۹۹ تا تیرماه ۱۴۰۰ بوده است.',
 'داده های مورد استفاده در این پژوهش با تمرکز بر کلیدواژه های مرتبط با موضوع حجاب گردآوری شده است.',
 'یافته ها: داده های استخراج شده، تعداد ۳۳۱۹۲ توییت بود که پس از پاکسازی، ۱۹۶۷ توییت نمونه بر مبنای نظریه تمرکز نظارتی برچسب گذاری شد.',
 'در این پژوهش، توییت های نمونه با توجه به پیش فرض مثبت یا منفی، به دو دسته توییت های پیشبردی و اجتنابی دسته بندی شدند.',
 'بر این اساس، تعداد ۸۱۵ توییت پیشبردی و ۱۱۵۲ توییت اجتنابی شناسایی شد.',
 'سپس توییت های پیشبردی در دو دسته توییت های برخورداری و محرومیتی و توییت های اجتنابی در

In [None]:
# Evaluation queries.
# They are normalised before use: the zero-width non-joiner becomes a space (the same
# replacement the dataset-loading cell applies to the articles), and the Arabic code points
# for yeh (U+064A) and kaf (U+0643) are mapped to the Persian ones (U+06CC, U+06A9). Without
# this, a word typed with the Arabic letter is a different token from the same word spelled
# with the Persian letter, and token-based models cannot match the two.
_QUERY_NORMALIZATION = str.maketrans({'\u064a': '\u06cc', '\u0643': '\u06a9', '\u200c': ' '})
queries = [
    raw_query.translate(_QUERY_NORMALIZATION)
    for raw_query in [
        'آیا متن کاوي شامل مجموعه ابزار هاي هوشمندي است که براي سازماندهی اطلاعات بدون ساختار از آن استفاده میشود؟',
        'متن کاوی در کجا استفاده میشود؟',
        'رابطه انسان و فناوری چیست؟',
    ]
]

#### rtl_print

In [None]:
def rtl_print(outputs, font_size="15px", n_to_br=False):
    """Render one or more strings as right-to-left HTML paragraphs in the notebook.

    Args:
        outputs: A single value or a list of values; a non-list value is treated as a
            one-element list.
        font_size: CSS font size inserted into each paragraph's style attribute.
        n_to_br: When true, every newline in each item is replaced by ``<br/>``.
    """
    items = outputs if isinstance(outputs, list) else [outputs]

    paragraphs = []
    for item in items:
        if n_to_br:
            item = item.replace('\n', '<br/>')
        paragraphs.append(
            f'<p style="text-align: right; direction: rtl; margin-right: 10px; font-size: {font_size};">{item}</p>'
        )

    # All paragraphs are emitted as one HTML fragment, separated by single spaces.
    display.display(display.HTML(' '.join(paragraphs)))

## TF-IDF

In [None]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer

# Pool every text unit of every article into one flat list, article by article:
# abstract sentences, then body sentences, then keywords.
sentences = [
    unit
    for article in data
    for unit in (*article['abstract'], *article['text'], *article['keywords'])
]



In [None]:
# Total number of pooled entries in `sentences` (abstract sentences, body sentences and keywords).
len(sentences)

540

In [None]:
# Peek at the first pooled entry.
sentences[0]

'هدف متناسب سازی نظریه های علوم اجتماعی با فضای بسترهای رسانه ای اجتماعی، موجب ارائه تحلیل های دقیق تری برای درک چرایی و چگونگی تغییر رفتار کاربران بسترهای رسانه ای می شود.'

In [None]:
# Create a TfidfVectorizer and fit it on all pooled sentences.
vectorizer = TfidfVectorizer()

# Fitting builds the vocabulary and the IDF weights from the whole corpus.
vectorizer.fit(sentences)

# tfidf_embeddings[i] is a 2-D array with one row per text unit of article i, in the order
# abstract sentences, body sentences, keywords.
tfidf_embeddings = []
for article in data:
    article_sentences = [*article['abstract'], *article['text'], *article['keywords']]

    # Sparse (n_sentences, vocab_size) TF-IDF matrix for this article.
    sentence_vectors = vectorizer.transform(article_sentences)
    dense_vectors = sentence_vectors.toarray()

    # Row s, column k of the embedding is sum_j w[s, j] * w[s, k], i.e. each sentence vector
    # scaled by the sum of its own TF-IDF weights. Computing it as a broadcasted row scaling
    # avoids materialising an (n_sentences, vocab_size, vocab_size) intermediate array.
    # The result is deliberately left 2-D (no squeeze), so an article with a single sentence
    # still yields a (1, vocab_size) matrix that cosine_similarity accepts.
    article_embedding = dense_vectors * dense_vectors.sum(axis=1, keepdims=True)

    tfidf_embeddings.append(article_embedding)

In [None]:
# `sentence_vectors` still holds the value assigned in the last iteration of the loop above,
# so this shows the sparse TF-IDF row of the first entry of the last article processed.
sentence_vectors[0]

<1x2354 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [None]:
# Project all queries into the fitted TF-IDF space in one call;
# the result is a list with one dense 1-D vector per query.
query_embeddings = list(vectorizer.transform(queries).toarray())

In [None]:
# For every query, take the best-matching sentence of each article (TF-IDF cosine similarity)
# and show the highest-scoring hits.
top_k = 4
for qindex, query in enumerate(query_embeddings):
    candidates = []
    for index, embed in enumerate(tfidf_embeddings):
        # atleast_2d keeps this working if an article's embedding is a single 1-D vector.
        similarity = cosine_similarity(query.reshape(1, -1), np.atleast_2d(embed))
        # Only the single most similar sentence of each article competes in the ranking.
        max_sim_index = similarity.argmax()
        candidates.append((-similarity[0, max_sim_index], index, max_sim_index))

    # Sorting the (negated score, article index, sentence index) tuples puts the best score
    # first. Slicing returns at most top_k hits and never blocks, whereas calling
    # PriorityQueue.get() more times than there are queued items waits forever.
    best_hits = sorted(candidates)[:top_k]

    rtl_print([f'سوال: {queries[qindex]}', '\n'], n_to_br=True)
    for idx, (neg_score, article_idx, sentence_idx) in enumerate(best_hits):
        rtl_print(f"پاسخ {idx+1}: {flatten_data[article_idx][sentence_idx]}")
        rtl_print(f"از مقاله: {data[article_idx]['title']}")
        rtl_print(f"امتیاز: {-neg_score}")
        print()
    rtl_print("------------------------------------------------------------------------------------------------------------------------------")
    print()














































## Word2Vec

In [None]:
from hazm import word_tokenize
import gensim
from gensim.models import Word2Vec

# Tokenize every text unit of every article into words.
# The list is created once, before the loop, so it accumulates the whole corpus rather than
# being reset for each article.
article_sentences_words = []
for article in data:
    for sentence in (*article['abstract'], *article['text'], *article['keywords']):
        article_sentences_words.append(word_tokenize(sentence))

# Train a single Word2Vec model on the whole corpus, after the loop. Training inside the loop
# would replace `model` on every iteration and leave a model that has only seen the last
# article. With min_count=5, words occurring fewer than 5 times in the corpus are left out of
# the vocabulary.
# NOTE: training is stochastic and no seed is fixed here, so vectors differ between runs.
model = Word2Vec(article_sentences_words, vector_size=300, window=5, min_count=5, workers=4)

In [None]:
# Inspect one tokenized sentence: the list of tokens returned by word_tokenize.
article_sentences_words[10]

['شرکت',
 'کنندگان',
 '،',
 'جستجوی',
 'خود',
 'را',
 'با',
 'اطلاعات',
 'کلی',
 'مانند',
 'معرفی',
 'و',
 'بررسی',
 'حقایق',
 'آغاز',
 'کردند',
 'و',
 'سپس',
 'هر',
 'یک',
 'بر',
 'روی',
 'جنبه',
 'های',
 'خاصی',
 'تمرکز',
 'کردند',
 '.']

In [None]:
# Embed every text unit as the mean of the Word2Vec vectors of its in-vocabulary tokens.
# Invariant: w2v_embeddings[i][j] is the embedding of flatten_data[i][j]. The search cell uses
# (i, j) to look the sentence text up in flatten_data, so no sentence may be skipped here.
vector_size = model.wv.vector_size
w2v_embeddings = []
for article_sentences in flatten_data:
    sentences_embedding = []
    for sentence in article_sentences:
        word_vectors = [model.wv[word] for word in word_tokenize(sentence) if word in model.wv]
        if word_vectors:
            # Accumulate in float64, independent of the dtype of the stored word vectors.
            sentence_embedding = np.mean(word_vectors, axis=0, dtype=np.float64)
        else:
            # No known token: keep a zero-vector placeholder so the indices stay aligned.
            sentence_embedding = np.zeros(vector_size)
        sentences_embedding.append(sentence_embedding)
    w2v_embeddings.append(sentences_embedding)

In [None]:
# Embed every query as the mean of the Word2Vec vectors of its in-vocabulary tokens.
# Invariant: query_embeddings[k] belongs to queries[k]. The search cell prints
# queries[qindex], so a query without any known token still gets a zero-vector entry
# instead of being dropped and shifting all later queries by one.
vector_size = model.wv.vector_size
query_embeddings = []
for query in queries:
    word_vectors = [model.wv[word] for word in word_tokenize(query) if word in model.wv]
    if word_vectors:
        # Accumulate in float64, independent of the dtype of the stored word vectors.
        query_embedding = np.mean(word_vectors, axis=0, dtype=np.float64)
    else:
        query_embedding = np.zeros(vector_size)
    query_embeddings.append(query_embedding)

In [None]:
# For every query, score every embedded sentence of every article by cosine similarity
# and show the highest-scoring hits.
top_k = 4
for qindex, query in enumerate(query_embeddings):
    candidates = []
    for i, article_embeddings in enumerate(w2v_embeddings):
        if len(article_embeddings) == 0:
            continue  # nothing to compare against for this article
        # One call per article: row 0 holds the similarity of the query to each sentence.
        similarities = cosine_similarity([query], np.asarray(article_embeddings))[0]
        candidates.extend((-similarity, i, j) for j, similarity in enumerate(similarities))

    # Sorting the (negated score, article index, sentence index) tuples puts the best score
    # first. Slicing returns at most top_k hits and never blocks, whereas calling
    # PriorityQueue.get() more times than there are queued items waits forever.
    best_hits = sorted(candidates)[:top_k]

    rtl_print([f'سوال: {queries[qindex]}', '\n'], n_to_br=True)
    for idx, (neg_score, article_idx, sentence_idx) in enumerate(best_hits):
        rtl_print(f"پاسخ {idx+1}: {flatten_data[article_idx][sentence_idx]}")
        rtl_print(f"از مقاله: {data[article_idx]['title']}")
        rtl_print(f"امتیاز: {-neg_score}")
        print()
    rtl_print("------------------------------------------------------------------------------------------------------------------------------")
    print()














































## Transformer BERT

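This model is based on the "multilingual masked language modeling" approach: a neural network is trained to predict missing words in a sentence from the context in which they appear, by masking out certain words and requiring the model to recover them from the surrounding words. The `paraphrase-multilingual-mpnet-base-v2` checkpoint loaded below is a Sentence-Transformers model built on such an encoder. It maps each sentence to a fixed-size vector, so a query can be compared with every article sentence by cosine similarity.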

In [None]:
fist_start_time = time.time()

# Load the multilingual sentence encoder and move it to the selected device.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)

# flatten_data[i]: every text unit of article i, in the order
# abstract sentences, body sentences, keywords.
flatten_data = [
    [*article['abstract'], *article['text'], *article['keywords']]
    for article in data
]

# first_sentence_embeddings[i]: a tensor with one embedding row per entry of flatten_data[i].
first_sentence_embeddings = [
    torch.cat([model.encode(article_sentences, convert_to_tensor=True)])
    for article_sentences in flatten_data
]

# Report the wall-clock time of model loading plus encoding, in minutes.
first_model_time = (time.time() - fist_start_time)/60
print("Elapsed time: %s minutes" % (round(first_model_time,1)))

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Elapsed time: 3.0 minutes


In [None]:
# Embeddings of the first article: one row per entry of flatten_data[0].
first_sentence_embeddings[0]

tensor([[ 0.0720, -0.1143, -0.0077,  ...,  0.1630, -0.0522,  0.0752],
        [ 0.0941, -0.1368, -0.0073,  ...,  0.1651, -0.1220, -0.0113],
        [ 0.0441, -0.0222, -0.0105,  ...,  0.1275, -0.1031, -0.0513],
        ...,
        [-0.0473,  0.0303, -0.0159,  ..., -0.0403,  0.0362, -0.0161],
        [-0.0136,  0.0941, -0.0190,  ...,  0.0931,  0.0206, -0.0655],
        [-0.0084,  0.0302, -0.0201,  ...,  0.0497, -0.0335, -0.1463]])

In [None]:
# Encode all queries in one call with the same model; convert_to_tensor=True asks for a tensor result.
first_queries_embeddings = model.encode(queries,convert_to_tensor=True)

In [None]:
fist_start_time = time.time()

# For every query, take the best-matching sentence of each article and show the
# highest-scoring hits.
top_k = 4
for qindex, query in enumerate(first_queries_embeddings):
    candidates = []
    # For each article's sentence embeddings
    for index, embed in enumerate(first_sentence_embeddings):
        # top_k=1 keeps only the single most similar sentence of this article.
        first_results = sentence_transformers.util.semantic_search(query, embed, top_k=1)
        for hits in first_results:
            # Triple of (negated score, article index, sentence index within the article).
            candidates.append((-hits[0]['score'], index, hits[0]['corpus_id']))

    # Sorting the triples puts the best score first. Slicing returns at most top_k hits and
    # never blocks, whereas calling PriorityQueue.get() more times than there are queued
    # items waits forever.
    best_hits = sorted(candidates)[:top_k]

    rtl_print([f'سوال: {queries[qindex]}', '\n'], n_to_br=True)
    for idx, (neg_score, article_idx, sentence_idx) in enumerate(best_hits):
        rtl_print(f"پاسخ {idx+1}: {flatten_data[article_idx][sentence_idx]}")
        rtl_print(f"از مقاله: {data[article_idx]['title']}")
        rtl_print(f"امتیاز: {-neg_score}")
        print()
    rtl_print("------------------------------------------------------------------------------------------------------------------------------")
    print()

# Report the wall-clock time of the search, in minutes.
first_model_time = (time.time() - fist_start_time)/60
print("Elapsed time: %s minutes" % (round(first_model_time,1)))












































Elapsed time: 0.0 minutes
