In [None]:
!pip install hazm
!pip install datasets transformers

In [76]:
!wget https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip
!wget https://github.com/sobhe/hazm/releases/download/v0.5/resources-stanford.zip
!unzip resources-0.5.zip -d resources
!unzip resources-stanford.zip -d resources

--2023-02-26 15:28:21--  https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/roshan-research/hazm/releases/download/v0.5/resources-0.5.zip [following]
--2023-02-26 15:28:21--  https://github.com/roshan-research/hazm/releases/download/v0.5/resources-0.5.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/13956112/8c6c89ce-1918-11e5-9f06-86f58ea50386?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230226%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230226T152822Z&X-Amz-Expires=300&X-Amz-Signature=c48dd1242958a48c422bb84e29461b9e4dea25babbd024da2b65ac2a173a98fb&X-Amz-SignedHeaders=host&actor_id=0&key_

In [77]:
from datasets import load_dataset, load_metric
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import hazm
import string
from hazm import *

In [78]:
datasets = load_dataset("Gholamreza/pquad")



  0%|          | 0/3 [00:00<?, ?it/s]

In [79]:
def preprocess_document(documents_list):
    # Normalizing the text by removing diacritics and standardizing the characters
    preprocessed_texts = []
    normalizer = hazm.Normalizer()
    tokenizer = hazm.WordTokenizer()
    stopwords = hazm.stopwords_list()
    lemmatizer = hazm.Lemmatizer()
    for text in documents_list:
      text = normalizer.normalize(text)
      # Tokenizing the text into words
      words = tokenizer.tokenize(text)
      # Removing stopwords from the text
      words = [word for word in words if word not in stopwords]
      # Removing punctuation marks from the text
      translator = str.maketrans('', '', string.punctuation)
      words = [word.translate(translator) for word in words]
      # Lemmatizing the words
      # words = [lemmatizer.lemmatize(word) for word in words]
      # Joining the words back into a single string
      preprocessed_text = ' '.join(words)
      preprocessed_texts.append(preprocessed_text)
    return preprocessed_texts

In [91]:
dataset = pd.concat([pd.DataFrame(datasets["train"]),pd.DataFrame(datasets["test"]),pd.DataFrame(datasets["validation"])])
# dataset = pd.concat([pd.DataFrame(datasets["validation"])])
dataset = dataset.reset_index()

In [81]:
def add_document_column(dataset):
    df2 = dataset.groupby('title')['context'].apply(lambda x: ' '.join(x.unique())).reset_index()
    document_embeddings = preprocess_document(df2['context'].tolist())
    # document_embeddings = (df2['context'].tolist())

    df2['context'] = document_embeddings
    return df2

In [92]:
def create_search_query(text,normalizer,lemmatizer,tagger,chunker):
    # normalize = normalizer.normalize(text)
    words = word_tokenize(text)
    pos_tags = tagger.tag(words)
    tree = chunker.parse(pos_tags)
    query_words = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        for word, pos in subtree.leaves():
            query_words.append(word)
    query = ' '.join(query_words)
    return query

In [93]:
# create a new column for search query
normalizer = Normalizer()
lemmatizer = Lemmatizer()
tagger = POSTagger(model='resources/postagger.model')
chunker = Chunker(model='resources/chunker.model')
dataset['search_query'] = dataset['question'].apply(lambda x: create_search_query(x,normalizer,lemmatizer,tagger,chunker))

In [94]:
documents_df = add_document_column(dataset)

In [98]:
# Create a TfidfVectorizer object and fit it to the documents
vectorizer = TfidfVectorizer(input='content', analyzer='word', norm=None, smooth_idf=True)
tfidf_matrix = vectorizer.fit_transform((doc for doc in documents_df['context'].tolist()))

# Get the document IDs from the original list of tuples
doc_ids = [title for title in documents_df['title'].tolist()]

In [99]:
s1 = 0
s3 = 0
mrr = 0
for i in range(len(dataset)):
  output_titles=[]
  query = dataset.iloc[i]["search_query"]
  query_title = dataset.iloc[i]["title"]
  query_vector = vectorizer.transform([query])
  # Compute the cosine similarity between the query vector and the document vectors
  cosine_similarities = np.dot(query_vector, tfidf_matrix.T).toarray()[0]
  # Sort the documents by their cosine similarity scores
  sorted_indices = np.argsort(cosine_similarities)[::-1]
  for j in range(3):
    document_index = sorted_indices[j]
    document_id = doc_ids[document_index]
    output_titles.append(document_id)
  for z in range((10)):
    if query_title ==  doc_ids[sorted_indices[z]]:
      mrr += 1/(z+1)
      break
  if query_title in output_titles:
    s3+=1
  if query_title == output_titles[0]:
    s1+=1

s1 = s1/len(dataset)
s3 = s3/len(dataset)
mrr = mrr/len(dataset)

In [100]:
print("S1 = " + str(s1))
print("S3 = " + str(s3))
print("MRR = " + str(mrr))

S1 = 0.7457372116349047
S3 = 0.9139919759277834
MRR = 0.8346333643788472


=======================================

In [88]:
def unique(list1):
    unique_list = []
    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

In [89]:
titles = unique(dataset["title"])

In [90]:
s1 = 0
s3 = 0
mrr = 0
count = 0
for title in titles:
  df = dataset[dataset["title"] == title]
  df = df.reset_index(drop=True)
  unique_paraghraphs = unique(df['context'].tolist())
  vectorizer = TfidfVectorizer(input='content', analyzer='word', norm='l2', smooth_idf=True)
  tfidf_matrix = vectorizer.fit_transform((doc for doc in df['context'].tolist()))
  # Get the document IDs from the original list of tuples
  doc_ids = [title for title in df['context'].tolist()]
  for i in range(len(df)):
    output_titles=[]
    query = df.iloc[i]["question"]
    query_title = df.iloc[i]["context"]
    query_vector = vectorizer.transform([query])
    # Compute the cosine similarity between the query vector and the document vectors
    cosine_similarities = np.dot(query_vector, tfidf_matrix.T).toarray()[0]
    # Sort the documents by their cosine similarity scores
    sorted_indices = np.argsort(cosine_similarities)[::-1]
    for j in range(3):
      document_index = sorted_indices[j]
      document_id = doc_ids[document_index]
      output_titles.append(document_id)
    for z in range(min(10 ,len(sorted_indices) )):
      if query_title == doc_ids[sorted_indices[z]]:
        mrr += 1/(z+1)
        break
    if query_title in output_titles:
      s3+=1
    if query_title == output_titles[0]:
      s1+=1
    count+=1

s1 = s1/count
s3 = s3/count
mrr = mrr/count

print("S1 = " + str(s1))
print("S3 = " + str(s3))
print("MRR = " + str(mrr))

S1 = 0.8329989969909729
S3 = 0.8336258776328987
MRR = 0.8423647927910722
