In [1]:
import re
import os
import pandas as pd
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as ar_stp
import pyterrier as pt
# PyTerrier is a Python API for Terrier: https://github.com/terrier-org/pyterrier
# Terrier is a modular, open-source platform for the rapid development of
# large-scale information retrieval applications.
# pt.init() is only called when PyTerrier reports that it has not been started
# yet, so re-running this cell does not initialise it a second time.
# The helper version is pinned to "0.0.6".
if not pt.started():
    pt.init(helper_version="0.0.6")

PyTerrier 0.7.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Root directory of the data; replace this placeholder with your own path.
# The index path, the query file path and the run file path are all built from it.
data_path = "path_to_your_data_directory"

In [3]:
# Global string constants, used mainly as DataFrame column names.
# TEXT, QUERY, QID and DOC_NO are used by prepare_query_for_search when
# reading the query file and building the frame that is passed to retrieval.
TEXT = "text"
QUERY = "query"
LABEL = "label"
RANK = "rank"
TAG = "tag"
SCORE = "score"
QID = "qid"
DOC_NO = "docno"
DOCID = "docid"


## Load the index 

In [4]:
def load_index(index_path):
    """Open the Terrier index referenced by ``index_path``.

    On success a confirmation message is printed and the object returned by
    ``pt.IndexFactory.of`` is handed back. If anything raises while loading,
    the exception details are printed and an empty list is returned instead.
    """
    try:
        loaded_index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
    except Exception as error:
        # Best-effort: report the problem and signal failure with an empty list.
        print('Cannot load the index, check exception details {}'.format(error))
        return []
    else:
        return loaded_index


# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    """Load ``input_file`` into a DataFrame, choosing the reader by extension.

    Files ending in ``.xlsx`` are read with ``pd.read_excel`` (``sep`` and
    ``names`` are not used for them). Every other file is read with
    ``pd.read_csv`` as UTF-8 text using ``sep`` as the delimiter; ``names`` is
    forwarded as the column names unless it is the empty-string default.
    """
    if input_file.endswith(".xlsx"):
        return pd.read_excel(input_file)

    csv_options = {"sep": sep, "encoding": "utf-8"}
    if names != "":
        # Only pass column names when the caller actually supplied some.
        csv_options["names"] = names
    return pd.read_csv(input_file, **csv_options)

In [5]:
# Path of the index properties file inside the data directory
index_path = os.path.join(data_path ,"QPC_Index/data.properties")

# load_index prints the outcome; if loading fails it returns an empty list
index = load_index(index_path=index_path)

Index was loaded successfully from this path:  /data/watheq/Quran_QA/quran-qa-2023/Task-A/data/QPC_Index/data.properties


## Search in the index

Before searching in the index, we need to prepare some functions to clean the input text.

### Cleaning 
Clean the text of URLs, handles, special characters, tabs, line breaks, extra white space, and punctuation.

In [6]:


# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    """Strip URLs, handles, special characters and punctuation from ``text``.

    Steps, in order:
      1. URLs (anything starting with ``http``) become a space.
      2. Handles (``@`` followed by word characters) become a space.
      3. The separators ``. , # _ | : ? / =`` and the Arabic question mark
         become a space, so the words around them stay apart.
      4. Any remaining character that is neither a word character nor
         whitespace is deleted.
      5. Runs of whitespace (including tabs and line breaks) collapse to a
         single space, and leading/trailing whitespace is trimmed.

    Returns the cleaned string.
    """
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    # \u061F is the Arabic question mark; without it a "؟" sitting between two
    # words would be deleted by the next step and glue the words together.
    text = re.sub(r"[.,#_|:?\u061F/=]", " ", text)  # remove special characters
    text = re.sub(r"[^\w\s]", "", text)  # remove remaining punctuation
    # Whitespace is normalised last: deleting punctuation above can leave two
    # neighbouring spaces behind (e.g. "a - b"), which must still be collapsed.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


### Preprocessing 
Preprocess the Arabic input text by normalizing it, removing stop words, and stemming (applied in that order).

In [7]:

# Arabic stemmer from snowballstemmer, created once at module level and
# reused by ar_stem for every token
ar_stemmer = stemmer("arabic")

# remove arabic stop words
def ar_remove_stop_words(sentence):
    """Return ``sentence`` without the tokens found in the Arabic stop-word list.

    The sentence is split on whitespace, every token that appears in
    ``ar_stp.stopwords_list()`` is dropped, and the remaining tokens are
    joined back together with single spaces.
    """
    stop_words = set(ar_stp.stopwords_list())
    kept_tokens = [token for token in sentence.split() if token not in stop_words]
    return " ".join(kept_tokens)


# normalize the arabic text
def normalize_arabic(text):
    """Map orthographic variants of Arabic letters to one canonical form.

    The substitutions are applied one after another, in the order listed
    in ``substitutions``. Returns the normalized string.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),  # all alef variants -> bare alef
        ("ى", "ي"),  # alef maqsura -> ya
        ("ؤ", "ء"),  # hamza on waw -> standalone hamza
        ("ئ", "ء"),  # hamza on ya -> standalone hamza
        ("ة", "ه"),  # ta marbuta -> ha
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# stem the arabic text
def ar_stem(sentence):
    """Stem each whitespace-separated token of ``sentence`` with ``ar_stemmer``.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_tokens = []
    for token in sentence.split():
        stemmed_tokens.append(ar_stemmer.stemWord(token))
    return " ".join(stemmed_tokens)


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text): 
    """Run the complete Arabic preprocessing pipeline on ``text``.

    The three steps are applied in this order: ``normalize_arabic``,
    ``ar_remove_stop_words``, ``ar_stem``. Returns the processed string.
    """
    # NOTE(review): stop words are matched against already-normalized tokens;
    # confirm how the index was preprocessed before changing this order.
    return ar_stem(ar_remove_stop_words(normalize_arabic(text)))


def prepare_query_for_search(query_path, query_column=TEXT,
                        id_column=DOC_NO):
    """Read a query file and build the query frame that is passed to retrieval.

    Parameters:
        query_path: path of the query file, read through ``read_file``.
        query_column: name given to (and used to look up) the query text column.
        id_column: name given to (and used to look up) the query id column.

    The column names ``[id_column, query_column]`` are passed to ``read_file``.
    The query text is then cleaned with ``clean`` and preprocessed with
    ``preprocess_arabic``, and the ids are converted to strings.

    Returns a DataFrame with exactly two columns: ``QID`` and ``QUERY``.
    """
    # Name the columns after the requested id/text columns so that the lookups
    # below also work for non-default arguments. With the defaults this is
    # [DOC_NO, TEXT], exactly as before.
    names = [id_column, query_column]
    print("Cleaning queries and applying preprocessing steps")
    df_query = read_file(query_path, names=names)
    # apply the cleaning function on the queries/questions
    df_query[QUERY] = df_query[query_column].apply(clean)

    # apply normalization, stemming and stop word removal
    print("Applying normalization, stemming and stop word removal")
    df_query[QUERY] = df_query[QUERY].apply(preprocess_arabic)

    df_query[QID] = df_query[id_column].astype(str)  # convert the id column to string
    df_query = df_query[[QID, QUERY]]  # keep the columns needed for search
    print("Done with preparation!")
    return df_query
    

## Search
Search in the index and find the relevant passages.

In [9]:
query_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")
run_save_path = os.path.join(data_path, "runs/bigIR_BM25.tsv")

# 1. initialize the BM25 retrieval model (num_results=5 results per query)
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=5)

# 2. read the query file and prepare it for search to match pyterrier format
df_query = prepare_query_for_search(query_path)

# 3. search using BM25 model
df_results = BM25_model.transform(df_query)

# 4. arrange the results in trec format:
#    question-id, Q0, passage-id, rank, score, tag
trec_columns = ["question-id", "Q0", "passage-id", RANK, SCORE, TAG]
df_run = (
    df_results
    .assign(**{"Q0": "Q0", TAG: "BM25"})  # constant columns, broadcast to every row
    .rename(columns={QID: "question-id", DOC_NO: "passage-id"})
    .loc[:, trec_columns]
)

# 5. save the run to a file, creating the output directory if it is missing
os.makedirs(os.path.dirname(run_save_path), exist_ok=True)
df_run.to_csv(run_save_path, sep="\t", index=False, header=False)
df_run

Cleaning queries and applying preprocessing steps
Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,question-id,Q0,passage-id,rank,score,tag
0,114,Q0,29:38-40,0,11.484285,BM25
1,114,Q0,28:81-84,1,10.095525,BM25
2,114,Q0,75:1-15,2,7.969817,BM25
3,114,Q0,34:1-2,3,3.429409,BM25
4,114,Q0,28:1-6,4,3.274326,BM25
...,...,...,...,...,...,...
116,428,Q0,17:88-89,0,12.692842,BM25
117,428,Q0,17:9-11,1,11.947024,BM25
118,428,Q0,2:170-171,2,10.099939,BM25
119,428,Q0,29:41-43,3,9.653199,BM25


## Evaluation

In [11]:
! python QQA23_TaskA_eval.py \
    -r "path_to_your_data_directory/data/runs/bigIR_BM25.tsv" \
    -q "path_to_your_data_directory/data/qrels/QQA23_TaskA_qrels_dev.gold"

Format check: Passed
     map  recip_rank
0.170291    0.313333


Here, we are just evaluating the perfect run for the dev set

In [19]:
# Evaluate the "perfect" dev run against the dev qrels.
# Both paths are relative to the directory the notebook is run from.
! python QQA23_TaskA_eval.py \
    -r "./data/runs/dev_perfect.tsv" \
    -q "./data/qrels/QQA23_TaskA_qrels_dev.gold"

Format check: Passed
 map  recip_rank
 1.0         1.0
