# **Keyword-Based IR System Training and Evaluation Pipeline**

#### To train and evaluate a specific algorithm (TF-IDF, BM25, or the Probabilistic Relevance Model), first execute the code cells in the Dataset Preparation section, then the code cells in the algorithm's own section, and finally the code cells in the Evaluation section.

#### Dataset Preparation

In [1]:
import datasets
full_meme_queries = datasets.load_from_disk(f"./full_meme_queries") 


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install faiss-cpu sentence-transformers



In [2]:
def make_documents(row):
    """Turn one meme record into its text fields.

    Args:
        row: Mapping with the keys ``img_captions`` and ``meme_captions``
            (iterables of strings), ``title``, ``metaphors`` and, optionally,
            ``extracted_text`` holding OCR output under
            ``["<OCR_WITH_REGION>"]["labels"]``.

    Returns:
        A 5-tuple ``(img_captions, meme_captions, title, metaphors, ocr)``.
        The caption lists and the OCR labels are joined with ``", "``;
        ``metaphors`` is stringified with ``str``. When the OCR labels cannot
        be read, ``ocr`` is the sentinel ``"no ocr for this image"``.
    """
    img_captions = ', '.join(row['img_captions'])
    meme_captions = ', '.join(row['meme_captions'])
    title = row['title']
    metaphors = str(row['metaphors'])
    try:
        # "</s>" end-of-sequence markers are stripped from the joined labels.
        ocr = ', '.join(row['extracted_text']["<OCR_WITH_REGION>"]["labels"]).replace("</s>", "")
    except (KeyError, TypeError, IndexError, AttributeError):
        # Missing or malformed OCR data is expected for some records. Only
        # lookup/join failures are treated as "no OCR", so KeyboardInterrupt
        # and SystemExit still propagate (a bare ``except`` swallowed them).
        ocr = "no ocr for this image"
    return img_captions, meme_captions, title, metaphors, ocr

In [3]:
# Build the document/query collections from full_meme_queries.
#   docs              : doc_id   -> document text
#   queries           : query_id -> query text
#   ground_truth_docs : query_id -> list of doc_ids relevant to that query
# Image captions, meme captions and OCR text each become one document;
# every entry of meme['queries'] becomes one query.
import uuid

queries = {}
ground_truth_docs = {}
docs = {}
for meme in full_meme_queries:
    # title and metaphors are returned by make_documents but not indexed here
    image_caption_text, meme_caption_text, _, _, ocr_text = make_documents(meme)

    # Register every usable text of this meme under a fresh random id
    doc_ids = []
    for text in (image_caption_text, meme_caption_text, ocr_text):
        # Skip empty strings and the "OCR unavailable" sentinel
        if not text or text == "no ocr for this image":
            continue
        new_doc_id = str(uuid.uuid4())
        docs[new_doc_id] = text
        doc_ids.append(new_doc_id)

    # Every query of this meme points at all of the meme's documents
    for query_text in meme['queries']:
        new_query_id = str(uuid.uuid4())
        queries[new_query_id] = query_text
        # Independent list per query so later mutation of one cannot affect another
        ground_truth_docs[new_query_id] = list(doc_ids)

In [4]:
import pandas as pd 
# One row per document, in the insertion order of `docs`. The retrieval cells
# map positional indices back to doc_id / document text through this frame.
docs_df = pd.DataFrame(list(docs.items()),columns=['doc_id', 'document'])

In [5]:
docs_df

Unnamed: 0,doc_id,document
0,fdc54a62-60aa-48f3-99e1-88f05ebc28c7,"three heads of Avatars, flash drive and guns"
1,9de9b5ae-18fb-4702-9cbe-823650d21b70,Meme poster is trying to convey that Person fr...
2,22600b9a-b99f-4038-acfc-d22fbfecc705,"Panik, Kalm, Panik"
3,e105740b-ebc4-4dae-a9e3-19fec23de35a,The character is looking surprised.
4,d8dc3e92-731e-4f55-b7d7-a8229f21ab08,Meme poster is trying to convey that some vega...
...,...,...
18549,7d12f7dd-ba22-4aac-a676-13c9c48c3bc0,Meme poster is trying to convey that everyone'...
18550,37564a50-d27a-43a8-876e-b16d476989bd,"I don't fantastic about what I'd do if I won, ..."
18551,c3cdfa5f-9744-479f-9b7f-c9fa369264fb,Luke and Obi Wan argue so Luke decides to walk...
18552,6419e456-0e31-4579-af2d-89b1ab08bd1d,Meme poster is trying to convey that Luke ulti...


## **Keyword Based Models Training on MemeCap**

### TF-IDF

In [184]:
!pip install rank_bm25 nltk


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m[31m6.3 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.9.1


In [None]:
# Number of documents each retrieval section (TF-IDF, BM25, probabilistic) keeps per query.
top_k = 5

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform documents into TF-IDF matrix
# (the vocabulary is learned from the documents only; queries are projected
# into this space later with tfidf_vectorizer.transform)
tfidf_matrix = tfidf_vectorizer.fit_transform(docs_df["document"])

def keyword_search(query, tfidf_vectorizer, tfidf_matrix, top_k=3):
    """Rank the documents of ``docs_df`` against ``query`` by TF-IDF cosine similarity.

    Args:
        query: Query string.
        tfidf_vectorizer: Fitted vectorizer used to project ``query`` into the
            same space as ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix whose rows correspond, position by
            position, to the rows of the module-level ``docs_df``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(row_index, document_text, similarity)`` tuples, best
        match first. Empty when ``top_k`` is zero or negative.
    """
    # Guard: with top_k == 0 the slice ``[-0:]`` below would select *every*
    # document instead of none (and negative values would slice nonsensically).
    if top_k <= 0:
        return []

    # Transform the query into the same TF-IDF space
    query_vector = tfidf_vectorizer.transform([query])

    # Cosine similarity between the query vector and each document vector
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_k positions and reverse them
    top_k_indices = np.argsort(cosine_similarities)[-top_k:][::-1]

    # The indices are row *positions* in tfidf_matrix, so look documents up
    # positionally (.iloc) rather than by index label.
    documents = docs_df["document"]
    return [
        (index, documents.iloc[index], cosine_similarities[index])
        for index in top_k_indices
    ]


In [68]:
# Run TF-IDF retrieval for every query and record the retrieved doc ids.
# Every 100th query is echoed together with its ranked hits as a sanity check.
retrieved_docs = {}
print("Number of queries =", len(queries))
for qidx, (qid, query) in enumerate(queries.items()):
    verbose = qidx % 100 == 0
    top_k_results = keyword_search(query, tfidf_vectorizer, tfidf_matrix, top_k=top_k)
    if verbose:
        print(query)
        print("query number ", qidx)
    rel_docs = []
    for rank, (index, doc, score) in enumerate(top_k_results, start=1):
        # Map the row index returned by keyword_search back to its doc_id
        rel_docs.append(docs_df["doc_id"][index])
        if verbose:
            print(f"Rank {rank} - Document: '{docs_df['document'][index]}', Score: {score:.4f}")
    retrieved_docs[qid] = rel_docs

Number of queries = 36197
meme with three heads and gun that's actually a flash drive
query number  0
Rank 1 - Document: 'three heads of Avatars, flash drive and guns ', Score: 0.6348
Rank 2 - Document: 'Meme poster is trying to convey that Person freaks out at a gun, is fine when they learn the gun has a flash drive magazine, but then starts freaking out again when learning the usb is filled with virtual bullets. ', Score: 0.3465
Rank 3 - Document: 'a picture of a girl smiling and meme heads', Score: 0.3018
Rank 4 - Document: 'a masculine man and lady with some animals and meme heads', Score: 0.2586
Rank 5 - Document: 'two meme heads at the up and down sides', Score: 0.2569
funny doctor meme with monk and wine
query number  100
Rank 1 - Document: 'A monk happily holds up a glass of wine then drinks it.', Score: 0.4714
Rank 2 - Document: 'A picture of  Doctor Nurse  funny Meme.', Score: 0.3941
Rank 3 - Document: 'God is holding up a wine cup.', Score: 0.2960
Rank 4 - Document: 'Meme po

### BM25

In [74]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data for tokenization.
# Older NLTK releases load the "punkt" resource; NLTK 3.9.x (the install cell
# in this notebook pulled in 3.9.1) looks up "punkt_tab" instead, so fetch
# both to keep word_tokenize working on either version.
nltk.download("punkt")
nltk.download("punkt_tab")
def preprocess_and_tokenize(text):
    """Lowercase ``text`` and return the token list produced by ``word_tokenize``."""
    return word_tokenize(text.lower())
# Tokenize and preprocess documents
# (kept in docs_df row order, so position i of tokenized_corpus is row i of docs_df)
tokenized_corpus = [preprocess_and_tokenize(doc) for doc in docs_df["document"]]


[nltk_data] Downloading package punkt to
[nltk_data]     /home/ahmed.attia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [75]:
# Build the BM25 index over the tokenized documents; the retrieval loop maps
# score positions returned by this index back to docs_df rows.
bm25 = BM25Okapi(tokenized_corpus)

In [76]:
# Run BM25 retrieval for every query and record the top_k doc ids per query.
import heapq

retrieved_docs = {}

for qidx, (qid, query) in enumerate(queries.items()):
    # Tokenize queries with the same helper used for the corpus so the two
    # sides can never drift apart if the preprocessing changes.
    tokenized_query = preprocess_and_tokenize(query)
    # One BM25 score per document, in tokenized_corpus / docs_df row order
    doc_scores = bm25.get_scores(tokenized_query)
    # heapq.nlargest is documented as equivalent to
    # sorted(..., key=key, reverse=True)[:top_k], without fully sorting every
    # document score for every query.
    top_k_indices = heapq.nlargest(top_k, range(len(doc_scores)), key=lambda i: doc_scores[i])
    if qidx%100 == 0:
        print(qidx)
    rel_docs = [docs_df["doc_id"][index] for index in top_k_indices]
    retrieved_docs[qid] = rel_docs


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

### Probabilistic Relevance Model

In [7]:
# Tabular view of the queries (query_id, query); in the cells shown here it is only displayed for inspection.
queries_df = pd.DataFrame(list(queries.items()),columns=['query_id', 'query'])

In [8]:
queries_df

Unnamed: 0,query_id,query
0,0ebfc319-dd47-4788-8183-2b202e057d95,meme with three heads and gun that's actually ...
1,ed2161c3-52c2-4d1b-9ce7-4b8a81b9f29f,funny image with person freaking out over virt...
2,7a829dfc-9781-440f-92a6-396a2c2fb00b,plot twist meme with avatar heads and gun with...
3,8a4a654f-81da-445c-8870-ab7415df7c43,picture with caption panik kalm panik and a gu...
4,d4548ebc-fa2f-412d-bb2c-fef3a83bcc65,humorous meme about humanity with three heads ...
...,...,...
36192,750c16c7-1007-4e5f-85b7-58a8e601854b,Star Wars argument about sand people hiding in...
36193,005bbac2-306c-4f81-b7e8-2e0188e9501e,Luke and Obi Wan disagree on desert dwellers' ...
36194,ac682fc5-eb51-4994-9748-8b36aa8f1aa0,Meme about sandpeople riding in single file to...
36195,6f5f1f8f-ba71-4b30-9ee8-346337f47fe0,Another Star Wars myth busted about desert cre...


In [9]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Step 1: Preprocess documents and queries with sparse matrix support
# doc_ids / query_ids fix the row order: row i of doc_term_matrix belongs to
# doc_ids[i], and row j of query_term_matrix belongs to query_ids[j].
doc_ids = list(docs.keys())
query_ids = list(queries.keys())
document_texts = [docs[doc_id] for doc_id in doc_ids]
query_texts = [queries[q_id] for q_id in query_ids]

# Fit vectorizer on document collection
# binary=True records term presence/absence rather than counts. Queries are
# transformed with the vocabulary fitted on the documents.
vectorizer = CountVectorizer(binary=True, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(document_texts)
query_term_matrix = vectorizer.transform(query_texts)
# NOTE(review): `terms` is not read by any cell visible here — confirm it is still needed.
terms = vectorizer.get_feature_names_out()

# Precompute document counts for each term
# With a binary matrix the column sums are the number of documents that
# contain each term; .A1 flattens the 1 x n_terms result to a 1-D array.
total_doc_count = doc_term_matrix.shape[0]
term_doc_count = doc_term_matrix.sum(axis=0).A1

# Step 2: Precompute term probabilities for relevant/non-relevant docs
def precompute_term_probabilities():
    """Estimate, per query, each term's presence probability in relevant vs. non-relevant documents.

    Reads the module-level ``query_ids``, ``doc_ids``, ``ground_truth_docs``,
    ``doc_term_matrix``, ``term_doc_count`` and ``total_doc_count``.

    Returns:
        dict mapping query id -> ``(prob_term_relevant, prob_term_non_relevant)``,
        two dense 1-D arrays with one entry per vocabulary term.

    Raises:
        KeyError: if a ground-truth doc id is not present in ``doc_ids``.

    NOTE(review): the estimates are derived from ``ground_truth_docs``, i.e.
    from the same relevance judgements the Evaluation section scores against,
    so metrics for this model reflect access to the true relevant set —
    confirm this is intended.
    """
    # Resolve doc id -> matrix row once. Calling doc_ids.index() inside the
    # loop rescans the whole id list for every relevant document of every query.
    doc_row_by_id = {doc_id: row for row, doc_id in enumerate(doc_ids)}

    term_probabilities = {}
    for query_id in query_ids:
        relevant_indices = [doc_row_by_id[doc_id] for doc_id in ground_truth_docs[query_id]]
        relevant_matrix = doc_term_matrix[relevant_indices]

        # Per-term document counts inside and outside the relevant set
        relevant_counts = relevant_matrix.sum(axis=0).A1
        non_relevant_counts = term_doc_count - relevant_counts
        total_relevant_docs = len(relevant_indices)
        total_non_relevant_docs = total_doc_count - total_relevant_docs

        # Laplace smoothing (+1 / +2) keeps every probability strictly
        # between 0 and 1, so the log-ratio used for scoring is finite.
        prob_term_relevant = (relevant_counts + 1) / (total_relevant_docs + 2)
        prob_term_non_relevant = (non_relevant_counts + 1) / (total_non_relevant_docs + 2)

        term_probabilities[query_id] = (prob_term_relevant, prob_term_non_relevant)
    return term_probabilities

# query_id -> (P(term | relevant), P(term | non-relevant)) arrays, read by score_documents.
term_probabilities = precompute_term_probabilities()

In [10]:
# Step 3: Score documents using vectorized operations
def score_documents(query_id, query_idx):
    """Score every document for one query and return the ``top_k`` best.

    Args:
        query_id: Key into ``term_probabilities``.
        query_idx: Row of ``query_term_matrix`` holding this query.

    Returns:
        List of ``(doc_id, score)`` tuples, highest score first, at most
        ``top_k`` long. A document's score is the sum of
        ``log(P(term | relevant) / P(term | non-relevant))`` over the query
        terms the document contains.
    """
    prob_relevant, prob_non_relevant = term_probabilities[query_id]
    # Column indices of the vocabulary terms present in this query
    query_terms = query_term_matrix[query_idx].nonzero()[1]

    # Progress indicator: echo every 100th query id
    if query_idx%100 == 0:
        print(query_id)

    # One log-ratio weight per query term, broadcast across all documents
    term_weights = np.log(prob_relevant[query_terms] / prob_non_relevant[query_terms])
    weighted_presence = doc_term_matrix[:, query_terms].multiply(term_weights)
    scores = np.array(weighted_presence.sum(axis=1)).flatten()

    scored_docs = list(zip(doc_ids, scores))
    scored_docs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_docs[:top_k]

# Step 4: Run ranking for each query
# idx is the query's position in query_ids, which is also its row in query_term_matrix.
ranked_results = {query_id: score_documents(query_id, idx) for idx, query_id in enumerate(query_ids)}


0ebfc319-dd47-4788-8183-2b202e057d95
d670b02e-9f78-42d7-8b7e-5a9e8b723838
1897cb8d-e9e1-48e6-b90c-9fa0664dbe3b
bc1f6129-7e7f-47c6-a238-55bbf395765f
131a611a-f0db-4947-ad12-e6d0bcb66e5c
648b6b2c-5861-4b62-b880-000fb969c5af
21ffb495-6c3b-4186-8c9c-37665da37b43
c5d2d15f-0bef-4f3c-9fd2-2f9b95d95fca
3ca1df9c-e315-4f65-bc40-66636cbfba7b
d177e435-92e0-4112-91ea-2039b0092e8b
a53abdb3-8161-40c5-97d2-2cf368c7d6a0
efc212ba-7acf-4b4a-9db7-c2a45fa8e090
6dbb1c56-3881-4909-a4b0-8ce00f4138a9
3545292b-80be-4633-8505-23b11ba223af
6f8fd84a-6eb1-4431-8aef-b9bd6bc53a25
5cdad741-3e77-4636-a9a2-84641556314a
10e0bcd0-c32b-4e4c-9982-50c1bda4d579
0f51bdd2-9ddf-4808-91e8-4edb5de8f09d
121595b9-89c0-498c-ad16-d73ba3c4ea42
fcab4bd9-5a6b-452d-a1c8-eb2963f4061c
e4e48ec7-faad-4e80-85e0-33cebe4c3ac9
5eb5b544-e6de-4094-8b56-0ca244423cc5
7d85a01d-251b-41ba-93d1-bd0cdef1f21f
bc55d79b-1dbc-4114-80f6-73d77cc18b6c
012d8ba4-16e6-49ff-bdd9-75d61f681b55
89d7cb7a-23a9-4750-9115-860789fc47e1
8f308f1c-d78e-4998-8619-c61e8074b9bf
5

In [13]:
# Keep only the doc ids (first element of each ranked entry) for evaluation.
retrieved_docs = {
    query_id: [entry[0] for entry in rel_docs]
    for query_id, rel_docs in ranked_results.items()
}

In [14]:
retrieved_docs

{'0ebfc319-dd47-4788-8183-2b202e057d95': ['fdc54a62-60aa-48f3-99e1-88f05ebc28c7',
  '9de9b5ae-18fb-4702-9cbe-823650d21b70',
  '89735bf1-f8a8-4276-8fee-6988159d5ff9',
  '9470ed07-9a6e-414d-82fa-414dda5715e7',
  'abbf7bdd-3609-4d48-a282-56a5b32fb8ef'],
 'ed2161c3-52c2-4d1b-9ce7-4b8a81b9f29f': ['9de9b5ae-18fb-4702-9cbe-823650d21b70',
  '419af8ef-d60b-464b-a0ac-96d234f5f29c',
  'd8e039be-6099-4e86-be6d-433f03179af2',
  '32b50870-458a-491b-80ab-39671af2da26',
  '744a445d-d883-4b96-b19d-54a6941bafe4'],
 '7a829dfc-9781-440f-92a6-396a2c2fb00b': ['3b166aea-553b-4127-94d4-b45469717734',
  '361440bc-58db-4a05-9de6-5b580ec1d28b',
  '1b729e69-1ee8-4bb1-aa1e-0554ea1f6042',
  '1fa63152-a591-47a8-9ae0-eab9d6cc7e17',
  'cd70db5a-08c4-47f7-9905-56512ea2b33d'],
 '8a4a654f-81da-445c-8870-ab7415df7c43': ['9de9b5ae-18fb-4702-9cbe-823650d21b70',
  '22600b9a-b99f-4038-acfc-d22fbfecc705',
  '0d473ed5-17cf-4309-98ec-e6ab4975877a',
  '120012ab-b186-48be-9f2f-4d9222c52770',
  '302573df-250d-4798-80f1-295f1863d4de

## **Evaluation**

In [15]:
def evaluation_pipeline(retrieved_docs, ground_truth_docs, ks=(1, 3, 5)):
    """Score ranked retrieval results against ground-truth relevance lists.

    Args:
        retrieved_docs: dict mapping query id -> ranked list of retrieved doc
            ids (best first). Must contain every query id of
            ``ground_truth_docs``.
        ground_truth_docs: dict mapping query id -> list of relevant doc ids.
        ks: Cutoffs for precision@k and recall@k. Average precision is
            computed over the top ``max(ks)`` results. Defaults to
            ``(1, 3, 5)``.

    Returns:
        dict with one entry per query id::

            {k: (precision@k, recall@k) for k in ks, "rr": ..., "ap": ...}

        plus an ``"overall"`` entry holding ``"MAP"``, ``"MRR"``,
        ``"Mean Precision@k"`` and ``"Mean Recall@k"`` (the last two are
        dicts keyed by k).

    Raises:
        ValueError: if ``ks`` is empty or contains a non-positive cutoff.
        KeyError: if a query id of ``ground_truth_docs`` is missing from
            ``retrieved_docs``.
    """
    # De-duplicate while preserving order so a repeated cutoff is scored once.
    ks = list(dict.fromkeys(ks))
    if not ks or any(k <= 0 for k in ks):
        raise ValueError("ks must contain at least one positive cutoff")
    ap_cutoff = max(ks)

    def precision_at_k(retrieved, relevant, k):
        # Divides by k even when fewer than k documents were retrieved.
        relevant_set = set(relevant)
        hits = sum(1 for doc in retrieved[:k] if doc in relevant_set)
        return hits / k

    def recall_at_k(retrieved, relevant, k):
        if not relevant:
            return 0
        relevant_set = set(relevant)
        hits = sum(1 for doc in retrieved[:k] if doc in relevant_set)
        return hits / len(relevant)

    def average_precision(retrieved, relevant, k):
        # Normalised by the total number of relevant documents, not by the
        # number of relevant documents found in the top k.
        if not relevant:
            return 0
        relevant_set = set(relevant)
        hits = 0
        precision_sum = 0
        for rank, doc in enumerate(retrieved[:k], start=1):
            if doc in relevant_set:
                hits += 1
                precision_sum += hits / rank
        return precision_sum / len(relevant)

    def reciprocal_rank(retrieved, relevant):
        # Looks at the whole retrieved list, not just the top k.
        relevant_set = set(relevant)
        for rank, doc in enumerate(retrieved, start=1):
            if doc in relevant_set:
                return 1 / rank
        return 0

    def mean(values):
        return sum(values) / len(values) if values else 0

    query_results = {}
    average_precisions = []
    reciprocal_ranks = []
    precisions_by_k = {k: [] for k in ks}
    recalls_by_k = {k: [] for k in ks}

    # Single pass: compute the per-query metrics and collect them for the means.
    for query_id, relevant_docs in ground_truth_docs.items():
        retrieved_docs_list = retrieved_docs[query_id]
        per_query = {}
        for k in ks:
            precision_k = precision_at_k(retrieved_docs_list, relevant_docs, k)
            recall_k = recall_at_k(retrieved_docs_list, relevant_docs, k)
            per_query[k] = (precision_k, recall_k)
            precisions_by_k[k].append(precision_k)
            recalls_by_k[k].append(recall_k)
        per_query["rr"] = reciprocal_rank(retrieved_docs_list, relevant_docs)
        per_query["ap"] = average_precision(retrieved_docs_list, relevant_docs, ap_cutoff)
        reciprocal_ranks.append(per_query["rr"])
        average_precisions.append(per_query["ap"])
        query_results[query_id] = per_query

    # Corpus-level summary, stored next to the per-query entries.
    query_results["overall"] = {
        "MAP": mean(average_precisions),
        "MRR": mean(reciprocal_ranks),
        "Mean Precision@k": {k: mean(values) for k, values in precisions_by_k.items()},
        "Mean Recall@k": {k: mean(values) for k, values in recalls_by_k.items()},
    }
    return query_results


# Evaluates whichever retrieval section was run last: each section (TF-IDF,
# BM25, probabilistic) assigns its own results to `retrieved_docs`.
query_results = evaluation_pipeline(retrieved_docs, ground_truth_docs)

In [16]:
query_results["overall"]

{'MAP': 0.38410879661606195,
 'MRR': 0.7471632087373673,
 'Mean Precision@k': {1: 0.6948918418653479,
  3: 0.37835548065681734,
  5: 0.25219217062181526},
 'Mean Recall@k': {1: 0.2407704137543145,
  3: 0.39120184914038986,
  5: 0.4343960365039739}}