In [1]:
from google.colab import drive
drive.mount('/content/drive')

#Load preprocessed dataset from part 1
import json

path = "/content/drive/Shareddrives/UPF_IRWA_project/fashion_products_dataset_preprocessed.json"

with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

print("Loaded documents:", len(dataset))
print("Example fields:", list(dataset[0].keys()))


Mounted at /content/drive
Loaded documents: 28080
Example fields: ['_id', 'pid', 'title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller', 'out_of_stock', 'selling_price', 'discount', 'actual_price', 'average_rating', 'url', 'images', 'crawled_at', 'title_proc', 'description_proc']


# **1. Indexing**

**1.1 Build inverted index:**

In [2]:
from collections import defaultdict
from array import array
import re

def build_terms(text):
    """
    Simple tokenizer: splits on whitespace and removes non-alphanumeric characters.
    Input text must be already lowercased and preprocessed.
    """
    tokens = re.findall(r'\b[a-z0-9]+\b', text.lower())
    return tokens


def create_index(dataset):
    """
    Build an inverted index for the fashion products dataset.

    Arguments:
    dataset -- list of documents (each document = dictionary with fields like title_proc, description_proc...)

    Returns:
    index -- dictionary { term: [ [doc_id, [positions]], ... ] }
    pid_map -- dictionary { internal_doc_id: product_title }  (to display titles later)
    """
    index = defaultdict(list)
    pid_map = {}  # map internal doc ids (ints) to titles

    for doc_id, doc in enumerate(dataset):  # numeric id for simplicity
        pid_map[doc_id] = doc.get('title', 'Untitled')

        # Concatenate main searchable text fields
        text = " ".join([
            doc.get("title_proc", ""),
            doc.get("description_proc", ""),
            doc.get("brand_proc", ""),
            doc.get("category_proc", ""),
            doc.get("sub_category_proc", "")
        ])

        terms = build_terms(text)
        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                # if the term is already in current page index â†’ append new position
                current_page_index[term][1].append(position)
            except:
                # otherwise create new posting with doc_id and position
                current_page_index[term] = [doc_id, array('I', [position])]

        # merge with main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    return index, pid_map


# --- Build the index ---
index, pid_map = create_index(dataset)

print("Inverted index built successfully")
print("Number of unique terms:", len(index))
sample_term = list(index.keys())[0]
print(f"Example term: '{sample_term}' â†’ postings: {index[sample_term][:3]}")




Inverted index built successfully
Number of unique terms: 5124
Example term: 'solid' â†’ postings: [[0, array('I', [0])], [1, array('I', [0])], [2, array('I', [0])]]


**1.2 Propose test queries:**

In [3]:
test_queries = [
    "women track pant",
    "men track pant",
    "men pack",
    "women formal shirt",
    "men slim fit formal shirt"
]

print(" Check which queries have matching documents:")

for i, q in enumerate(test_queries, 1):
    tokens = q.lower().split()
    matching_docs = [
        doc['title'] for doc in dataset
        if all(token in doc['title_proc'] for token in tokens)
    ]
    print(f"\nQ{i}: {q}")
    if matching_docs:
        print(f"   {len(matching_docs)} matching documents found")
        for title in matching_docs[:5]:
            print("   ", title)
    else:
        print("   No matching documents")


 Check which queries have matching documents:

Q1: women track pant
   581 matching documents found
    Solid Women Multicolor Track Pants
    Solid Women Multicolor Track Pants
    Solid Women Brown, Grey Track Pants
    Solid Women Multicolor Track Pants
    Solid Women Dark Blue Track Pants

Q2: men track pant
   1172 matching documents found
    Solid Women Multicolor Track Pants
    Solid Men Blue Track Pants
    Solid Men Multicolor Track Pants
    Solid Women Multicolor Track Pants
    Solid Women Brown, Grey Track Pants

Q3: men pack
   4500 matching documents found
    Women Self Design Ankle LengthÂ Â (Pack of 4)
    Solid Men Polo Neck Dark Blue, Blue T-ShirtÂ Â (Pack of 2)
    Solid Men Polo Neck Red, Grey T-ShirtÂ Â (Pack of 2)
    Solid Men Polo Neck Grey, Black T-ShirtÂ Â (Pack of 2)
    Solid Men Polo Neck Dark Blue, Grey T-ShirtÂ Â (Pack of 2)

Q4: women formal shirt
   238 matching documents found
    Women Regular Fit Solid Button Down Collar Formal Shirt
    Women S

**1.3 Rank your results:**

In [4]:
import math
from collections import defaultdict

#term frequency
def compute_tf(term, doc_id, index):
    """TF = frequency of term in document / number of terms in document"""
    postings = index.get(term, [])
    freq = 0
    for doc, positions in postings:
        if doc == doc_id:
            freq = len(positions)
            break
    # total terms in doc (for normalization)
    total_terms = len(doc_texts[doc_id].split())
    return freq / total_terms if total_terms > 0 else 0


# IDF
def compute_idf(term, index, N):
    """IDF = log(N / df(term))"""
    df = len(index.get(term, []))
    return math.log((N + 1) / (df + 1)) + 1   # smooth version to avoid /0


# TF-IDF
def compute_tfidf(query, index, N):
    """
    Returns a dict {doc_id: tfidf_score} for all docs matching the query (AND)
    """
    terms = query.lower().split()

    # get candidate documents that contain ALL query terms (AND)
    candidate_docs = None
    for t in terms:
        docs_with_t = set([doc for doc, _ in index.get(t, [])])
        candidate_docs = docs_with_t if candidate_docs is None else candidate_docs & docs_with_t

    if not candidate_docs:
        return {}

    scores = defaultdict(float)
    for doc_id in candidate_docs:
        for t in terms:
            tf = compute_tf(t, doc_id, index)
            idf = compute_idf(t, index, N)
            scores[doc_id] += tf * idf
    return scores


# Rank
def rank_documents(query, index, N, top_k=10):
    scores = compute_tfidf(query, index, N)
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    results = [(pid_map[doc_id], round(score, 4)) for doc_id, score in ranked_docs]
    return results


doc_texts = [" ".join([
    d.get("title_proc", ""),
    d.get("description_proc", ""),
    d.get("brand_proc", ""),
    d.get("category_proc", ""),
    d.get("sub_category_proc", "")
]) for d in dataset]

N = len(dataset)


for i, q in enumerate(test_queries, 1):
    print(f"\nðŸ”¹ Query {i}: {q}")
    ranked = rank_documents(q, index, N, top_k=5)
    if not ranked:
        print("No matching documents.")
    else:
        for rank, (title, score) in enumerate(ranked, 1):
            print(f"{rank}. {title}  |  TF-IDF: {score}")



ðŸ”¹ Query 1: women track pant
1. Solid Women Multicolor Track Pants  |  TF-IDF: 2.2847
2. Solid Women Multicolor Track Pants  |  TF-IDF: 2.2847
3. Solid Women Multicolor Track Pants  |  TF-IDF: 2.2847
4. Solid Women Multicolor Track Pants  |  TF-IDF: 2.2847
5. Solid Women Black Track Pants  |  TF-IDF: 1.8378

ðŸ”¹ Query 2: men track pant
1. Solid Men Multicolor Track Pants  |  TF-IDF: 2.2909
2. Solid Men Multicolor Track Pants  |  TF-IDF: 2.2909
3. Solid Men Blue Track Pants  |  TF-IDF: 1.8327
4. Solid Men Green Track Pants  |  TF-IDF: 1.8327
5. Solid Men Multicolor Track Pants  |  TF-IDF: 1.8327

ðŸ”¹ Query 3: men pack
1. Men BriefÂ Â (Pack of 12)  |  TF-IDF: 1.1269
2. Men BriefÂ Â (Pack of 2)  |  TF-IDF: 1.1269
3. Men BriefÂ Â (Pack of 5)  |  TF-IDF: 1.1269
4. Men BriefÂ Â (Pack of 6)  |  TF-IDF: 1.1269
5. Men BriefÂ Â (Pack of 4)  |  TF-IDF: 1.1269

ðŸ”¹ Query 4: women formal shirt
1. Women Solid Formal Shirt  |  TF-IDF: 1.8863
2. Women Regular Fit Checkered Formal Shirt  |  TF-ID

# **2. Evaluation**

**2.1 Implement the following evaluation metrics to assess the effectiveness of your retrieval
solutions. These metrics will help you measure how well your system retrieves relevant
documents for each query:**

i. Precision@K (P@K)

In [5]:
import numpy as np

# Precision@k
def precision_at_k(relevant_docs, retrieved_docs, k):
    retrieved_k = retrieved_docs[:k]
    retrieved_set = set(retrieved_k)
    relevant_set = set(relevant_docs)
    return len(retrieved_set & relevant_set) / k



ii. Recall@K (R@K)

In [6]:
# Recall@K

def recall_at_k(relevant_docs, retrieved_docs, k):
    retrieved_k = retrieved_docs[:k]
    retrieved_set = set(retrieved_k)
    relevant_set = set(relevant_docs)
    return len(retrieved_set & relevant_set) / len(relevant_set) if relevant_set else 0.0



iii. Average Precision@K (P@K)

In [7]:
# Average Precision@K
def average_precision_at_k(relevant_docs, retrieved_docs, k):
    retrieved_k = retrieved_docs[:k]
    score = 0.0
    relevant_found = 0
    relevant_set = set(relevant_docs)
    counted = set()
    for i, doc in enumerate(retrieved_k, start=1):
        if doc in relevant_set and doc not in counted:
            counted.add(doc)
            relevant_found += 1
            score += relevant_found / i
    return score / len(relevant_set) if relevant_set else 0.0



iv. F1-Score@K

In [8]:
# F1-Score@K
def f1_score_at_k(relevant_docs, retrieved_docs, k):
    """F1@K = 2 * (Precision@K * Recall@K) / (Precision@K + Recall@K)"""
    p = precision_at_k(relevant_docs, retrieved_docs, k)
    r = recall_at_k(relevant_docs, retrieved_docs, k)
    return 0.0 if (p + r) == 0 else 2 * (p * r) / (p + r)


v. Mean Average Precision (MAP)

In [9]:
# MAP
def mean_average_precision(all_queries_results, all_relevants, k):
    ap_values = []
    for qid in all_queries_results:
        retrieved = all_queries_results[qid]
        relevant = all_relevants[qid]
        ap_values.append(average_precision_at_k(relevant, retrieved, k))
    return np.mean(ap_values) if ap_values else 0.0


vi. Mean Reciprocal Rank (MRR)

In [10]:
# MRR
def mean_reciprocal_rank(all_queries_results, all_relevants):
    rr = []
    for qid in all_queries_results:
        retrieved = all_queries_results[qid]
        relevant_set = set(all_relevants[qid])
        for rank, doc in enumerate(retrieved, start=1):
            if doc in relevant_set:
                rr.append(1 / rank)
                break
        else:
            rr.append(0.0)
    return np.mean(rr) if rr else 0.0


vii. Normalized Discounted Cumulative Gain (NDCG)

In [11]:
# NDCG@K
import math

def ndcg_at_k(relevant_docs, retrieved_docs, k):
  # Remove duplicates while keeping order
  seen = set()
  unique_retrieved = []
  for doc in retrieved_docs:
    if doc not in seen:
        unique_retrieved.append(doc)
        seen.add(doc)

  retrieved_k = unique_retrieved[:k]
  relevant_set = set(relevant_docs)


  dcg = 0.0
  for i, doc in enumerate(retrieved_k, start=1):
    rel_i = 1 if doc in relevant_set else 0
    dcg += rel_i / math.log2(i + 1)


  ideal_relevant_count = min(len(relevant_set), k)
  idcg = sum(1 / math.log2(i + 1) for i in range(1, ideal_relevant_count + 1))


  return dcg / idcg if idcg > 0 else 0.0


In [12]:
# results
all_queries_results = {}
for i, q in enumerate(test_queries, 1):
    ranked = rank_documents(q, index, N, top_k=10)
    retrieved = [title for title, score in ranked]
    all_queries_results[f"Q{i}"] = retrieved
    #print(f"Q{i} retrieved:")
    #print(retrieved)


all_relevants = {
    "Q1": ["Solid Women Multicolor Track Pants","Solid Women Black Track Pants"],
    "Q2": ["Solid Men Blue Track Pants", "Solid Men Green Track Pants"],
    "Q3": ["Superhero Men Round Neck Multicolor T-Shirt  (Pack of 2)", "Superhero Women Round Neck Multicolor T-Shirt  (Pack of 2)"],
    "Q4": ["Women Solid Formal Shirt", "Women Slim Fit Solid Formal Shirt"],
    "Q5": ["Men Slim Fit Solid Formal Shirt", "Men Slim Fit Printed Formal Shirt"]
}


#Evaluation
k = 5
for qid in all_queries_results:
    retrieved = all_queries_results[qid]
    relevant = all_relevants[qid]

    p = precision_at_k(relevant, retrieved, k)
    r = recall_at_k(relevant, retrieved, k)
    ap = average_precision_at_k(relevant, retrieved, k)
    f1 = f1_score_at_k(relevant, retrieved, k)
    ndcg = ndcg_at_k(relevant, retrieved, k)

    print(f"\n {qid}")
    print(f"Precision@{k}: {p:.3f}")
    print(f"Recall@{k}: {r:.3f}")
    print(f"AP@{k}: {ap:.3f}")
    print(f"F1@{k}: {f1:.3f}")
    print(f"NDCG@{k}: {ndcg:.3f}")

map_score = mean_average_precision(all_queries_results, all_relevants, k)
mrr_score = mean_reciprocal_rank(all_queries_results, all_relevants)

print("\n====================")
print(f"MAP@{k}: {map_score:.3f}")
print(f"MRR: {mrr_score:.3f}")
print("====================")



 Q1
Precision@5: 0.400
Recall@5: 1.000
AP@5: 0.700
F1@5: 0.571
NDCG@5: 1.000

 Q2
Precision@5: 0.400
Recall@5: 1.000
AP@5: 0.417
F1@5: 0.571
NDCG@5: 0.693

 Q3
Precision@5: 0.000
Recall@5: 0.000
AP@5: 0.000
F1@5: 0.000
NDCG@5: 0.000

 Q4
Precision@5: 0.400
Recall@5: 1.000
AP@5: 0.750
F1@5: 0.571
NDCG@5: 0.877

 Q5
Precision@5: 0.400
Recall@5: 1.000
AP@5: 1.000
F1@5: 0.571
NDCG@5: 1.000

MAP@5: 0.573
MRR: 0.667


**2.2 Apply the evaluation metrics you have implemented to the search results and relevance judgments provided in validation_labels.csv for the predefined queries. When reporting evaluation results, provide only numeric values, rounded to three decimal places. Do not include textual explanations or additional statistics in this section.**

  a. Query 1: women full sleeve sweatshirt cotton
   
  b. Query 2: men slim jeans blue

In [13]:
import pandas as pd

val_path = "/content/drive/Shareddrives/UPF_IRWA_project/validation_labels.csv"
validation = pd.read_csv(val_path)


#print("Validation data loaded")
#print(validation.head())

query_texts = {
    1: "women full sleeve sweatshirt cotton",
    2: "men slim jeans blue"
}

queries = validation["query_id"].unique()
results_numeric = {}

for qid in queries:
    query_text = query_texts[qid]
    print(f"\n Evaluating Query ID: {qid} -> {query_text}")

    # Relevant docs
    q_data = validation[validation["query_id"] == qid]
    relevant_docs = q_data[q_data["labels"] == 1]["pid"].tolist()

    ranked_results = rank_documents(query_text, index, N, top_k=k)
    retrieved_docs = [pid for pid, _ in ranked_results]

    p = precision_at_k(relevant_docs, retrieved_docs, k)
    r = recall_at_k(relevant_docs, retrieved_docs, k)
    ap = average_precision_at_k(relevant_docs, retrieved_docs, k)
    f1 = f1_score_at_k(relevant_docs, retrieved_docs, k)
    ndcg = ndcg_at_k(relevant_docs, retrieved_docs, k)

    results_numeric[qid] = {
        "Precision@10": round(p, 3),
        "Recall@10": round(r, 3),
        "AP@10": round(ap, 3),
        "F1@10": round(f1, 3),
        "NDCG@10": round(ndcg, 3)
    }

# MAP and MRR
map_score = mean_average_precision(
    {qid: validation[validation["query_id"] == qid]["pid"].tolist() for qid in queries},
    {qid: validation[(validation["query_id"] == qid) & (validation["labels"] == 1)]["pid"].tolist() for qid in queries},
    k
)

mrr_score = mean_reciprocal_rank(
    {qid: validation[validation["query_id"] == qid]["pid"].tolist() for qid in queries},
    {qid: validation[(validation["query_id"] == qid) & (validation["labels"] == 1)]["pid"].tolist() for qid in queries}
)

print("\n====================")
for qid, vals in results_numeric.items():
    print(f"Query ID: {qid}")
    for metric, value in vals.items():
        print(f"{metric}: {value:.3f}")
print(f"MAP@10: {map_score:.3f}")
print(f"MRR: {mrr_score:.3f}")
print("====================")


 Evaluating Query ID: 1 -> women full sleeve sweatshirt cotton

 Evaluating Query ID: 2 -> men slim jeans blue

Query ID: 1
Precision@10: 0.000
Recall@10: 0.000
AP@10: 0.000
F1@10: 0.000
NDCG@10: 0.000
Query ID: 2
Precision@10: 0.000
Recall@10: 0.000
AP@10: 0.000
F1@10: 0.000
NDCG@10: 0.000
MAP@10: 0.198
MRR: 1.000


In [14]:
#Testing frequency of terms of the queries in 2.2 in different docs
query_text = "women full sleeve sweatshirt cotton"
terms = query_text.lower().split()

for t in terms:
    docs_with_term = index.get(t, [])
    print(f"Term '{t}' is in {len(docs_with_term)} documents")


Term 'women' is in 13438 documents
Term 'full' is in 3547 documents
Term 'sleeve' is in 3 documents
Term 'sweatshirt' is in 1417 documents
Term 'cotton' is in 8315 documents


**2.3 You will act as expert judges by establishing the ground truth for each document and query.**


a. For the test queries you defined in Part 1, Step 2 during indexing, assign a binary relevance label to each document: 1 if the document is relevant to the query, or 0 if it is not.

In [15]:
import copy

dataset_copy = copy.deepcopy(dataset)

for doc in dataset_copy:
    doc['relevance'] = {}
    title_text = doc.get('title_proc', doc['title']).lower()
    for qid, query in enumerate(test_queries, 1):
        query_terms = query.lower().split()
        doc['relevance'][qid] = int(all(term in title_text for term in query_terms))

# Example check
print(dataset_copy[0])


#Validation
# Initialize relevance field for each document
# for doc in dataset:
#     doc['relevance'] = {}  # Will store {query_id: 0 or 1}

# print("Checking relevance and storing labels:\n")

# for qid, query in enumerate(test_queries, 1):
#     query_terms = query.lower().split()
#     relevant_products = []

#     for doc in dataset:
#         title = doc['title_proc'].lower()
#         # AND logic: all query terms must be in title
#         is_relevant = int(all(term in title for term in query_terms))
#         doc['relevance'][qid] = is_relevant

#         if is_relevant:
#             relevant_products.append(doc['title'])

#     # Print for checking (max 5 examples)
#     print(f"Query {qid}: {query}")
#     if relevant_products:
#         for title in relevant_products[:5]:
#             print(f"  - {title}")
#     else:
#         print("  No relevant products found.")
#     print()

{'_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a', 'pid': 'TKPFCZ9EA7H5FYZH', 'title': 'Solid Women Multicolor Track Pants', 'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India', 'brand': 'York', 'category': 'Clothing and Accessories', 'sub_category': 'Bottomwear', 'product_details': [{'Style Code': '1005COMBO2'}, {'Closure': 'Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Multicolor'}], 'seller': 'Shyam Enterprises', 'out_of_stock': False, 'selling_price': '921', 'discount': '69% off', 'actual_price': '2,999', 'average_rating': '3.9', 'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itmd2c76aadce459?pid=TKPFCZ9EA7H5FYZH&lid=LSTTKPFCZ9EA7H5FYZHVYXWP0&marketplace=FLIPKART&srno=b_1_1&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFCZ9EA7H

b. Comment on each of the evaluation metrics, stating how they differ, and which
information gives each of them. Analyze your results.

c. Analyze the current search system and identify its main problems or limitations. For each issue you find, propose possible ways to resolve it. Consider aspects such as retrieval accuracy, ranking quality, handling of different field types, query formulation, and indexing strategies.