# **Project Part 3: Ranking & Filtering**

IRWA-2025-u214575-u214576-u215107-part-3

In [1]:
# Mount Google Drive at /content/drive (Colab-only); the dataset path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Load preprocessed dataset from part 1
import json

# Location of the preprocessed JSON file inside the mounted shared drive.
path = "/content/drive/Shareddrives/UPF_IRWA_project/fashion_products_dataset_preprocessed.json"

with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Sanity check: number of loaded documents and the field names of the first one
# (dataset is used as a list of dicts from here on).
print("Loaded documents:", len(dataset))
print("Example fields:", list(dataset[0].keys()))

Loaded documents: 28080
Example fields: ['_id', 'pid', 'title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller', 'out_of_stock', 'selling_price', 'discount', 'actual_price', 'average_rating', 'url', 'images', 'crawled_at', 'title_proc', 'description_proc']


**1. You’re asked to provide 3 different ways of ranking:**


a. TF-IDF + cosine similarity: Classical scoring, which we have also seen during the practical labs

In [3]:
from collections import defaultdict
import array

def build_terms(text):
    """Lowercase ``text`` and split it on whitespace into a list of terms."""
    lowered = text.lower()
    return lowered.split()

def create_index(dataset):
    """Build a positional inverted index over the preprocessed title and description.

    Args:
        dataset: iterable of dicts providing 'title_proc', 'description_proc'
            and 'title'.

    Returns:
        A tuple ``(index, doc_texts, pid_map)`` where
        - ``index`` maps each term to a list of postings ``[doc_idx, positions]``
          (``positions`` is an ``array('I')`` of token offsets in that document),
        - ``doc_texts`` holds the indexed text of every document, in order,
        - ``pid_map`` maps the document position to its raw title (used for display).
    """
    inverted = defaultdict(list)
    texts = []
    titles_by_doc = {}

    for doc_idx, doc in enumerate(dataset):
        combined = f"{doc['title_proc']} {doc['description_proc']}"
        texts.append(combined)
        # Keep the raw title so results can be shown to the user later.
        titles_by_doc[doc_idx] = doc['title']

        # term -> offsets of that term inside the current document
        positions_by_term = {}
        for offset, term in enumerate(build_terms(combined)):
            positions_by_term.setdefault(term, array.array('I')).append(offset)

        # Merge the per-document postings into the global index.
        for term, offsets in positions_by_term.items():
            inverted[term].append([doc_idx, offsets])

    return inverted, texts, titles_by_doc

# Build the inverted index once; index, doc_texts and pid_map are reused by every ranker below.
index, doc_texts, pid_map = create_index(dataset)
print("Indexed terms:", len(index))


Indexed terms: 6143


In [4]:
from collections import Counter
import math

# Corpus-level statistics shared by the TF-IDF and BM25 rankers.
N = len(doc_texts)

# Tokenise with build_terms (the tokenizer used by create_index) so that the
# per-document term counts use exactly the same terms as the index, df and idf.
# A plain split() would keep any non-lowercase token, which then has no idf entry.
doc_terms = [build_terms(t) for t in doc_texts]
doc_tf = [Counter(t) for t in doc_terms]
# Document lengths in tokens, floored at 1 so empty documents never cause a division by zero.
doc_len = [max(1, len(t)) for t in doc_terms]

# Document frequency: number of postings (one per document) for each term.
df = {term: len(postings) for term, postings in index.items()}
# Smoothed IDF: log((N + 1) / (df + 1)) + 1, always strictly positive.
idf = {term: math.log((N + 1) / (df_t + 1)) + 1 for term, df_t in df.items()}

print("Vocabulary size:", len(df))


Vocabulary size: 6143


In [5]:

def and_filter_docs(query_terms, index):
    """Return the ids of the documents that contain every query term (AND semantics).

    Args:
        query_terms: iterable of terms.
        index: mapping term -> list of ``(doc_id, positions)`` postings.

    Returns:
        A set of document ids; empty when there are no query terms or when
        some term matches no document.
    """
    matching = None
    for term in query_terms:
        postings = index.get(term, [])
        term_docs = {doc_id for doc_id, _positions in postings}
        if matching is None:
            matching = term_docs
        else:
            matching = matching & term_docs
        # Once the intersection is empty no further term can repopulate it.
        if not matching:
            break
    return matching if matching else set()


def tfidf_vector(term_counts):
    """Turn a term -> count mapping into a sparse TF-IDF vector.

    TF is the count divided by the total number of terms; terms without an
    entry in the global ``idf`` table are left out of the vector.
    """
    total_terms = sum(term_counts.values()) or 1
    return {
        term: (count / total_terms) * idf[term]
        for term, count in term_counts.items()
        if term in idf
    }


def cosine_sparse(a, b):
    """Cosine similarity between two sparse vectors stored as term -> weight dicts.

    Returns 0.0 when either vector is empty or has zero norm.
    """
    if not (a and b):
        return 0.0

    def _norm(vec):
        return math.sqrt(sum(w * w for w in vec.values()))

    # Only terms present in both vectors contribute to the dot product.
    dot = sum(weight * b[term] for term, weight in a.items() if term in b)
    norm_a = _norm(a)
    norm_b = _norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# Precompute one sparse TF-IDF vector per document, aligned with doc_tf.
doc_tfidf = [tfidf_vector(term_counts) for term_counts in doc_tf]


def rank_tfidf_cosine(query, top_k=10):
    """Rank documents for ``query`` by TF-IDF cosine similarity.

    Only documents containing every query term are scored. Products sharing
    the same title are collapsed into one entry that keeps the highest score.

    Returns:
        Up to ``top_k`` ``(title, score)`` tuples, best first, with scores
        rounded to 6 decimals; an empty list when no document matches.
    """
    q_terms = query.lower().split()
    candidates = and_filter_docs(q_terms, index)
    if not candidates:
        return []

    query_vec = tfidf_vector(Counter(q_terms))

    # title -> best similarity seen so far (deduplicates repeated product titles)
    best_by_title = {}
    for doc_id in candidates:
        similarity = cosine_sparse(query_vec, doc_tfidf[doc_id])
        if not similarity > 0:
            continue
        title = pid_map[doc_id]
        current = best_by_title.get(title)
        if current is None or current < similarity:
            best_by_title[title] = similarity

    ranked = sorted(best_by_title.items(), key=lambda pair: pair[1], reverse=True)

    # Keep only the top-k entries.
    return [(title, round(similarity, 6)) for title, similarity in ranked[:top_k]]

In [6]:
# The five evaluation queries defined in Part 2 of the project.
queries_part2 = [
    "women track pant",
    "men track pant",
    "men pack",
    "women formal shirt",
    "men slim fit formal shirt",
]

# Print the TF-IDF + cosine top-10 for every query.
for q in queries_part2:
    print(f"\n Query: {q}")
    results = rank_tfidf_cosine(q, top_k=10)
    for title, score in results:
        # Separator restored to a real em dash (it was mis-encoded in the source).
        print(f"{score} — {title}")



 Query: women track pant
0.939464 â€” Solid Women Multicolor Track Pants
0.84696 â€” Solid Women Blue Track Pants
0.844354 â€” Printed Women Blue Track Pants
0.835611 â€” Solid Women Black Track Pants
0.833108 â€” Printed Women Black Track Pants
0.810565 â€” Solid Women White Track Pants
0.805634 â€” Printed Women Multicolor Track Pants
0.798308 â€” Solid Women Grey Track Pants
0.796124 â€” Printed Women Grey Track Pants
0.775207 â€” Solid Women Green Track Pants

 Query: men track pant
0.93962 â€” Solid Men Multicolor Track Pants
0.8473 â€” Solid Men Blue Track Pants
0.844697 â€” Printed Men Blue Track Pants
0.835969 â€” Solid Men Black Track Pants
0.833469 â€” Printed Men Black Track Pants
0.810959 â€” Solid Men White Track Pants
0.806035 â€” Printed Men Multicolor Track Pants
0.798719 â€” Solid Men Grey Track Pants
0.796538 â€” Printed Men Grey Track Pants
0.775646 â€” Solid Men Green Track Pants

 Query: men pack
0.471682 â€” Solid Men Round Neck Black T-ShirtÂ Â (Pack of 2)
0.470

b. BM25

In [7]:
#Parameters BM25
# k1 and b are read by bm25_score_doc: k1 scales the length-normalisation term that is
# added to tf in the denominator (and appears as k1 + 1 in the numerator), while b
# weights dl / avgdl against the constant part 1 - b.
k1 = 1.5
b = 0.75
# Average document length in tokens (doc_len entries are floored at 1).
avgdl = sum(doc_len) / len(doc_len)

def bm25_score_doc(query_terms, doc_id):
    """Compute the BM25 score of one document for the given query terms.

    Uses the module-level statistics ``doc_len``, ``doc_tf``, ``df``, ``N`` and
    ``avgdl`` together with the parameters ``k1`` and ``b``. Query terms that
    are missing from ``df`` contribute nothing.
    """
    doc_length = doc_len[doc_id]
    term_counts = doc_tf[doc_id]

    total = 0.0
    for term in query_terms:
        doc_freq = df.get(term, 0)
        if doc_freq == 0:
            continue

        idf_t = math.log(1 + (N - doc_freq + 0.5) / (doc_freq + 0.5))
        tf = term_counts.get(term, 0)

        # Saturating, length-normalised term-frequency component.
        denom = tf + k1 * (1 - b + b * doc_length / avgdl)
        if denom > 0:
            total += idf_t * (tf * (k1 + 1)) / denom

    return total


def rank_bm25(query, top_k=10):
    """Return the ``top_k`` documents ranked by BM25 score for ``query``.

    Only documents containing every query term are scored. Repeated product
    titles are collapsed, keeping the highest score.

    Returns:
        Up to ``top_k`` ``(title, score)`` tuples, best first, with scores
        rounded to 6 decimals; an empty list when no document matches.
    """
    q_terms = query.lower().split()
    candidates = and_filter_docs(q_terms, index)
    if not candidates:
        return []

    # title -> best BM25 score seen so far
    best_by_title = {}
    for doc_id in candidates:
        bm25 = bm25_score_doc(q_terms, doc_id)
        if not bm25 > 0:
            continue
        title = pid_map[doc_id]
        current = best_by_title.get(title)
        if current is None or current < bm25:
            best_by_title[title] = bm25

    # Highest score first, then cut to top-k.
    ranked = sorted(best_by_title.items(), key=lambda pair: pair[1], reverse=True)
    return [(title, round(bm25, 6)) for title, bm25 in ranked[:top_k]]



In [8]:
# Print the BM25 top-10 for every Part 2 query.
for q in queries_part2:
    print(f"\n Query: {q}")
    results = rank_bm25(q, top_k=10)
    for title, score in results:
        # Separator restored to a real em dash (it was mis-encoded in the source).
        print(f"{score} — {title}")



 Query: women track pant
11.230398 â€” Solid Women Multicolor Track Pants
10.758672 â€” Solid Women Black Track Pants
10.726826 â€” Striped Women Dark Blue Track Pants
10.726826 â€” Color Block Women Black Track Pants
10.538623 â€” Solid Women Grey Track Pants
10.378794 â€” Solid Women Blue Track Pants
10.266344 â€” Solid Women Black, Grey Track Pants
10.069754 â€” Solid Women Dark Blue Track Pants
9.879625 â€” Solid Women Red Track Pants
9.763808 â€” Striped Women Grey Track Pants

 Query: men track pant
11.275604 â€” Solid Men Multicolor Track Pants
10.828808 â€” Striped Men Maroon Track Pants
10.746536 â€” Striped Men Green Track Pants
10.581044 â€” Solid Men Dark Blue Track Pants
10.581044 â€” Striped Men Grey Track Pants
10.383045 â€” Solid Men Grey Track Pants
10.298776 â€” Solid Men Black, Grey Track Pants
10.298776 â€” Solid Men Blue, Black Track Pants
10.158484 â€” Solid Men Maroon Track Pants
10.158484 â€” Solid Men Black Track Pants

 Query: men pack
4.37115 â€” Men BriefÂ 

c. Your Score: Here, the task is to create a new score. (Be creative 🎨, think about what factors could make a document more relevant to a query and include them in your formula.)

In [9]:
import numpy as np


def minmax(values):
    """Return the ``(min, max)`` of the int/float entries of ``values``.

    Entries of any other type are ignored. Falls back to ``(0, 1)`` when there
    is no numeric entry, and widens the range by 1 when min and max coincide
    so that the range is never empty.
    """
    numeric = list(filter(lambda v: isinstance(v, (int, float)), values))
    if len(numeric) == 0:
        return (0, 1)
    low = min(numeric)
    high = max(numeric)
    if low == high:
        high += 1
    return low, high

# Normalize numerical fields of Dataset
# Each (mn_*, mx_*) pair is the value range later handed to normalize() by hybrid_score.
# minmax only keeps int/float entries, so selling_price values of any other type are ignored here.
mn_price, mx_price = minmax([d.get("selling_price") for d in dataset])
# Discounts are converted to integers after stripping "%" and "off"; an empty remainder counts as 0.
mn_disc,  mx_disc  = minmax([int(str(d.get("discount", "0")).replace("%", "").replace("off", "").strip() or 0)
                             for d in dataset])
# Missing or falsy ratings count as 0 when computing the rating range.
mn_rate,  mx_rate  = minmax([float(d.get("average_rating") or 0) for d in dataset])

def normalize(v, mn, mx):
    """Min-max scale ``v`` using the range ``[mn, mx]``.

    ``v`` may be a number or a string such as ``"50% off"``; the substrings
    ``"%"`` and ``"off"`` are removed before parsing.

    Args:
        v: raw value (number, string or None).
        mn: lower bound of the range.
        mx: upper bound of the range.

    Returns:
        ``(v - mn) / (mx - mn)`` as a float. ``None`` yields 0.0, values that
        cannot be parsed (or are NaN) are treated as 0 before scaling, and an
        empty range (``mx <= mn``) yields 0.0.
    """
    if v is None:
        return 0.0
    try:
        v = float(str(v).replace("%", "").replace("off", "").strip())
    except (TypeError, ValueError):
        # Only parsing failures are tolerated; a bare ``except`` would also
        # swallow KeyboardInterrupt / SystemExit.
        v = 0.0
    if math.isnan(v):
        # A NaN would propagate into the final score and break sorting.
        v = 0.0
    return (v - mn) / (mx - mn) if mx > mn else 0.0


In [10]:
# Weights of the hybrid score computed in hybrid_score; the four values sum to 1.0.
ALPHA = 0.70  # weight of the TF-IDF cosine similarity
BETA  = 0.15  # weight of the normalized average rating
GAMMA = 0.10  # weight of the normalized discount
DELTA = 0.05  # weight of the inverted normalized price (cheaper items score higher);
              # the out-of-stock penalty is a separate fixed term in hybrid_score, not scaled by DELTA

def hybrid_score(query, top_k=10):
    """Rank documents with a custom score mixing text relevance and product metadata.

    score = ALPHA * tfidf_cosine + BETA * rating + GAMMA * discount
            + DELTA * (1 - price) - 0.2 if the product is out of stock

    where rating, discount and price are min-max normalized. The text
    relevance comes from the top 1000 titles returned by ``rank_tfidf_cosine``.

    Note:
        ``rank_tfidf_cosine`` returns titles only, so the metadata is read from
        the first dataset entry carrying that title. When several products
        share a title, that entry is not necessarily the one that produced the
        best TF-IDF score.

    Args:
        query: free-text query.
        top_k: number of results to return.

    Returns:
        Up to ``top_k`` ``(title, score)`` tuples, best first, with scores
        rounded to 6 decimals; an empty list when nothing matches.
    """
    base_results = rank_tfidf_cosine(query, top_k=1000)
    if not base_results:
        return []

    # Build the title -> first document position lookup once per call instead
    # of scanning the whole dataset for each of the (up to 1000) results.
    first_doc_by_title = {}
    for i, d in enumerate(dataset):
        first_doc_by_title.setdefault(d['title'], i)

    enriched = []
    for title, tfidf_score in base_results:
        # Locate the original document for this title.
        doc_id = first_doc_by_title.get(title)
        if doc_id is None:
            continue
        doc = dataset[doc_id]

        rating   = normalize(doc.get("average_rating"), mn_rate, mx_rate)
        discount = normalize(doc.get("discount"), mn_disc, mx_disc)
        # Inverted so that cheaper products get a higher contribution.
        price    = 1 - normalize(doc.get("selling_price"), mn_price, mx_price)
        # Fixed demotion for products that cannot currently be bought.
        stock_penalty = 0.0 if not doc.get("out_of_stock") else -0.2

        score = (
            ALPHA * tfidf_score +
            BETA * rating +
            GAMMA * discount +
            DELTA * price +
            stock_penalty
        )
        enriched.append((doc_id, score))

    enriched.sort(key=lambda x: x[1], reverse=True)
    return [(dataset[d]['title'], round(s, 6)) for d, s in enriched[:top_k]]


In [11]:
# Print the hybrid-score top-10 for every Part 2 query.
for q in queries_part2:
    print(f"\n Query: {q}")
    results = hybrid_score(q, top_k=10)
    for title, score in results:
        # Separator restored to a real em dash (it was mis-encoded in the source).
        print(f"{score} — {title}")



 Query: women track pant
0.758867 â€” Solid Women White Track Pants
0.745092 â€” Solid Women Red Track Pants
0.708202 â€” Color Block Women Blue Track Pants
0.708005 â€” Color Block Women Black Track Pants
0.70791 â€” Self Design Women Black Track Pants
0.702829 â€” Solid Women Green Track Pants
0.695404 â€” Printed Women Maroon Track Pants
0.69328 â€” Self Design Women Multicolor Track Pants
0.679811 â€” Graphic Print Women Blue Track Pants
0.674222 â€” Self Design Women Grey Track Pants

 Query: men track pant
0.757911 â€” Printed Men Black Track Pants
0.743987 â€” Solid Men Green Track Pants
0.721664 â€” Striped Men Black Track Pants
0.714827 â€” Solid Men Orange Track Pants
0.693645 â€” Self Design Men Multicolor Track Pants
0.692994 â€” Printed Men White Track Pants
0.688776 â€” Self Design Men Blue Track Pants
0.677578 â€” Color Block Men Green Track Pants
0.675739 â€” Self Design Men Grey Track Pants
0.652746 â€” Color Block Men Dark Blue Track Pants

 Query: men pack
0.536034 

**2. Implement word2vec + cosine ranking score. Return a top-20 list of documents for each of the 5 queries defined in Part 2 of your project, using search and word2vec + cosine similarity ranking.
To represent a piece of text using word2vec, we create a single vector that represents the entire text. This vector has the same number of dimensions as the word vectors and is calculated by averaging the vectors of all words in the text.**


In [12]:
!pip install gensim



In [13]:
from gensim.models import Word2Vec


# Training corpus: one token list per product (description tokens followed by title tokens).
corpus_tokens = [doc["description_proc"].split() + doc["title_proc"].split() for doc in dataset]

# 100-dimensional vectors with a context window of 5; min_count=2 drops tokens that occur
# fewer than 2 times, and sg=1 selects skip-gram training (see the gensim Word2Vec docs).
# NOTE(review): with workers=4 gensim training is generally not bit-for-bit reproducible
# between runs, so the similarity scores below may shift slightly — confirm whether that matters.
w2v_model = Word2Vec(
    sentences=corpus_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,
    workers=4
)

# Keyed vectors: the token -> vector lookup used by the functions below.
wv = w2v_model.wv
print(" Word2Vec model trained with", len(wv), "unique tokens.")


âœ… Word2Vec model trained with 4600 unique tokens.


In [14]:


def sent_vector_avg(tokens, wv):
    """Average the vectors of all tokens of the text that are present in ``wv``.

    Returns ``None`` when no token has a vector.
    """
    known = []
    for token in tokens:
        if token in wv:
            known.append(wv[token])
    if len(known) == 0:
        return None
    return np.mean(known, axis=0)

# Precompute document vectors
# doc_vecs[i] is the averaged Word2Vec vector of dataset[i] (title + description tokens),
# or None when none of its tokens is in the model vocabulary.
doc_vecs = []
for doc in dataset:
    tokens = (doc["title_proc"] + " " + doc["description_proc"]).split()
    v = sent_vector_avg(tokens, wv)
    doc_vecs.append(v)
print(" Document vectors computed.")


âœ… Document vectors computed.


In [15]:
def cosine_np(a, b):
    """Cosine similarity of two numpy vectors.

    Returns 0.0 when either vector is ``None`` or has zero norm.
    """
    if a is None or b is None:
        return 0.0
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))

def rank_w2v_cosine(query, top_k=20):
    """Rank all documents by cosine similarity between averaged Word2Vec vectors.

    Duplicate product titles are collapsed, keeping the highest similarity.

    Returns:
        Up to ``top_k`` ``(title, score)`` tuples, best first, with scores
        rounded to 6 decimals; an empty list when no query token has a vector.
    """
    query_vec = sent_vector_avg(query.lower().split(), wv)
    if query_vec is None:
        return []

    # title -> best similarity seen so far
    best_by_title = {}
    for doc_idx, doc_vec in enumerate(doc_vecs):
        similarity = cosine_np(query_vec, doc_vec)
        if not similarity > 0:
            continue
        title = dataset[doc_idx]["title"]
        current = best_by_title.get(title)
        if current is None or current < similarity:
            best_by_title[title] = similarity

    # Most similar first, then cut to top-k.
    ranked = sorted(best_by_title.items(), key=lambda pair: pair[1], reverse=True)
    return [(title, round(similarity, 6)) for title, similarity in ranked[:top_k]]


In [17]:
# Print the Word2Vec + cosine top-20 for every Part 2 query.
for q in queries_part2:
    print(f"\n Query: {q}")
    results = rank_w2v_cosine(q, top_k=20)
    for title, score in results:  # shows every returned result (top-20) for each query
        # Separator restored to a real em dash (it was mis-encoded in the source).
        print(f"{score} — {title}")



 Query: women track pant
0.958418 â€” Solid Women Multicolor Track Pants
0.956445 â€” Solid Men Multicolor Track Pants
0.944847 â€” Solid Women Black Track Pants
0.943185 â€” Solid Women Olive Track Pants
0.939513 â€” Camouflage Women Blue Track Pants
0.935649 â€” Applique Men Black Track Pants
0.929412 â€” Solid Women Grey Track Pants
0.926964 â€” Solid Women White Track Pants
0.92661 â€” Solid Men Grey Track Pants
0.924791 â€” Solid Men White Track Pants
0.924061 â€” Solid Men Black Track Pants
0.92233 â€” Striped Women Grey Track Pants
0.921872 â€” Solid Women Blue Track Pants
0.921373 â€” Printed Women Grey Track Pants
0.921184 â€” Solid Women Brown Track Pants
0.92033 â€” Printed Men Grey Track Pants
0.919112 â€” Striped Women Black Track Pants
0.918927 â€” Printed Women Black Track Pants
0.918743 â€” Checkered Women Olive Track Pants
0.918722 â€” Striped Men Grey Track Pants

 Query: men track pant
0.958141 â€” Solid Men Multicolor Track Pants
0.956712 â€” Solid Women Multicolor

**3. Can you imagine a better representation than word2vec? Justify your answer. (HINT - what about Doc2vec? Sentence2vec? What are the pros and cons?)**