RANKINGS 

In [37]:
"""
Upload the data and import the libraries:

"""

import json
import math
import os
import re
import collections
from collections import defaultdict
from array import array
import numpy as np
import pandas as pd
import sys

import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go up from part_3
sys.path.append(project_root)

from part_2.indexing_evaluation import load_processed_docs, create_index_tfidf, search_tf_idf
from part_1.data_processing import build_terms


In [38]:
def searching(query, index):
    """
    Find the documents that contain every term of the query (AND semantics).

    The query is turned into terms with build_terms, and the posting lists
    of those terms are intersected on the document id stored at position 0
    of each posting.

    Returns:
        [] when the query yields no terms, when any term is missing from
        the index, or when no document contains all the terms.
        Otherwise the tuple (query_terms, docs), where docs is a list of the
        matching document ids in no particular order.
        Callers have to cope with both shapes.
    """
    query_terms = build_terms(query)

    # an empty query, or a first term the index has never seen, cannot match
    if not query_terms or query_terms[0] not in index:
        return []

    # seed the candidates with the documents of the first term ...
    candidates = set(entry[0] for entry in index[query_terms[0]])

    # ... then narrow them down with each remaining term
    for term in query_terms[1:]:
        if term not in index:
            # an unknown term empties the intersection
            return []
        candidates &= set(entry[0] for entry in index[term])

    if candidates:
        return query_terms, list(candidates)
    return []

- TF-IDF + Cosine Similarity

In [39]:
#this function was already implemented in part2:

def ranking_docs(terms,docs,index,idf,tf,title_index):
    """
    Order candidate documents by a TF-IDF score against the query terms.

    The query gets one weight per position in `terms`:
    (count of the term in the query / norm of the query counts) * idf.
    A document gets tf * idf at every position whose term lists that
    document in its postings. The score is the dot product of the two
    vectors (neither vector is re-normalised here).

    Args:
        terms: query terms, in query order.
        docs: candidate document ids.
        index: term -> list of (pid, posting) pairs.
        idf: term -> inverse document frequency (missing terms count as 0.0).
        tf: term -> {pid: term-frequency weight}.
        title_index: accepted for signature compatibility; not used here.

    Returns:
        Document ids sorted by decreasing score, ties broken by decreasing
        id. Candidates that appear in no posting list of the indexed query
        terms are left out.
    """
    dims = len(terms)

    # for each indexed query term, the ids of the documents in its postings
    holders = {}
    for term in terms:
        if term in index:
            holders[term] = {pid for pid, _ in index[term]}

    # query-side term frequencies and their Euclidean norm
    term_counts = collections.Counter(terms)
    counts_norm = np.linalg.norm(list(term_counts.values()))

    query_vector = [0] * dims
    doc_vectors = defaultdict(lambda: [0] * dims)

    for pos, term in enumerate(terms):
        # terms unknown to the index keep a zero weight on both sides
        if term not in index:
            continue

        term_idf = idf.get(term, 0.0)

        if counts_norm > 0:
            query_tf = term_counts[term] / counts_norm
        else:
            query_tf = 0
        query_vector[pos] = query_tf * term_idf

        # fill this position for every candidate that contains the term
        for pid in docs:
            if pid in holders[term]:
                doc_vectors[pid][pos] = tf[term].get(pid, 0.0) * term_idf

    # highest dot product first
    ranked = sorted(
        ([np.dot(vec, query_vector), pid] for pid, vec in doc_vectors.items()),
        reverse=True,
    )
    return [pid for _, pid in ranked]

In [40]:
def search_tf_idf(query, index, tf, idf, title_index):
    """
    Execute a search query using AND logic.

    Only documents containing all query terms are considered.
    Results are ranked with ranking_docs (TF-IDF weighted query/document
    vectors).

    Returns:
        The ranked list of document ids, or [] when nothing matches.
    """
    result = searching(query, index)

    # searching() returns a bare [] (not a pair) when nothing matches, so it
    # has to be checked before it is unpacked
    if not result:
        return []

    query_terms, docs = result
    return ranking_docs(query_terms, docs, index, idf, tf, title_index)

- BM25

In [41]:
def BM25(terms,docs,index,df,title_index, k=1.5, b=0.75):
    """
    Rank candidate documents with a BM25-style score.

    For every query term, each candidate receives
    idf * (tf * (k + 1)) / (tf + k * ((1 - b) + b * len / avg_len)),
    with idf = log((N + 0.5) / (df + 0.5)), and the per-term scores are
    summed.

    Args:
        terms: query terms.
        docs: candidate document ids.
        index: term -> list of (pid, count) pairs. Document length is the
            sum of a document's counts over the whole index.
        df: term -> document frequency (missing terms count as 0).
        title_index: only its size is used, as the collection size N.
        k: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        Document ids sorted by decreasing score. Documents with equal
        scores keep the order they have in `docs`.
    """
    N = len(title_index)  # total number of documents

    # length of every document = sum of its term counts across the index
    doc_length = {}
    for postings in index.values():
        for pid, count in postings:
            doc_length[pid] = doc_length.get(pid, 0) + count

    avg_dl = sum(doc_length.values()) / len(doc_length) if doc_length else 0

    scores = defaultdict(float)

    for term in terms:
        idf = math.log((N + 0.5) / (df.get(term, 0) + 0.5))
        term_freqs = dict(index.get(term, []))

        for pid in docs:
            freq = term_freqs.get(pid, 0)
            ld = doc_length.get(pid, avg_dl)

            # an empty index (or one holding only zero counts) has no usable
            # average length: treat the document as average-length instead
            # of dividing by zero
            length_ratio = ld / avg_dl if avg_dl else 1.0

            denominator = freq + k * ((1 - b) + b * length_ratio)
            if denominator:
                scores[pid] += idf * ((freq * (k + 1)) / denominator)
            else:
                # only reachable when the term is absent and the k term vanishes
                scores[pid] += 0.0

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [doc for doc, _ in ranked_docs]


In [42]:
def search_bm25(query, index, df, title_index, k=1.5, b= 0.75):
    """
    Execute a search query using AND logic.

    Only documents containing all query terms are considered.
    Results are ranked with BM25, using the given k and b.

    Returns:
        The ranked list of document ids, or [] when nothing matches.
    """
    result = searching(query, index)

    # searching() returns a bare [] (not a pair) when nothing matches, so it
    # has to be checked before it is unpacked
    if not result:
        return []

    query_terms, docs = result

    # forward the caller's parameters instead of re-hard-coding the defaults
    return BM25(query_terms, docs, index, df, title_index, k=k, b=b)

- Our Score:

In [43]:
# pid -> price lookup read by get_price; starts empty and is filled elsewhere
price_table = {}


def get_price(pid):
    """Return the price stored for `pid` in price_table, or 0 when it has none."""
    if pid in price_table:
        return price_table[pid]
    return 0

In [44]:
def our_score(terms, docs, index, df, title_index, k=1.5, b=0.75):
    """
    Rank candidate documents with a BM25-style score multiplied by a price boost.

    Each per-term BM25 contribution
    idf * (tf * (k + 1)) / (tf + k * ((1 - b) + b * len / avg_len)),
    with idf = log((N + 0.5) / (df + 0.5)), is multiplied by the document's
    boost: 1 + (1 + log10(price + 1)) when get_price(pid) > 0, else 1.0.

    Args:
        terms: query terms.
        docs: candidate document ids.
        index: term -> list of (pid, count) pairs. Document length is the
            sum of a document's counts over the whole index.
        df: term -> document frequency (missing terms count as 0).
        title_index: only its size is used, as the collection size N.
        k: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        Document ids sorted by decreasing score. Documents with equal
        scores keep the order they have in `docs`.
    """
    N = len(title_index)  # total number of documents

    # length of every document = sum of its term counts across the index
    doc_length = {}
    for postings in index.values():
        for pid, count in postings:
            doc_length[pid] = doc_length.get(pid, 0) + count

    avg_dl = sum(doc_length.values()) / len(doc_length) if doc_length else 0

    # the boost depends only on the document, so work it out once per
    # candidate rather than once per (term, candidate) pair
    # NOTE(review): assumes get_price returns a number - confirm how prices are stored
    price_boost = {}
    for pid in docs:
        price = get_price(pid)
        if price > 0:
            price_boost[pid] = 1 + (1 + math.log10(price + 1))
        else:
            price_boost[pid] = 1.0

    scores = defaultdict(float)

    for term in terms:
        idf = math.log((N + 0.5) / (df.get(term, 0) + 0.5))
        term_freqs = dict(index.get(term, []))

        for pid in docs:
            freq = term_freqs.get(pid, 0)
            ld = doc_length.get(pid, avg_dl)

            # an empty index (or one holding only zero counts) has no usable
            # average length: treat the document as average-length instead
            # of dividing by zero
            length_ratio = ld / avg_dl if avg_dl else 1.0

            denominator = freq + k * ((1 - b) + b * length_ratio)
            if denominator:
                bm_score = idf * ((freq * (k + 1)) / denominator)
            else:
                # only reachable when the term is absent and the k term vanishes
                bm_score = 0.0

            scores[pid] += bm_score * price_boost[pid]

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [doc for doc, _ in ranked_docs]


In [45]:
def search_our_score(query,index,df,title_index,k=1.5, b=0.75):
    """
    Execute a search query using AND logic and rank the matches with our_score.

    Only documents containing all query terms are considered.

    Returns:
        The ranked list of document ids, or [] when nothing matches.
    """
    result = searching(query, index)

    # searching() returns a bare [] (not a pair) when nothing matches, so it
    # has to be checked before it is unpacked
    if not result:
        return []

    query_terms, docs = result

    # forward the caller's parameters instead of re-hard-coding the defaults
    return our_score(query_terms, docs, index, df, title_index, k=k, b=b)

- Word2vec + cosine

In [46]:
# explicit magic: installs into the running kernel's environment and does not
# depend on IPython's automagic setting
%pip install gensim


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
from gensim.models import Word2Vec
def train_word2vec(docs, fields=("title_clean", "description_clean"),
                   vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """
    Train a Word2Vec model on the whitespace-tokenised text of the documents.

    Args:
        docs: iterable of dicts holding a string under each name in `fields`.
        fields: document fields concatenated (space-separated) to form one
            training sentence per document.
            NOTE(review): compute_vectors also reads "product_details_clean";
            words that occur only there never enter the vocabulary with the
            default fields - confirm whether that is intended.
        vector_size, window, min_count, workers: forwarded to Word2Vec.
        seed: forwarded to Word2Vec; 1 is the library default, so default
            behaviour is unchanged. Training with several workers is still
            not guaranteed to be reproducible run to run.

    Returns:
        The trained Word2Vec model.
    """
    # one token list per document
    corpus = [" ".join(doc[field] for field in fields).split() for doc in docs]

    return Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

In [48]:
def compute_vectors(docs,model):
    """
    Build one embedding per document by averaging its word vectors.

    The title, description and product-details text of each document are
    joined and split on whitespace. Tokens missing from model.wv are
    skipped. A document with no known token gets a zero vector of
    length model.vector_size.

    Returns:
        dict mapping each document's "pid" to its vector.
    """
    text_fields = ("title_clean", "description_clean", "product_details_clean")
    embeddings = {}

    for doc in docs:
        pid = doc["pid"]
        tokens = " ".join(doc[field] for field in text_fields).split()

        # keep only the tokens the model has a vector for
        known = [model.wv[token] for token in tokens if token in model.wv]

        if known:
            embeddings[pid] = np.mean(known, axis=0)
        else:
            embeddings[pid] = np.zeros(model.vector_size)

    return embeddings

In [49]:
def embed_query(query,model):
    """
    Embed a query as the mean of the vectors of its known words.

    The query is split on whitespace as-is. Words missing from model.wv are
    ignored. When no word is known, a zero vector of length
    model.vector_size is returned.
    """
    known = []
    for word in query.split():
        if word in model.wv:
            known.append(model.wv[word])

    if known:
        return np.mean(known, axis=0)
    return np.zeros(model.vector_size)

In [50]:
def cosine(a,b):
    """Cosine similarity of two vectors; 0.0 when either has zero norm."""
    norm_a = np.linalg.norm(a)
    if norm_a == 0:
        return 0.0

    norm_b = np.linalg.norm(b)
    if norm_b == 0:
        return 0.0

    return np.dot(a, b) / (norm_a * norm_b)

In [53]:
def search_word2vec(query,index,model,doc_vectors, top_k=20):
    """
    Execute a search query using AND logic and rank the matches by the
    cosine similarity between the query embedding and each document embedding.

    Args:
        query: raw query string.
        index: inverted index handed to searching().
        model: word-vector model handed to embed_query().
        doc_vectors: pid -> document embedding (must contain every match).
        top_k: maximum number of results returned (20 by default).

    Returns:
        Up to top_k document ids, most similar first, or [] when nothing
        matches.
    """
    result = searching(query, index)
    if not result:
        return []

    terms, docs = result

    # Embed the terms searching() produced rather than the raw query string:
    # they are the forms that matched the index, whereas raw query words may
    # not be in the model's vocabulary and would be dropped silently.
    # NOTE(review): presumes the *_clean training text is normalised the same
    # way as build_terms output - confirm.
    query_vec = embed_query(" ".join(terms), model)

    scores = [(cosine(query_vec, doc_vectors[pid]), pid) for pid in docs]
    scores.sort(reverse=True)

    return [pid for _, pid in scores[:top_k]]

Main

In [55]:
def _show_results(header, pids, title_index, limit=None, header_first=False):
    """
    Print one ranker's results, then the blank separator.

    header: line announcing the ranker and the query.
    pids: ranked document ids (may be empty).
    limit: how many ids to print; None prints them all.
    header_first: print the header even when there are no results.
    """
    if header_first:
        print(header)

    if not pids:
        print("No matching documents.")
    else:
        if not header_first:
            print(header)
        for pid in pids[:limit]:
            print(f"{pid}: {title_index.get(pid,'[No title]')}")

    print("\n")


if __name__ == "__main__":

    #load preprocessed docs
    docs=load_processed_docs()

    # Plain module-level assignment: this rebinds the price_table that
    # get_price reads. No `global` statement is needed at module level, and
    # one would be a SyntaxError if the notebook were flattened into a single
    # script, because the name is already assigned earlier in the module.
    price_table={doc["pid"]:doc.get("selling_price",0) for doc in docs}

    #build inverted index and compute TF-IDF values
    index,tf,df,idf,title_index=create_index_tfidf(docs)

    #train the embedding model and embed every document
    model=train_word2vec(docs)
    doc_vectors=compute_vectors(docs,model)

    #test queries
    queries=[
        "full sleeve black shirt",
        "solid women white polo",
        "print of multicolor neck grey shirt",
        "slim fit men blue jeans",
        "round collar full sleeves t-shirt"
    ]

    #run every ranker on every query and display the top results
    for q in queries:
        result=search_tf_idf(q,index,tf,idf,title_index)
        _show_results(f"Top results from tf-idf for query: '{q}'", result, title_index, limit=10)

        result_bm25=search_bm25(q,index,df,title_index)
        _show_results(f"Top results from bm25 for query: '{q}'", result_bm25, title_index, limit=10)

        result_ours=search_our_score(q,index,df,title_index)
        _show_results(f"Top results from our score for query: '{q}'", result_ours, title_index, limit=10)

        # the word2vec block has always printed its header first and listed
        # everything search_word2vec returns; that output is kept as it was
        results_vector=search_word2vec(q,index,model,doc_vectors)
        _show_results(f"Top results from word2vec for query: '{q}'", results_vector, title_index, header_first=True)



Top results from tf-idf for query: 'full sleeve black shirt'
TSHFUTG2TZSYDWTP: print women round neck black shirt
SHTFYPATCHTAN2ZT: men slim fit solid casual shirt
SHTFZP66HZNRS2XR: men regular fit solid mandarin collar casual shirt
TSHF94NKFVSHGVWQ: solid men hood black shirt
TSHFYN37DWMWHPKQ: color block women polo neck multicolor shirt
TSHFHFTQSYKZKFBU: solid men hood neck black shirt
TSHFJVVEDZKWSEHG: solid women round neck black shirt
TSHEM8ZK5BNQMX6V: solid women polo neck black shirt
TSHFVX7ZFNFEMHUG: ombr women round neck black shirt
SHTFZ4B5QHFGRHMG: women slim fit solid casual shirt


Top results from bm25 for query: 'full sleeve black shirt'
TSHF4HFV7DEHTQAH: solid women hood neck black shirt
TSHF94NKFVSHGVWQ: solid men hood black shirt
SHTFYPATCHTAN2ZT: men slim fit solid casual shirt
TSHEM8ZK5BNQMX6V: solid women polo neck black shirt
TSHFZ3QZXJZF9CGP: color block men round neck dark green black shirt
TSHFHFTQSYKZKFBU: solid men hood neck black shirt
TSHEM8ZVNGGUC7JY: soli