RANKINGS 

In [14]:
"""
Upload the data and import the libraries:

"""

import json
import math
import os
import re
import collections
from collections import defaultdict
from array import array
import numpy as np
import pandas as pd
import sys

import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go up from part_3
sys.path.append(project_root)

from part_2.indexing_evaluation import load_processed_docs, create_index_tfidf, search_tf_idf


- TF-IDF + Cosine Similarity

In [15]:
#this function was already implemented in part2:

def ranking_docs(terms,docs,index,idf,tf,title_index):
    """
    Order candidate documents by the dot product between their TF-IDF
    vector and the TF-IDF vector of the query.

    Arguments:
    terms -- list of query terms; every position is one vector dimension
    docs -- iterable of candidate document ids (pids)
    index -- maps a term to an iterable of (pid, posting) pairs
    idf -- maps a term to its inverse document frequency
    tf -- maps a term to a dict of pid -> term-frequency weight
    title_index -- not used by this function

    Returns:
    list of pids sorted by decreasing score (ties: decreasing pid). A
    candidate that holds none of the indexed query terms is left out.
    """
    width=len(terms)

    #for each indexed query term, the pids found in its posting list
    holders={}
    for term in terms:
        if term in index:
            bucket=holders.setdefault(term,set())
            for pid,_ in index[term]:
                bucket.add(pid)

    #raw query counts and their euclidean norm (used to scale the query tf)
    counts=collections.Counter(terms)
    norm=np.linalg.norm(list(counts.values()))

    q_vec=[0]*width
    d_vecs={}

    for pos,term in enumerate(terms):
        #terms unknown to the index keep a zero weight everywhere
        if term not in index:
            continue

        if norm>0:
            q_tf=counts[term]/norm
        else:
            q_tf=0
        q_vec[pos]=q_tf*idf.get(term,0.0)

        #fill this dimension for every candidate that holds the term
        with_term=holders[term]
        for pid in docs:
            if pid in with_term:
                weight=tf[term].get(pid,0.0)*idf.get(term,0.0)
                d_vecs.setdefault(pid,[0]*width)[pos]=weight

    #score = dot product of document vector and query vector
    ranked=sorted(
        ([np.dot(vec,q_vec),pid] for pid,vec in d_vecs.items()),
        reverse=True,
    )
    return [pair[1] for pair in ranked]

In [16]:
def search_tf_idf(query, index, tf, idf, title_index):
    """
    Run a conjunctive (AND) search for the query.

    Only documents that appear in the posting list of every query term
    are kept; the survivors are ordered by ranking_docs.

    Note: this definition replaces the search_tf_idf name imported from
    part_2.indexing_evaluation at the top of the notebook.

    Arguments:
    query -- raw query string, tokenised with build_terms
    index -- maps a term to an iterable of postings whose first item is the pid
    tf, idf, title_index -- forwarded unchanged to ranking_docs

    Returns:
    ranked list of pids, or [] when the query is empty, a term is not
    indexed, or no document contains all the terms.
    """
    terms=build_terms(query)

    #nothing to search for, or the first term is not indexed
    if not terms or terms[0] not in index:
        return []

    #candidates start as the documents holding the first term
    candidates={entry[0] for entry in index[terms[0]]}

    #shrink the candidates with every remaining term
    for term in terms[1:]:
        if term not in index:
            #an unindexed term means no document can match all terms
            return []
        candidates &= {entry[0] for entry in index[term]}

    if not candidates:
        return []

    #rank the surviving documents
    return ranking_docs(terms,list(candidates),index,idf,tf,title_index)

- BM25

In [17]:
def _term_frequency(posting):
    """Return the occurrence count carried by one posting payload.

    The payload may be a plain number (already a count) or a sized
    container such as a list/array of positions (count = its length).
    """
    try:
        return len(posting)
    except TypeError:
        return posting


def BM25(terms,docs,index,df,title_index, k=1.5, b=0.75):
    """
    Rank candidate documents with the BM25 scoring function.

    Arguments:
    terms -- list of query terms (a repeated term is scored once per repetition)
    docs -- iterable of candidate document ids (pids)
    index -- maps a term to an iterable of (pid, posting) pairs, where posting
             is either a numeric count or a sized collection of positions
    df -- maps a term to its document frequency
    title_index -- its length is used as the total number of documents
    k -- term-frequency saturation parameter
    b -- document-length normalisation parameter

    Returns:
    list of pids sorted by decreasing BM25 score. Candidates that contain
    none of the terms are kept, with a score of 0.
    """
    N = len(title_index) #total num of documents

    #length of every document = sum of the counts of its indexed terms
    doc_length = {}
    for postings in index.values():
        for pid, posting in postings:
            doc_length[pid] = doc_length.get(pid, 0) + _term_frequency(posting)

    #average length across documents (0 when the index is empty)
    avg_dl = sum(doc_length.values()) / len(doc_length) if doc_length else 0

    #every candidate starts at 0 so it is always part of the result
    scores = {pid: 0.0 for pid in docs}

    for term in terms:
        idf = math.log((N + 0.5) / (df.get(term, 0) + 0.5))
        term_freqs = {pid: _term_frequency(posting)
                      for pid, posting in index.get(term, [])}

        for pid in docs:
            freq = term_freqs.get(pid, 0)
            if not freq:
                #term absent from this document: contributes nothing
                continue
            ld = doc_length.get(pid, avg_dl)
            #guard against an empty index, where the average length is 0
            length_ratio = ld / avg_dl if avg_dl > 0 else 1.0
            denominator = freq + k * ((1 - b) + b * length_ratio)
            scores[pid] += idf * ((freq * (k + 1)) / denominator)

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [pid for pid, _ in ranked_docs]


In [18]:
def search_bm25(query, index, df, title_index, k=1.5, b= 0.75):
    """
    Executes a search query using AND logic.
    Only documents containing all query terms are considered.
    Results are ranked by their BM25 score.

    Arguments:
    query -- raw query string, tokenised with build_terms
    index -- maps a term to an iterable of postings whose first item is the pid
    df -- maps a term to its document frequency (forwarded to BM25)
    title_index -- forwarded to BM25
    k, b -- BM25 parameters, forwarded to BM25

    Returns:
    ranked list of pids, or [] when the query is empty, a term is not
    indexed, or no document contains all the terms.

    NOTE(review): build_terms is neither defined nor imported in the visible
    part of this notebook -- confirm it is in scope before running.
    """
    query_terms=build_terms(query)
    if not query_terms:
        return []

    #start with first query term
    if query_terms[0] not in index:
        return []

    docs_set= set(posting[0] for posting in index[query_terms[0]])

    #only keeping documents that are present in all terms' postings
    for term in query_terms[1:]:
        if term not in index:
            #an unindexed term means no document can match all terms
            return []
        docs_set &= set(posting[0] for posting in index[term])

    if not docs_set:
        return []

    #rank final set of documents, honouring the caller's k and b
    docs=list(docs_set)

    return BM25(query_terms,docs,index,df,title_index, k=k, b=b)

- Our Score:

Main

In [24]:
if __name__ == "__main__":

    def _print_ranking(label, query, ranked_pids, top_k=10):
        """Print the first top_k pids (with their titles) produced by one ranker."""
        if not ranked_pids:
            print(f"[{label}] No matching documents for query: '{query}'")
            return
        print(f"[{label}] Top results for query: '{query}'")
        for pid in ranked_pids[:top_k]:
            print(f"{pid}: {title_index.get(pid,'[No title]')}")

    #load preprocessed docs
    docs=load_processed_docs()

    #build inverted index and compute TF-IDF values
    index,tf,df,idf,title_index=create_index_tfidf(docs)

    #test queries
    queries=[
        "full sleeve black shirt",
        "solid women white polo",
        "print of multicolor neck grey shirt",
        "slim fit men blue jeans",
        "round collar full sleeves t-shirt"
    ]

    #run both rankers on each query and display their top results,
    #labelled so the two listings can be told apart
    for q in queries:
        result=search_tf_idf(q,index,tf,idf,title_index)
        _print_ranking("TF-IDF",q,result)

        result_bm25=search_bm25(q,index,df,title_index)
        _print_ranking("BM25",q,result_bm25)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed_docs.jsonl'