# RANKINGS 

---
The script applies several ranking models to search queries:
- **TF-IDF + cosine similarity** 
- **BM25** 
- **Our score**: BM25 + price boosting
- **Word2Vec + cosine similarity**

In [1]:
"""
Import the libraries and make the project root importable:

"""

import json
import math
import os
import re
import collections
from collections import defaultdict
from array import array
import numpy as np
import pandas as pd
import sys
import os

#add project root
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go up from part_3
sys.path.append(project_root)

#import previous functions
from part_2.indexing_evaluation import load_processed_docs, create_index_tfidf, search_tf_idf
from part_1.data_processing import build_terms


## SEARCHING

In [None]:
def searching(query, index):
    '''
    Normalize a query and collect the documents that contain every query term.

    The query is tokenized with build_terms and matched with AND logic: a
    document qualifies only if it appears in the posting list of each term.

    Returns:
        (query_terms, docs) when at least one document matches, where docs is
        a list of document ids in no particular order.
        A bare empty list [] (not a pair) when the query yields no terms, when
        a term is missing from the index, or when no document holds all terms.
        Callers must check the result before unpacking it.
    '''
    query_terms = build_terms(query)
    if not query_terms:
        return []

    matching = None
    for term in query_terms:
        # AND semantics: one unknown term means nothing can match.
        if term not in index:
            return []

        # The document id is the first field of every posting.
        term_pids = set(posting[0] for posting in index[term])
        if matching is None:
            matching = term_pids
        else:
            matching &= term_pids

    if not matching:
        return []

    return query_terms, list(matching)

## TF-IDF + Cosine Similarity

In [3]:
#this function was already implemented in part2
def ranking_docs(terms,docs,index,idf,tf,title_index):
    """
    Order candidate documents by the dot product of their TF-IDF vector with
    the TF-IDF vector of the query.

    Args:
        terms: query terms; each position is one dimension of the vectors.
        docs: candidate document ids to score.
        index: term -> iterable of two-field postings whose first field is
            the document id.
        idf: term -> inverse document frequency (missing terms count as 0.0).
        tf: term -> {pid: term-frequency weight} (missing pids count as 0.0).
        title_index: not used by this function.

    Returns:
        Document ids sorted by descending score. A candidate is included only
        if it occurs in the postings of at least one indexed query term.

    NOTE(review): document vectors are not normalized here; presumably the
    tf weights are already length-normalized upstream - confirm.
    """
    # For each indexed query term, the ids of the documents in its postings.
    posting_pids = {}
    for term in terms:
        if term in index:
            posting_pids[term] = {pid for pid, _ in index[term]}

    width = len(terms)

    # Query term counts, scaled by the Euclidean norm of the counts.
    query_counts = collections.Counter(terms)
    query_norm = np.linalg.norm(list(query_counts.values()))

    query_vector = [0] * width
    doc_vectors = defaultdict(lambda: [0] * width)

    for pos, term in enumerate(terms):
        pids = posting_pids.get(term)
        if pids is None:
            # Term absent from the index: its dimension stays 0 everywhere.
            continue

        if query_norm > 0:
            query_weight = query_counts[term] / query_norm
        else:
            query_weight = 0
        query_vector[pos] = query_weight * idf.get(term, 0.0)

        for pid in docs:
            if pid in pids:
                doc_vectors[pid][pos] = tf[term].get(pid, 0.0) * idf.get(term, 0.0)

    # Pair every score with its document id and sort best-first.
    ranked = sorted(
        ([np.dot(vec, query_vector), pid] for pid, vec in doc_vectors.items()),
        reverse=True,
    )
    return [pid for _, pid in ranked]

In [None]:
def search_tf_idf(query, index, tf, idf, title_index):
    """
    Run a query with AND matching and rank the hits by TF-IDF score.

    Args:
        query: raw query string.
        index: inverted index passed on to searching and ranking_docs.
        tf: term-frequency weights passed on to ranking_docs.
        idf: inverse document frequencies passed on to ranking_docs.
        title_index: passed through to ranking_docs.

    Returns:
        Ranked list of document ids, or [] when no document matches.
    """
    result = searching(query, index)

    # searching() returns a bare empty list (not a pair) when nothing matches;
    # unpacking that directly would raise ValueError.
    if not result:
        return []

    query_terms, docs = result
    return ranking_docs(query_terms, docs, index, idf, tf, title_index)

## BM25

In [None]:
def BM25(terms,docs,index,df,title_index, k=1.5, b=0.75):
    '''
    Rank candidate documents with the basic BM25 formula.

    Args:
        terms: query terms (a repeated term contributes once per occurrence).
        docs: candidate document ids to score.
        index: term -> iterable of (pid, count) postings.
        df: term -> document frequency (missing terms count as 0).
        title_index: only its length is used, as the total document count N.
        k: term-frequency saturation parameter.
        b: document-length normalization parameter.

    Returns:
        Candidate ids sorted by descending BM25 score; ties keep the order of
        docs. Returns [] when terms is empty.
    '''
    N = len(title_index)  # total number of documents

    # Document length = sum of the counts of all of its postings.
    doc_length = defaultdict(int)
    for postings in index.values():
        for pid, count in postings:
            doc_length[pid] += count

    avg_dl = sum(doc_length.values()) / len(doc_length) if doc_length else 0

    scores = defaultdict(float)

    for term in terms:
        idf = math.log((N + 0.5) / (df.get(term, 0) + 0.5))  # smoothing
        term_counts = dict(index.get(term, []))

        for pid in docs:
            term_freq = term_counts.get(pid, 0)
            if term_freq == 0:
                # Contribution is zero; still register the candidate. This
                # also avoids a 0/0 denominator when k == 0.
                scores[pid] += 0.0
                continue

            ld = doc_length.get(pid, avg_dl)
            # Without any length information, treat the document as average
            # instead of dividing by zero.
            length_ratio = ld / avg_dl if avg_dl else 1.0
            denominator = term_freq + k * ((1 - b) + b * length_ratio)
            scores[pid] += idf * ((term_freq * (k + 1)) / denominator)

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [doc for doc, _ in ranked_docs]


In [6]:
def search_bm25(query, index, df, title_index, k=1.5, b= 0.75):
    """
    Run a query with AND matching and rank the hits with BM25.

    Args:
        query: raw query string.
        index: inverted index passed on to searching and BM25.
        df: document frequencies passed on to BM25.
        title_index: passed on to BM25.
        k: BM25 term-frequency saturation parameter.
        b: BM25 length-normalization parameter.

    Returns:
        Ranked list of document ids, or [] when no document matches.
    """
    result = searching(query, index)

    # searching() returns a bare empty list (not a pair) when nothing matches;
    # unpacking that directly would raise ValueError.
    if not result:
        return []

    query_terms, docs = result
    # Forward the caller's parameters rather than fixed literals.
    return BM25(query_terms, docs, index, df, title_index, k=k, b=b)

## Our Score: BM25 + price boosting

In [None]:
# pid -> price lookup; starts empty and is read by get_price.
price_table = {}


def get_price(pid):
    '''
    Look up the price stored for a product id in price_table.

    Returns 0 when the product has no entry.
    '''
    try:
        return price_table[pid]
    except KeyError:
        return 0

In [None]:
def our_score(terms, docs, index, df, title_index, k=1.5, b=0.75):
    '''
    Rank candidate documents with BM25 multiplied by a price boost.

    Each per-term BM25 contribution is scaled by
        boost = 1 + (1 + log10(price + 1))   when price > 0
        boost = 1.0                          otherwise
    so higher-priced products move up the ranking.

    Args:
        terms: query terms (a repeated term contributes once per occurrence).
        docs: candidate document ids to score.
        index: term -> iterable of (pid, count) postings.
        df: term -> document frequency (missing terms count as 0).
        title_index: only its length is used, as the total document count N.
        k: term-frequency saturation parameter.
        b: document-length normalization parameter.

    Returns:
        Candidate ids sorted by descending boosted score; ties keep the order
        of docs. Returns [] when terms is empty.
    '''
    N = len(title_index)  # total number of documents

    # Document length = sum of the counts of all of its postings.
    doc_length = defaultdict(int)
    for postings in index.values():
        for pid, count in postings:
            doc_length[pid] += count

    avg_dl = sum(doc_length.values()) / len(doc_length) if doc_length else 0

    scores = defaultdict(float)

    for term in terms:
        idf = math.log((N + 0.5) / (df.get(term, 0) + 0.5))  # smoothing
        term_counts = dict(index.get(term, []))

        for pid in docs:
            term_freq = term_counts.get(pid, 0)
            if term_freq == 0:
                # Contribution is zero; still register the candidate. This
                # also avoids a 0/0 denominator when k == 0.
                scores[pid] += 0.0
                continue

            ld = doc_length.get(pid, avg_dl)
            # Without any length information, treat the document as average
            # instead of dividing by zero.
            length_ratio = ld / avg_dl if avg_dl else 1.0
            denominator = term_freq + k * ((1 - b) + b * length_ratio)
            bm_score = idf * ((term_freq * (k + 1)) / denominator)

            # Price boosting
            price = get_price(pid)
            if price > 0:
                price_boost = 1 + (1 + math.log10(price + 1))
            else:
                price_boost = 1.0

            scores[pid] += bm_score * price_boost

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [doc for doc, _ in ranked_docs]


In [None]:
def search_our_score(query,index,df,title_index,k=1.5, b=0.75):
    '''
    Run a query with AND matching and rank the hits with BM25 + price boost.

    Args:
        query: raw query string.
        index: inverted index passed on to searching and our_score.
        df: document frequencies passed on to our_score.
        title_index: passed on to our_score.
        k: BM25 term-frequency saturation parameter.
        b: BM25 length-normalization parameter.

    Returns:
        Ranked list of document ids, or [] when no document matches.
    '''
    result = searching(query, index)

    # searching() returns a bare empty list (not a pair) when nothing matches;
    # unpacking that directly would raise ValueError.
    if not result:
        return []

    query_terms, docs = result
    # Forward the caller's parameters rather than fixed literals.
    return our_score(query_terms, docs, index, df, title_index, k=k, b=b)

## Word2vec + cosine

In [None]:
#pip install gensim


In [16]:
from gensim.models import Word2Vec

def train_word2vec(docs):
    '''
    Train a Word2Vec model on the cleaned text of the documents.

    Each document becomes one training sentence: the whitespace tokens of its
    title_clean, description_clean and (when present) product_details_clean
    fields. The details field is included because compute_vectors embeds it
    too; tokens that never appear in the training corpus have no vector and
    would be skipped when averaging.

    Args:
        docs: iterable of dicts with "title_clean" and "description_clean"
            string fields, optionally "product_details_clean".

    Returns:
        The trained Word2Vec model (100-dimensional vectors, window 5,
        min_count 1).
    '''
    corpus = []

    # One tokenized sentence per document.
    for doc in docs:
        fields = (
            doc["title_clean"],
            doc["description_clean"],
            doc.get("product_details_clean", ""),
        )
        corpus.append(" ".join(fields).split())

    return Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

In [17]:
def compute_vectors(docs,model):
    '''
    Build one embedding per document by averaging the Word2Vec vectors of its
    tokens.

    The tokens come from the title_clean, description_clean and
    product_details_clean fields. Tokens missing from model.wv are ignored;
    a document with no known token gets a zero vector of model.vector_size.

    Returns:
        dict mapping each document's pid to its vector.
    '''
    doc_vectors = {}

    for doc in docs:
        pid = doc["pid"]

        # All cleaned text fields, split on whitespace.
        tokens = " ".join(
            (doc["title_clean"], doc["description_clean"], doc["product_details_clean"])
        ).split()

        # Keep only the embeddings of words the model knows.
        known = [model.wv[token] for token in tokens if token in model.wv]

        doc_vectors[pid] = np.mean(known, axis=0) if known else np.zeros(model.vector_size)

    return doc_vectors

In [18]:
def embed_query(query,model):
    '''
    Turn a query string into a single vector: the mean of the Word2Vec
    embeddings of its whitespace-separated tokens.

    Tokens missing from model.wv are skipped. If none is known, a zero vector
    of model.vector_size is returned.
    '''
    known = []
    for token in query.split():
        if token in model.wv:
            known.append(model.wv[token])

    if known:
        return np.mean(known, axis=0)
    return np.zeros(model.vector_size)

In [19]:
def cosine(a,b):
    '''
    Return the cosine similarity of two vectors.

    If either vector has zero length the similarity is defined as 0.0.
    '''
    norm_a = np.linalg.norm(a)
    if norm_a == 0:
        return 0.0

    norm_b = np.linalg.norm(b)
    if norm_b == 0:
        return 0.0

    return np.dot(a, b) / (norm_a * norm_b)

In [21]:
def search_word2vec(query,index,model,doc_vectors, top_k=20):
    '''
    Run a query with AND matching and rank the hits by the cosine similarity
    between the query embedding and each document embedding.

    Args:
        query: raw query string.
        index: inverted index passed on to searching.
        model: Word2Vec model used to embed the query.
        doc_vectors: pid -> document embedding, as built by compute_vectors.
        top_k: maximum number of results to return (None returns all hits).

    Returns:
        Up to top_k document ids, most similar first, or [] when no document
        matches.
    '''
    result = searching(query, index)
    if not result:
        return []

    terms, docs = result

    # Embed the normalized terms rather than the raw query text, so the query
    # tokens get the same preprocessing that searching() applied.
    # NOTE(review): assumes the model vocabulary was built from text processed
    # the same way as the index terms - confirm.
    query_vec = embed_query(" ".join(terms), model)

    # Score every candidate and sort best-first.
    scores = [(cosine(query_vec, doc_vectors[pid]), pid) for pid in docs]
    scores.sort(reverse=True)

    return [pid for _, pid in scores[:top_k]]

## Main

In [22]:
#pip install tabulate

In [25]:
from tabulate import tabulate
def print_table(method, results,title_index):
    '''
    Print a ranked result list as a grid table, preceded by the method name.

    Each row shows the 1-based rank, the product id and its title from
    title_index ("[No title]" when the id has no entry).
    '''
    rows = []
    for rank, pid in enumerate(results, start=1):
        title = title_index.get(pid, "[No title]")
        rows.append((rank, pid, title))

    print(f"\n{method}")
    table = tabulate(rows, headers=["Rank", "PID", "Title"], tablefmt="fancy_grid")
    print(table)

In [26]:
if __name__ == "__main__":
    # Load the preprocessed documents.
    docs = load_processed_docs()

    # Rebuild the module-level price lookup read by get_price
    # (0 when a document has no "selling_price" field).
    price_table = {doc["pid"]: doc.get("selling_price", 0) for doc in docs}

    # Inverted index plus the TF-IDF statistics used by the rankers.
    index, tf, df, idf, title_index = create_index_tfidf(docs)

    # Word2Vec model and one averaged embedding per document.
    model = train_word2vec(docs)
    doc_vectors = compute_vectors(docs, model)

    # Test queries
    queries = [
        "full sleeve black shirt",
        "solid women white polo",
        "print of multicolor neck grey shirt",
        "slim fit men blue jeans",
        "round collar full sleeves t-shirt",
    ]

    # Every ranking method as (table heading, function returning the results
    # to display); each is run and printed in this order for every query.
    rankers = [
        ("TF-IDF (Top 10)", lambda query: search_tf_idf(query, index, tf, idf, title_index)[:10]),
        ("BM25 (Top 10)", lambda query: search_bm25(query, index, df, title_index)[:10]),
        ("Price Boosted (Top 10)", lambda query: search_our_score(query, index, df, title_index)[:10]),
        ("Word2Vec (Top 20)", lambda query: search_word2vec(query, index, model, doc_vectors)),
    ]

    for q in queries:
        print(f"\n\n===== QUERY: {q} =====")
        for heading, run in rankers:
            print_table(heading, run(q), title_index)



===== QUERY: full sleeve black shirt =====

TF-IDF (Top 10)
╒════════╤══════════════════╤════════════════════════════════════════════════════╕
│   Rank │ PID              │ Title                                              │
╞════════╪══════════════════╪════════════════════════════════════════════════════╡
│      1 │ TSHFUTG2TZSYDWTP │ print women round neck black shirt                 │
├────────┼──────────────────┼────────────────────────────────────────────────────┤
│      2 │ SHTFYPATCHTAN2ZT │ men slim fit solid casual shirt                    │
├────────┼──────────────────┼────────────────────────────────────────────────────┤
│      3 │ SHTFZP66HZNRS2XR │ men regular fit solid mandarin collar casual shirt │
├────────┼──────────────────┼────────────────────────────────────────────────────┤
│      4 │ TSHF94NKFVSHGVWQ │ solid men hood black shirt                         │
├────────┼──────────────────┼────────────────────────────────────────────────────┤
│      5 │ TSHFYN37DWMWHP