1) Updated the code to increase MAP@10 score by leveraging sentence transformers. Sentence transformers provide enhanced semantic similarity measurements, which improve the accuracy of the matching process.

2) Additionally, leveraged the partial match count to provide a fairer assessment of performance. This helps to account for partial matches that are relevant but may not be exact, giving a more nuanced evaluation of the model's effectiveness.

3) Refactored the code into an Object-Oriented Programming (OOP) structure to make it more modular, maintainable, and ready for production deployment. This includes encapsulating functionality within classes and defining clear interfaces for interacting with different components of the system.



In [5]:
# Install necessary libraries
!pip install datasets trl peft bitsandbytes accelerate sentence_transformers
!pip install --upgrade transformers accelerate pyarrow

# Clone the Git repository that contains the dataset and additional information about it
!git clone https://github.com/wayfair/WANDS.git

fatal: destination path 'WANDS' already exists and is not an empty directory.


In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

class DataLoader:
    """Load the query, product and label tables and build the product text.

    The three tables are read from tab-separated files into pandas DataFrames
    and exposed as ``query_df``, ``product_df`` and ``label_df``.
    """

    def __init__(self, query_file: str, product_file: str, label_file: str) -> None:
        """Read the three tab-separated files.

        Args:
            query_file: Path to the query table.
            product_file: Path to the product table.
            label_file: Path to the label table.
        """
        self.query_df = pd.read_csv(query_file, sep='\t')
        self.product_df = pd.read_csv(product_file, sep='\t')
        self.label_df = pd.read_csv(label_file, sep='\t')

    def preprocess_product_data(self) -> None:
        """Add cleaned columns and a ``combined_text`` column to ``product_df``.

        Missing ratings become 0 and missing text fields become empty strings.
        Ratings are min-max scaled into ``normalized_average_ratings``; when
        every rating is identical (or the table is empty) the scaled value is
        0.0 instead of the NaN a 0/0 division would produce. ``combined_text``
        joins name, class, category hierarchy, description and the scaled
        rating with single spaces.
        """
        products = self.product_df

        products['average_rating_'] = products['average_rating'].fillna(0)
        products['product_description_'] = products['product_description'].fillna('')
        products['product_class_'] = products['product_class'].fillna('')
        products['category hierarchy_'] = products['category hierarchy'].fillna('')

        ratings = products['average_rating_']
        rating_min = ratings.min()
        rating_span = ratings.max() - rating_min
        if rating_span > 0:
            products['normalized_average_ratings'] = (ratings - rating_min) / rating_span
        else:
            # Constant (or empty) ratings: avoid dividing by zero, which would
            # put the literal text "nan" into every combined_text value.
            products['normalized_average_ratings'] = 0.0

        products['combined_text'] = (
            # A missing name would otherwise turn the whole concatenation into NaN.
            products['product_name'].fillna('') + ' ' +
            products['product_class_'] + ' ' +
            products['category hierarchy_'] + ' ' +
            products['product_description_'] + ' ' +
            products['normalized_average_ratings'].astype(str)
        )
        logging.info("Product data preprocessing complete.")

class EmbeddingGenerator:
    """Wrap a SentenceTransformer model and encode texts with it."""

    def __init__(self, model_name):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts):
        """Encode ``texts`` and return the result as a tensor.

        Args:
            texts: The texts handed to the model's ``encode`` method.

        Returns:
            Whatever ``encode(texts, convert_to_tensor=True)`` returns, or
            ``None`` when encoding raised; the error is logged rather than
            propagated, so callers must check for ``None``.
        """
        try:
            encoded = self.model.encode(texts, convert_to_tensor=True)
        except Exception as err:
            logging.error(f"Error generating embeddings: {err}")
            return None

        logging.info("Embeddings generated successfully.")
        return encoded

class SimilarityCalculator:
    """Cosine-similarity scoring between query and product embeddings."""

    @staticmethod
    def calculate_cosine_similarity(query_embeddings, product_embeddings):
        """Score every query embedding against every product embedding.

        Args:
            query_embeddings: Indexable collection of query vectors; each
                vector is reshaped to a single row before comparison.
            product_embeddings: Tensor with one product vector per row.

        Returns:
            A tensor allocated with ``torch.zeros`` of shape
            (number of queries, number of products); entry ``[i, j]`` is the
            cosine similarity of query ``i`` and product ``j``.
        """
        # Work on a detached copy so the caller's product tensor is untouched.
        products = product_embeddings.clone().detach()
        scores = torch.zeros((len(query_embeddings), len(product_embeddings)))

        for row, query_vector in enumerate(query_embeddings):
            # A (1, dim) query broadcasts against the (n_products, dim) matrix.
            scores[row] = F.cosine_similarity(query_vector.reshape(1, -1), products)

        logging.info("Cosine similarity calculation complete.")
        return scores

class Evaluator:
    """Look up labelled products per query and score ranked predictions."""

    def __init__(self, label_df):
        """Group the label table by query and set the relevance weights.

        Args:
            label_df: DataFrame with ``query_id``, ``product_id`` and
                ``label`` columns.
        """
        self.label_df = label_df
        self.grouped_label_df = label_df.groupby('query_id')
        # Weight given to each label when graded relevance is requested.
        self.relevance_mapping = {
            'Exact': 1.0,
            'Partial': 0.7,
            'Irrelevant': 0.0
        }

    def get_relevant_matches_for_query(self, query_id):
        """Return the product ids labelled 'Partial' or 'Exact' for a query.

        Raises:
            KeyError: If ``query_id`` has no rows in the label table.
        """
        query_group = self.grouped_label_df.get_group(query_id)
        is_relevant = query_group['label'].isin(['Partial', 'Exact'])
        return query_group.loc[is_relevant, 'product_id'].values

    def get_exact_matches_for_query(self, query_id):
        """Return the product ids labelled 'Exact' for a query.

        Raises:
            KeyError: If ``query_id`` has no rows in the label table.
        """
        query_group = self.grouped_label_df.get_group(query_id)
        is_exact = query_group['label'] == 'Exact'
        return query_group.loc[is_exact, 'product_id'].values

    def get_relevant_labels_for_query(self, query_id):
        """Return ``{product_id: relevance weight}`` for every labelled product.

        Raises:
            KeyError: If ``query_id`` has no rows in the label table, or a
                label is not a key of ``relevance_mapping``.
        """
        query_group = self.grouped_label_df.get_group(query_id)
        return {
            p_id: self.relevance_mapping[label]
            for p_id, label in zip(query_group['product_id'], query_group['label'])
        }

    @staticmethod
    def map_at_k(true_ids, predicted_ids, relevance_labels=None, k=10):
        """Compute average precision at ``k`` for one query.

        Args:
            true_ids: Ids counted as relevant for the query.
            predicted_ids: Ranked ids, best first; only the first ``k`` are
                scored and repeated ids count once, at their first position.
            relevance_labels: Optional ``{id: weight}`` mapping. When given
                and non-empty, each hit adds its weight (instead of 1) to the
                running hit total, and ids whose weight is not positive are
                skipped.
            k: Cut-off rank.

        Returns:
            The accumulated precision divided by ``min(len(true_ids), k)``,
            or 0.0 when either id collection is empty.
        """
        if not len(true_ids) or not len(predicted_ids):
            return 0.0

        # Sets make each membership test O(1) instead of scanning the array
        # and re-slicing the prediction prefix for every candidate.
        relevant = set(true_ids)
        seen = set()

        score = 0.0
        hit_total = 0.0
        for rank, p_id in enumerate(predicted_ids[:k], start=1):
            if p_id in seen:
                continue
            seen.add(p_id)
            if p_id not in relevant:
                continue

            if relevance_labels:
                gain = relevance_labels.get(p_id, 0.0)
                if not gain > 0:
                    continue
            else:
                gain = 1.0

            hit_total += gain
            score += hit_total / rank

        # Binary and graded scoring share the same normaliser.
        return score / min(len(true_ids), k)

def main():
    """Run the retrieval pipeline end to end and report MAP@10 twice.

    Steps: load and preprocess the WANDS tables, embed queries and product
    texts with one sentence-transformer model, rank products per query by
    cosine similarity, then score the top 10 first against 'Exact' labels
    only and then against 'Exact' + 'Partial' labels with graded weights.
    Returns early (after logging an error) if either embedding step failed.
    """
    logging.info("Starting the retrieval process...")

    # Data loading and preprocessing
    loader = DataLoader(
        "WANDS/dataset/query.csv",
        "WANDS/dataset/product.csv",
        "WANDS/dataset/label.csv",
    )
    loader.preprocess_product_data()
    queries = loader.query_df
    products = loader.product_df

    # Embedding generation. Alternatives that can be swapped in here:
    #   'sentence-transformers/all-MiniLM-L6-v2'
    #   'sentence-transformers/all-mpnet-base-v2'
    #   'sentence-transformers/paraphrase-MiniLM-L6-v2'
    #   'sentence-transformers/distilbert-base-nli-stsb-mean-tokens'
    model_name = 'sentence-transformers/all-MiniLM-L12-v2'
    encoder = EmbeddingGenerator(model_name)
    query_embeddings = encoder.generate_embeddings(queries['query'].tolist())
    product_embeddings = encoder.generate_embeddings(products['combined_text'].tolist())

    if query_embeddings is None or product_embeddings is None:
        logging.error("Failed to generate embeddings. Exiting...")
        return

    # Calculate cosine similarities
    cosine_similarities = SimilarityCalculator().calculate_cosine_similarity(
        query_embeddings, product_embeddings
    )

    # Rank products: keep the ids of the 10 highest-scoring products per query.
    queries['top_product_ids'] = [
        products.iloc[query_scores.argsort(descending=True)[:10]]['product_id'].tolist()
        for query_scores in cosine_similarities
    ]

    evaluator = Evaluator(loader.label_df)

    # MAP@10 without graded relevance: only 'Exact' labels count as relevant.
    def binary_score(row):
        return Evaluator.map_at_k(row['relevant_ids'], row['top_product_ids'])

    queries['relevant_ids'] = queries['query_id'].apply(evaluator.get_exact_matches_for_query)
    queries['map@k'] = queries.apply(binary_score, axis=1)
    binary_map = queries['map@k'].mean()
    logging.info(f'Mean Average Precision @10: {binary_map:.4f}')
    print(f'\nMean Average Precision @10: {binary_map:.4f}')

    # MAP@10 with graded relevance: 'Exact' and 'Partial' count, weighted by label.
    def graded_score(row):
        return Evaluator.map_at_k(
            row['relevant_ids'],
            row['top_product_ids'],
            row['relevance_labels'],
        )

    queries['relevant_ids'] = queries['query_id'].apply(evaluator.get_relevant_matches_for_query)
    queries['relevance_labels'] = queries['query_id'].apply(evaluator.get_relevant_labels_for_query)
    queries['map@k'] = queries.apply(graded_score, axis=1)
    graded_map = queries['map@k'].mean()
    logging.info(f'Mean Average Precision @10 - map: {graded_map:.4f}')
    print(f'\nMean Average Precision @10 - relevance score (Exact: 1, Partial: 0.7, Irrelevent: 0): {graded_map:.4f}')


if __name__ == "__main__":
    main()


Mean Average Precision @10: 0.4018

Mean Average Precision @10 - relevance score (Exact: 1, Partial: 0.7, Irrelevent: 0): 0.6269


### Appendix - Microsoft's Phi2 large language model.

The following function generates embeddings for product descriptions using Phi-2. It processes the descriptions iteratively in smaller batches to avoid memory issues. However, Microsoft's Phi-2 model was not pursued any further due to its significantly lower MAP@10 score. Please refer to the attached Word document.


In [None]:
# Import necessary libraries
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    AutoConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


# Load datasets for search queries, products, and labels
query_df = pd.read_csv("WANDS/dataset/query.csv", sep='\t')
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')

# Configuration for 4-bit quantization to reduce model size and memory usage
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

# Authenticate with Hugging Face.
# The access token is read from the environment so that no credential is
# stored in the notebook. The token that was previously committed here should
# be treated as leaked and revoked.
import os

huggingface_token = os.environ.get("HF_TOKEN")
if not huggingface_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )
login(token=huggingface_token, add_to_git_credential=True)

# Load the model and tokenizer from Hugging Face
model_name='microsoft/phi-2'
# NOTE(review): this config object is not passed to from_pretrained in this
# cell and `config` is reassigned twice further down.
config = AutoConfig.from_pretrained(model_name)
config.output_hidden_states = True
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                      device_map=device_map,
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Print the hidden size of the model
config = original_model.config
print(f"Hidden size: {config.hidden_size}")

# Prepare the model for training with Low-Rank Adaptation (LoRA)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
original_model = prepare_model_for_kbit_training(original_model)
config = LoraConfig(
    r=32, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
original_model.gradient_checkpointing_enable()
peft_model = get_peft_model(original_model, config)


# Functions to generate embeddings for product descriptions and queries.
def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token vectors per sequence, ignoring padding positions.

    Args:
        last_hidden_state: Tensor indexed as (sequence, token, feature).
        attention_mask: Tokenizer mask with 1 for real tokens and 0 for
            padding, one row per sequence.

    Returns:
        One pooled vector per sequence, in ``last_hidden_state``'s dtype.
    """
    mask = attention_mask.to(device=last_hidden_state.device, dtype=torch.float32).unsqueeze(-1)
    # Accumulate in float32 so long sequences cannot overflow half precision.
    summed = (last_hidden_state.float() * mask).sum(dim=1)
    # Guard against an all-padding row so the division never hits zero.
    token_counts = mask.sum(dim=1).clamp(min=1.0)
    return (summed / token_counts).to(last_hidden_state.dtype)


def get_embeddings(texts, batch_size):
    """Embed ``texts`` with the module-level tokenizer and model, batch by batch.

    Each batch is tokenized with padding and truncation, run through
    ``original_model`` without gradient tracking, and the last hidden state is
    mean-pooled over the non-padding tokens. Padding positions are excluded
    because a plain mean would let them dilute the vectors of shorter texts.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A NumPy array with one row per text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive, if a batch contains a
            non-string item, or (from ``np.concatenate``) if ``texts`` is
            empty.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    all_embeddings = []
    # tqdm shows one progress step per batch.
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        if not all(isinstance(text, str) for text in batch_texts):
            raise ValueError("Input is not valid. Should be a list/tuple of strings.")

        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = original_model(**inputs, output_hidden_states=True)

        pooled = _masked_mean_pool(outputs.hidden_states[-1], inputs['attention_mask'])
        all_embeddings.append(pooled.cpu().numpy())  # Move to CPU to save GPU memory

    return np.concatenate(all_embeddings, axis=0)

# Prepare product descriptions and combine texts for embedding generation
product_df['product_description2'] = product_df['product_description'].fillna('')
# A missing name would turn the concatenation into NaN and fail the string check above.
product_df['combined_text'] = product_df['product_name'].fillna('') + ' ' + product_df['product_description2']
valid_texts = product_df['combined_text'].tolist()
product_embeddings = get_embeddings(valid_texts, batch_size=20)

# Generate query embeddings (all queries in one forward pass)
text = query_df['query'].tolist()
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    outputs = original_model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states

last_hidden_state = hidden_states[-1]
# Pool queries exactly like products so both live in the same embedding space.
query_embeddings = _masked_mean_pool(last_hidden_state, inputs['attention_mask'])




# Function to calculate cosine similarity between query embeddings and product embeddings
def calculate_cosine_similarity(query_embeddings, product_embeddings):
    """Return the cosine similarity of every query against every product.

    Both inputs are converted to float32 CPU tensors first, so a tensor on
    another device or in half precision can be compared with a NumPy array.

    Args:
        query_embeddings: 2-D tensor or array, one query vector per row.
        product_embeddings: 2-D tensor or array, one product vector per row.

    Returns:
        A float32 CPU tensor of shape (number of queries, number of products).
    """
    # Imported here so the function does not depend on an earlier cell
    # having brought `F` into scope.
    import torch.nn.functional as F

    queries = torch.as_tensor(query_embeddings).detach().to(device="cpu", dtype=torch.float32)
    products = torch.as_tensor(product_embeddings).detach().to(device="cpu", dtype=torch.float32)

    # With unit-length rows, one matrix product yields all pairwise cosines.
    queries = F.normalize(queries, p=2, dim=1)
    products = F.normalize(products, p=2, dim=1)
    return queries @ products.T

# Score every query against every product.
cosine_similarities = calculate_cosine_similarity(query_embeddings, product_embeddings)

# For each query, sort its scores from highest to lowest, keep the first 10
# positions and translate those row positions into product ids.
top_product_ids_all = [
    product_df.iloc[query_scores.argsort(descending=True)[:10]]['product_id'].tolist()
    for query_scores in cosine_similarities
]

# Store one list of product ids per query row.
query_df['top_product_ids'] = top_product_ids_all


# Group the labels by query once; nothing else in this cell defines
# `grouped_label_df`, so the lookup function below would otherwise fail with
# a NameError.
grouped_label_df = label_df.groupby('query_id')


# Function to retrieve exact match product IDs for a query_id
def get_exact_matches_for_query(query_id):
    """Return the product ids labelled 'Exact' for ``query_id``.

    Args:
        query_id: Value from the ``query_id`` column of the label table.

    Returns:
        NumPy array of the matching ``product_id`` values (possibly empty).

    Raises:
        KeyError: If ``query_id`` has no rows in the label table.
    """
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    return exact_matches

# Add the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(get_exact_matches_for_query)


# Function to calculate Mean Average Precision at K (MAP@K)
def map_at_k(true_ids, predicted_ids, k=10):
    """
    Score one ranked prediction list with average precision at K.

    Walks the first ``k`` predictions in order. Each id that is relevant and
    has not appeared earlier in the list counts as a hit and contributes
    (hits so far) / (its 1-based rank). The sum is divided by
    ``min(len(true_ids), k)``.

    Parameters:
    true_ids (list): Relevant product IDs.
    predicted_ids (list): Predicted product IDs, best first.
    k (int): Number of top elements to consider.
             NOTE: If you wish to change top k, please provide a justification for choosing the new value

    Returns:
    float: The score, or 0.0 when either list is empty.
    """
    if len(true_ids) == 0 or len(predicted_ids) == 0:
        return 0.0

    top_k = predicted_ids[:k]
    hits = 0.0
    precision_sum = 0.0

    for rank, candidate in enumerate(top_k, start=1):
        # Only the first occurrence of a relevant id is counted.
        if candidate in true_ids and candidate not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    return precision_sum / min(len(true_ids), k)

# Score each query's top predictions against its relevant ids.
query_df['map@k'] = query_df.apply(
    lambda row: map_at_k(row['relevant_ids'], row['top_product_ids'], k=10),
    axis=1,
)

# Average the per-query scores over the entire query set.
mean_average_k = query_df['map@k'].mean()
print(f'\nMean Average Precision @10: {mean_average_k:.4f}')
