In [2]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer


class search_engine_eval():
    def __init__(self,query_df,label_df,product_df):
        self.my_stop_words = list(text.ENGLISH_STOP_WORDS.union(["book"]))
        self.query_df = query_df
        self.label_df = label_df
        self.product_df = product_df
        self.grouped_label_df = label_df.groupby('query_id')
        self.vectorizer, self.tfidf_matrix = self.calculate_tfidf(self.product_df)
    #define functions for product search using Tf-IDF
    def calculate_tfidf(self, dataframe):
        """
        Calculate the TF-IDF for combined product name and description.

        Parameters:
        dataframe (pd.DataFrame): DataFrame with product_id, and other product information.

        Returns:
        TfidfVectorizer, csr_matrix: TF-IDF vectorizer and TF-IDF matrix.
        """
        # Combine product name and description to vectorize
        # NOTE: Please feel free to use any combination of columns available, some columns may contain NULL values
        combined_text = dataframe['product_name'] + ' ' + dataframe['product_description'] + ' ' + dataframe['product_class']+' ' + dataframe['category hierarchy']
        vectorizer = TfidfVectorizer(ngram_range=(1,1),use_idf=False, norm=None, stop_words='english',lowercase=True)
        # convert combined_text to list of unicode strings
        tfidf_matrix = vectorizer.fit_transform(combined_text.values.astype('U'))
        return vectorizer, tfidf_matrix
    
    def calculate_vectors(self, dataframe):
        
        pass
    def get_top_products(self, vectorizer, tfidf_matrix, query, top_n=20):
        
        """
        Get top N products for a given query based on TF-IDF similarity.

        Parameters:
        vectorizer (TfidfVectorizer): Trained TF-IDF vectorizer.
        tfidf_matrix (csr_matrix): TF-IDF matrix for the products.
        query (str): Search query.
        top_n (int): Number of top products to return.

        Returns:
        list: List of top N product IDs.
        """
        query_vector = self.vectorizer.transform([query])
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        top_product_indices = cosine_similarities.argsort()[-top_n:][::-1]
        temp = pd.DataFrame(data= {'product_id':top_product_indices}).merge(self.product_df, how = 'left', on ='product_id')
        #temp = temp[temp['review_count']>15]
        temp.sort_values('review_count',ascending=False,inplace=True)
        #print(temp['product_id'].values)
        return top_product_indices
        #return temp['product_id'].values
        
    #define functions for evaluating retrieval performance
    def map_at_k(self, true_ids, predicted_ids, k=10):
        """
        Calculate the Mean Average Precision at K (MAP@K).

        Parameters:
        true_ids (list): List of relevant product IDs.
        predicted_ids (list): List of predicted product IDs.
        k (int): Number of top elements to consider.
                 NOTE: IF you wish to change top k, please provide a justification for choosing the new value

        Returns:
        float: MAP@K score.
        """
        #print(predicted_ids)
        #if either list is empty, return 0
        if not len(true_ids) or not len(predicted_ids):
            return 0.0

        score = 0.0
        num_hits = 0.0

        for i, p_id in enumerate(predicted_ids[:k]):
            if p_id in true_ids and p_id not in predicted_ids[:i]:
                num_hits += 1.0
                score += num_hits / (i + 1.0)

        return score / min(len(true_ids), k)
    
    #implementing a function to retrieve exact match product IDs for a query_id
    def get_exact_matches_for_query(self, query_id):
        query_group = self.grouped_label_df.get_group(query_id)
        exact_matches = query_group.loc[(query_group['label'] == 'Exact')]['product_id'].values
        partial_matches = query_group.loc[ (query_group['label'] == 'Partial')]['product_id'].values


        # MATT: Output now includes both exact matches and partial matches.
        output = np.append(exact_matches, partial_matches)
        return output
    
    def get_top_product_ids_for_query(self, query):
        top_product_indices = self.get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
        top_product_ids = self.product_df.iloc[top_product_indices]['product_id'].tolist()
        return top_product_ids


In [15]:
# get search queries
query_df = pd.read_csv("WANDS/dataset/query.csv", sep='\t')
# get manually labeled groundtruth lables
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')
# get products
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')
#group the labels for each query to use when identifying exact matches


search = search_engine_eval(query_df,label_df,product_df)
# Calculate TF-IDF

#vectorizer = search.calculate_vectors(product_df)

#define the test query
query = "armchair"

#obtain top product IDs
top_product_ids = search.get_top_product_ids_for_query(query)

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])
    
    


#applying the function to obtain top product IDs and adding top K product IDs to the dataframe 
query_df['top_product_ids'] = query_df['query'].apply(search.get_top_product_ids_for_query)

#adding the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(search.get_exact_matches_for_query)

#now assign the map@k score
query_df['map@k'] = query_df.apply(lambda x: search.map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)

Top products for 'armchair':
42698 donham armchair
31564 biloxi 34.75 '' wide armchair
42697 donham 25 '' wide armchair
12756 24.41 '' wide tufted polyester armchair
41306 hartsell 33 '' wide armchair
11642 baltes 29.5 '' wide barrel chair
23907 faizah 27.6 '' wide tufted polyester armchair
42802 donham polyester lounge chair
25844 jill 29.5 '' wide armchair
1140 charnley 47 '' wide chenille armchair


In [16]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

#is_string_dtype(product_df['product_name'])
for col in product_df.columns:
    if is_string_dtype(product_df[col]):
        product_df[col] = product_df[col].str.replace('[^a-zA-Z0-9]', ' ')

  product_df[col] = product_df[col].str.replace('[^a-zA-Z0-9]', ' ')


In [17]:
# calculate the MAP across the entire query set
query_df.loc[:, 'map@k'].mean()

0.6357619047619048