In [2]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer


class search_engine_eval():
    def __init__(self,query_df,label_df,product_df):
        self.my_stop_words = list(text.ENGLISH_STOP_WORDS.union(["book"]))
        self.query_df = query_df
        self.label_df = label_df
        self.product_df = product_df
        self.grouped_label_df = label_df.groupby('query_id')
    #define functions for product search using Tf-IDF
    def calculate_tfidf(self, dataframe):
        """
        Calculate the TF-IDF for combined product name and description.

        Parameters:
        dataframe (pd.DataFrame): DataFrame with product_id, and other product information.

        Returns:
        TfidfVectorizer, csr_matrix: TF-IDF vectorizer and TF-IDF matrix.
        """
        # Combine product name and description to vectorize
        # NOTE: Please feel free to use any combination of columns available, some columns may contain NULL values
        combined_text = dataframe['product_name'] + ' ' + dataframe['product_description'] + ' ' + dataframe['product_class']+' ' + dataframe['category hierarchy']
        vectorizer = TfidfVectorizer(ngram_range=(1,1),use_idf=False, norm=None, stop_words='english',lowercase=True)
        # convert combined_text to list of unicode strings
        tfidf_matrix = vectorizer.fit_transform(combined_text.values.astype('U'))
        return vectorizer, tfidf_matrix
    
    def calculate_vectors(self, dataframe):
        
        pass
    def get_top_products(self, vectorizer, tfidf_matrix, query, top_n=20):
        
        """
        Get top N products for a given query based on TF-IDF similarity.

        Parameters:
        vectorizer (TfidfVectorizer): Trained TF-IDF vectorizer.
        tfidf_matrix (csr_matrix): TF-IDF matrix for the products.
        query (str): Search query.
        top_n (int): Number of top products to return.

        Returns:
        list: List of top N product IDs.
        """
        query_vector = vectorizer.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        top_product_indices = cosine_similarities.argsort()[-top_n:][::-1]
        temp = pd.DataFrame(data= {'product_id':top_product_indices}).merge(self.product_df, how = 'left', on ='product_id')
        #temp = temp[temp['review_count']>15]
        temp.sort_values('review_count',ascending=False,inplace=True)
        #print(temp['product_id'].values)
        return top_product_indices
        #return temp['product_id'].values
        
    #define functions for evaluating retrieval performance
    def map_at_k(self, true_ids, predicted_ids, k=10):
        """
        Calculate the Mean Average Precision at K (MAP@K).

        Parameters:
        true_ids (list): List of relevant product IDs.
        predicted_ids (list): List of predicted product IDs.
        k (int): Number of top elements to consider.
                 NOTE: IF you wish to change top k, please provide a justification for choosing the new value

        Returns:
        float: MAP@K score.
        """
        #print(predicted_ids)
        #if either list is empty, return 0
        if not len(true_ids) or not len(predicted_ids):
            return 0.0

        score = 0.0
        num_hits = 0.0

        for i, p_id in enumerate(predicted_ids[:k]):
            if p_id in true_ids and p_id not in predicted_ids[:i]:
                num_hits += 1.0
                score += num_hits / (i + 1.0)

        return score / min(len(true_ids), k)
    
    #implementing a function to retrieve exact match product IDs for a query_id
    def get_exact_matches_for_query(self, query_id):
        query_group = self.grouped_label_df.get_group(query_id)
        exact_matches = query_group.loc[(query_group['label'] == 'Exact')]['product_id'].values
        partial_matches = query_group.loc[ (query_group['label'] == 'Partial')]['product_id'].values


        # MATT: Output now includes both exact matches and partial matches.
        output = np.append(exact_matches, partial_matches)
        return output
    
    def get_top_product_ids_for_query(self, query):
        top_product_indices = self.get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
        top_product_ids = self.product_df.iloc[top_product_indices]['product_id'].tolist()
        return top_product_ids


In [8]:
# get search queries
query_df = pd.read_csv("WANDS/dataset/query.csv", sep='\t')
# get manually labeled groundtruth lables
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')
# get products
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')
#group the labels for each query to use when identifying exact matches


search = search_engine_eval(query_df,label_df,product_df)
# Calculate TF-IDF
vectorizer, tfidf_matrix = search.calculate_tfidf(product_df)
#vectorizer = search.calculate_vectors(product_df)

#define the test query
query = "armchair"

#obtain top product IDs
top_product_ids = search.get_top_product_ids_for_query(query)

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])
    
    


#applying the function to obtain top product IDs and adding top K product IDs to the dataframe 
query_df['top_product_ids'] = query_df['query'].apply(search.get_top_product_ids_for_query)

#adding the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(search.get_exact_matches_for_query)

#now assign the map@k score
query_df['map@k'] = query_df.apply(lambda x: search.map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)

[42697 42802 23907 25844 42698 11642 31564 41306 12756  1140]
Top products for 'armchair':
42698 donham armchair
31564 biloxi 34.75 '' wide armchair
42697 donham 25 '' wide armchair
12756 24.41 '' wide tufted polyester armchair
41306 hartsell 33 '' wide armchair
11642 baltes 29.5 '' wide barrel chair
23907 faizah 27.6 '' wide tufted polyester armchair
42802 donham polyester lounge chair
25844 jill 29.5 '' wide armchair
1140 charnley 47 '' wide chenille armchair
[17087 35793  7408  7465 15612  6519  7466  7468  7467 22744]
[ 7185 20331 17336 29421 20878 28095 41694 36778 22677 28096]
[36631 21048 36622 24094 36630 36533 28472  3754 14418 34739]
[21026 21028 21044 21037 25621 23599  4852  2746 11792 11849]
[20405  1526 18971  7408 15612 41955 14881  6519  2185  5248]
[19037  4721 31025 40987 42060 20646  8497 15772 19817 29667]
[17087  7408 15612 25143 29496 28930 29497  6519  5298 22744]
[  771 28703 27777 15760  4698 34926 34083 31383 38626 19580]
[30358  8778 38044 21994 28089 30355 1

[ 7185 17336 29421 28095 39948 36778 22677 41312 28096 33181]
[18607 37712   894  9996 24908  9859 30535 39327 39331 39328]
[37583  5821 20592 24330 20802 34834 18602 30920 18715 36419]
[38447 22348 28594 17257  8445  1275 29447 35446 17037 17038]
[12323 16408 19378 37500 39631 19411 18253 18268 33442 31701]
[17278 24364 38872 35099 34552 23497 24926 31332 42966 17281]
[37609  1363 16934 35326 29426 21672 40258 24233 35047 37205]
[25558 23177  1693 31487 14724  6866 26718 13760 31478 23009]
[26806  2719  7638 11140 19464 29132 36033  6731  8801 36828]
[37598 15408 10714 28989 25090 19963 21651 36861 18620   321]
[15367 13261 13263 26289 15369 21494 21492 10430 21497 25042]
[11809 30745 13520   786 31401 18489 11886  4187 11887 13519]
[  841 35957  7903  1327 37201 18506 18472 12274 12275 33023]
[16408 37500 39631 21120 33714 23312 31699 18268 33442 31701]
[10107 10108 10142 17052 35068 17835  5295  6847  5311 13811]
[15612 17251 40117 25553 25179  6519 15079 34863  7441  7442]
[39198 2

[ 1795 22813  7350 26961  7288  4367  8362 31209 24778 30646]
[33276  7332 33277 33281 31829 31828 31799 17588 16009 36438]
[23533 23570 11525 11527 11526 39689 37849 34566 37050 27793]
[ 1685  1740  1739  1753 21151 33963  9259 25584 14859  4983]
[23682 38091 15408 40188 15453 32842 42771  7768 29437  6018]
[29692 12656 35956 29756 38745 18012 35526 20927 21895 29755]
[21855 34919 39497 33583 33582 33579 30691 33581 21009 33580]
[38301 27382  2564 36834 36833 36832 41108 29090 17751 36830]
[37598 15408  7744 10714 32842 22398 28513 36847 36861 13805]
[24366  3483 17325 24361  4384 17326 18443 24360 18444 29049]
[  771 40477 15760  4698  6010 13734  4613 19281  1041  1043]
[ 7983 33708 22901 23065 41328 42608 13631 30941 36451 36454]
[33547 33542 41020  4289 18201 19129 25311 30351 20373 29513]
[13262  3942 34127   330 34129  3941 15400 20816 20817  3830]
[ 7575 16549 22506 22495 22483 22500 22508 22526 15775  9941]
[14809 14811 15340 14810 35558 28874 35622  6880  6878  6879]
[30627 1

[40592  3197 12468 33347  2914 12473 12469 36349 29898 29902]
[18550 37221 19231 20435 37625 15605 19118 15973 19664 36439]
[ 7185 20331 17336 29421 20878 28095 41694 36778 22677 28096]
[17111  1990  2047 39855  5264 19297  1974  2049 36827 33072]
[21865 15408 29350 29348 38061 20330 39547 10714 30137  8797]
[22964 25560 40044  7851 40042 29615 42809 14833  1138 20672]
[38311 11119  6421 30928 26414 27154 27094 33913 24217 13802]
[28340 31220  5460  2601 29216 34753  8626 10013  1279  9012]
[11641 20610 12659 41092 15612 33330  8427  6519 33457  3164]
[41524  2400  7578 41064 32210 36771 36769 21398 36772 16658]
[ 2019 34896  2038  7241 21123 12352 39216  2001 39142 11888]
[42100  9262 21605 18210 20532 23065 41328  6096 41415 28807]
[40909  1203  5844 33637 34592  2979 38721    18 18189 41314]
[12790 12778 12837 12872 26414 12891 12791 12903 12786 12792]
[25183 15454  7568  4759  6086 18234 35438 18245 18246 28255]
[25003 25013 24661 40444  4790  8814 26037  5414 26023  5416]
[38887  

In [9]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

#is_string_dtype(product_df['product_name'])
for col in product_df.columns:
    if is_string_dtype(product_df[col]):
        product_df[col] = product_df[col].str.replace('[^a-zA-Z0-9]', ' ')

  product_df[col] = product_df[col].str.replace('[^a-zA-Z0-9]', ' ')


In [10]:
# calculate the MAP across the entire query set
query_df.loc[:, 'map@k'].mean()

0.6357619047619048