In [1]:
def search_weighted_word_embedding(query):
    '''
    Search the product catalogue and print outfit combination(s) for the best match.

    The query is handled in one of two ways:

    * Product ID: if ``query`` equals a ``product_id`` in
      ``processed_product.csv``, the product name is printed, followed by every
      outfit in ``outfit_combinations.csv`` that contains that product (if any).
    * Free text: otherwise every product is embedded as the concatenation of a
      TF-IDF-weighted average of spaCy word vectors of its ``name`` and of its
      ``description``. The query is embedded the same way and products are
      ranked by cosine similarity. If the best match is good enough, its expert
      outfit(s) are printed; when it has none, a combination is assembled from
      the most similar product of each required category.

    Parameters
    ----------
    query : str
        Either a product ID or a free-text description of a clothing item.

    Returns
    -------
    None
        All results are printed to stdout; nothing is returned.

    Notes
    -----
    Both CSV files are read from the current working directory on every call.
    The spaCy model and scikit-learn are only imported on the free-text path,
    so a product-ID lookup needs pandas alone.
    '''
    import pandas as pd

    df = pd.read_csv('processed_product.csv')
    out_fit = pd.read_csv('outfit_combinations.csv')

    # --- Case 1: the query is an exact product ID ---------------------------
    id_matches = df.loc[df['product_id'] == query, 'name']
    if not id_matches.empty:
        matched_product_name = id_matches.iloc[0]
        print(f'Matched Product: {matched_product_name} ({query})\n')
        combos = _expert_combos(out_fit, query)
        if combos:
            print('WOW! The product is in a great combination(s):\n')
            _print_combos(combos)
        return None

    # --- Case 2: free-text query, rank the catalogue by similarity ----------
    ranked = _rank_products_by_similarity(df, query)
    best = ranked.iloc[0]
    most_matched_product = best['product_id']
    most_matched_product_name = best['name']

    # threshold for 'good' similarity
    if best['similarity'] <= _MIN_SIMILARITY or best['product_category'] == 'UNKNOWN_TOKEN':
        print('Sorry, we do not find matched product. Please check if you are searching for clothing.')
        return None

    # If the most similar product is in the outfit dataset, print the experts'
    # combos; otherwise build a recommendation from the ranked catalogue.
    combos = _expert_combos(out_fit, most_matched_product)
    if combos:
        print(f'The most similar product is {most_matched_product_name} ({most_matched_product})\n')
        print("WOW, the product is in human domain experts' outfit combination(s):\n")
    else:
        combos = _recommended_combos(ranked, best['product_category'])
        print(f'The most similar product is {most_matched_product_name} ({most_matched_product})\n')
        print('Following are our recommended outfit combinations:\n')
    _print_combos(combos)
    return None


# (column, TfidfVectorizer max_features, weight in the combined embedding).
# The description column is noisy, so the name column gets the larger weight.
_TEXT_FIELDS = (('name', 1000, 0.8), ('description', 500, 0.2))
# Only purely alphabetic tokens of two or more letters get a TF-IDF score.
_TOKEN_PATTERN = '(?u)\\b[a-zA-Z][a-zA-Z]+\\b'
# A best match at or below this cosine similarity is reported as "no match".
_MIN_SIMILARITY = 0.4
# The two outfit layouts we can recommend, as ordered category slots.
_LAYOUT_SEPARATES = ('top', 'bottom', 'shoe', 'accessory')
_LAYOUT_ONEPIECE = ('onepiece', 'shoe', 'accessory')


def _print_combos(combos):
    '''Print each combination (a dict of item type -> product label) as a numbered block.'''
    for combo_idx, combo in enumerate(combos, start=1):
        print(f'Combo {combo_idx}:\n')
        for item_type, product_label in combo.items():
            print(f'{item_type}: {product_label}\n')


def _expert_combos(out_fit, product_id):
    '''Return one {item type: "full name(id)"} dict per expert outfit containing product_id.'''
    combos = []
    outfit_ids = out_fit.loc[out_fit['product_id'] == product_id, 'outfit_id'].unique()
    for outfit_id in outfit_ids:
        details = out_fit[out_fit['outfit_id'] == outfit_id]
        labels = details['product_full_name'] + '(' + details['product_id'] + ')'
        combos.append(dict(zip(details['outfit_item_type'], labels)))
    return combos


def _tfidf_weighted_embeddings(nlp, vectorizer, texts, tfidf):
    '''
    Embed each text as the TF-IDF-weighted mean of its tokens' spaCy vectors.

    ``tfidf`` is the matrix produced by ``vectorizer`` for ``texts`` (one row
    per text). Tokens without a word vector or outside the vectorizer's
    vocabulary are ignored. A text with no usable token maps to the zero
    vector instead of dividing by zero.
    '''
    import numpy as np

    # term -> column index; replaces get_feature_names(), which was removed in
    # scikit-learn 1.2.
    vocabulary = vectorizer.vocabulary_
    dim = nlp.vocab.vectors_length
    embeddings = np.zeros((len(texts), dim))
    for row, text in enumerate(texts):
        row_weights = tfidf[row].toarray().ravel()
        weighted_sum = np.zeros(dim)
        weight_total = 0.0
        for token in nlp(text):
            column = vocabulary.get(token.text.lower())
            if column is None or not token.has_vector:
                continue
            weight = row_weights[column]
            weighted_sum += weight * token.vector
            weight_total += weight
        if weight_total > 0:
            embeddings[row] = weighted_sum / weight_total
    return embeddings


def _rank_products_by_similarity(df, query):
    '''
    Return the catalogue sorted by descending cosine similarity to query.

    The result holds ``product_id``, ``name``, ``description``,
    ``product_category`` and ``similarity`` with a fresh 0..n-1 index, so row 0
    is the best match.
    '''
    import warnings

    import numpy as np
    import en_core_web_lg
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    nlp = en_core_web_lg.load()

    doc_parts = []
    query_parts = []
    # Keep third-party warnings out of the notebook output, but only for the
    # duration of this call rather than for the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for column, max_features, weight in _TEXT_FIELDS:
            vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                         stop_words="english",
                                         max_features=max_features,
                                         token_pattern=_TOKEN_PATTERN)
            texts = list(df[column])
            doc_tfidf = vectorizer.fit_transform(texts)
            query_tfidf = vectorizer.transform([query])
            doc_parts.append(
                weight * _tfidf_weighted_embeddings(nlp, vectorizer, texts, doc_tfidf))
            query_parts.append(
                weight * _tfidf_weighted_embeddings(nlp, vectorizer, [query], query_tfidf))

        # Concatenate the weighted name and description embeddings side by side.
        doc_matrix = np.hstack(doc_parts)
        query_vector = np.hstack(query_parts)
        scores = cosine_similarity(doc_matrix, query_vector).ravel()

    return (
        df[['product_id', 'name', 'description', 'product_category']]
        .assign(similarity=scores)
        .sort_values(by='similarity', ascending=False)
        .reset_index(drop=True)
    )


def _best_in_category(ranked, category):
    '''Return "name(id)" of the highest-ranked product in category, or None if there is none.'''
    candidates = ranked[ranked['product_category'] == category]
    if candidates.empty:
        return None
    best = candidates.iloc[0]
    return best['name'] + '(' + best['product_id'] + ')'


def _recommended_combos(ranked, matched_category):
    '''
    Assemble our own outfit recommendation(s) around the matched product.

    A top/bottom match yields a [top, bottom, shoe, accessory] combo, a
    onepiece match yields [onepiece, shoe, accessory], and anything else
    (shoe/accessory) yields both. A slot whose category has no product in the
    catalogue is left out instead of raising.
    '''
    if matched_category in ('top', 'bottom'):
        layouts = [_LAYOUT_SEPARATES]
    elif matched_category == 'onepiece':
        layouts = [_LAYOUT_ONEPIECE]
    else:
        layouts = [_LAYOUT_SEPARATES, _LAYOUT_ONEPIECE]

    combos = []
    for layout in layouts:
        combo = {}
        for category in layout:
            label = _best_in_category(ranked, category)
            if label is not None:
                combo[category] = label
        if combo:
            combos.append(combo)
    return combos

In [2]:
# Demo: a non-clothing free-text query. The captured output below shows the
# function reporting that no matched product was found.
search_weighted_word_embedding('MacBook computer')

Sorry, we do not find matched product. Please check if you are searching for clothing.


In [8]:
# Demo: a product-ID query. The captured output below shows only the matched
# product name, i.e. no outfit combination was printed for this ID.
search_weighted_word_embedding('01EWTHFH4H3GP0Q34E6JBYJZNZ')

Matched Product: clara (01EWTHFH4H3GP0Q34E6JBYJZNZ)



In [9]:
# Demo: a product-ID query for a product that appears in outfit_combinations.csv.
# The captured output below lists the matched product and the combination it belongs to.
search_weighted_word_embedding('01DVA59VHYAPT4PVX32NXW91G5')

Matched Product: juan embossed mules (01DVA59VHYAPT4PVX32NXW91G5)

WOW! The product is in a great combination(s):

Combo 1:

top: Knightley Striped Cotton-Voile Shirt(01DTATDR81EZ9S7DTYW3NE1QH0)

bottom: Vanessa High-Rise Straight-Leg Jeans(01DTATGN3YQGYEPCXAD0E207TP)

shoe: Juan Embossed Mules(01DVA59VHYAPT4PVX32NXW91G5)



In [7]:
# Demo: a free-text query. The captured output below shows the "recommended
# outfit combinations" path, i.e. a combo assembled by the function itself.
# NOTE(review): the shoe/accessory slots in the output are hoodies — presumably
# the product_category labels in the data are off; verify against the CSV.
search_weighted_word_embedding('pink shirt crewneck')

The most similar product is pink spacedye crewneck (01EPZB9YGRNEBENQCNS24V3WWP)

Following are our recommended outfit combinations:

Combo 1:

top: pink spacedye crewneck(01EPZB9YGRNEBENQCNS24V3WWP)

bottom: embroidered gouyen shorts pink(01ED4N1V910AZ276A5HY8AYNCM)

shoe: sl x two bridges hoodie in hot pink(01EHWB4Z1V5FFVA7D17DDP0ZNW)

accessory: gola hoodie pink(01EF50WZEGCFG5A3Q54QWP7Q9H)



In [11]:
# Demo: a free-text query whose best match is shown in the "bottom" slot of the
# captured output below; the combo is again one assembled by the function itself.
# NOTE(review): the top slot in the output is a pair of jeans — presumably a
# product_category labelling issue in the data; verify against the CSV.
search_weighted_word_embedding('high rise straight jeans')

The most similar product is toothpick high rise jeans (01E5ZX91EHF6W52B4B9GHEVJMC)

Following are our recommended outfit combinations:

Combo 1:

top: oriana high waist straight leg velveteen jeans(01E2KX2A2EDTFS7ZPZZE3YT2A9)

bottom: toothpick high rise jeans(01E5ZX91EHF6W52B4B9GHEVJMC)

shoe: ★ high cut bottom – black jersey(01ET5VYBZ1VYN5ED4ZTFBD224R)

accessory: the tie belted high leg high rise bottom(01EAFFATXRK3XPZRS903FQ3XDE)

