In [1]:
import re
import pickle
import numpy as np
from itertools import chain, product
from difflib import get_close_matches, SequenceMatcher
from gensim.models import Word2Vec

In [2]:
# Load the twelve per-cluster Word2Vec models (cluster ids 0 through 11), in id order.
cluster_item_models = list(
    map(
        Word2Vec.load,
        (f"../cluster_models/model_cluster_{id}.model" for id in range(12)),
    )
)

In [3]:
# Load the product lookup table from disk. Judging by how it is used further down,
# it presumably maps product id -> product name — TODO confirm against the producer.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('product_lookup.pkl', 'rb') as file:
    product_lookup = pickle.load(file)

In [4]:
# Inverse of product_lookup: swap every (key, value) pair so values become keys.
# If several keys share a value, the last one encountered wins.
product_id_lookup = {value: key for key, value in product_lookup.items()}

In [42]:
# Free-text item names making up the example basket.
basket_items = (
    'Avocado',
    'Craft Beer',
    'Kiwi',
)

In [43]:
# Fuzzy-match every basket item against the known product names and flatten the
# per-item results into one list (basket order first, then match order).
matches = [
    close_match
    for item in basket_items
    for close_match in get_close_matches(item, product_lookup.values())
]

In [44]:
# Translate each matched name back to its key in product_lookup.
product_ids_matches = [
    product_id_lookup[matched_name] for matched_name in matches
]

In [45]:
def filter_matches(cluster_model, product_ids_matches):
    """Keep only the product ids that the given cluster model knows about.

    Args:
        cluster_model: Object exposing a ``wv`` attribute that supports
            membership tests (``product_id in cluster_model.wv``).
        product_ids_matches: Iterable of candidate product ids.

    Returns:
        list: The ids from ``product_ids_matches`` that are contained in
        ``cluster_model.wv``, in their original order (duplicates preserved).
    """
    known_ids = cluster_model.wv
    # Use the ``in`` operator instead of calling ``__contains__`` directly.
    return [product_id for product_id in product_ids_matches if product_id in known_ids]

# Restrict the candidate ids to those present in the first cluster's model.
filtered_matches = filter_matches(
    cluster_model=cluster_item_models[0],
    product_ids_matches=product_ids_matches,
)

In [46]:
# Look up the display name for every id that survived the filter.
product_names = [product_lookup[product_id] for product_id in filtered_matches]

In [47]:
# All ordered pairs of matched names, excluding pairs whose two names are equal.
cross_product_names = [
    (left_name, right_name)
    for left_name, right_name in product(product_names, product_names)
    if left_name != right_name
]

In [48]:
# String similarity (difflib ratio) for every pair of distinct matched names.
similarity_ratios = [
    SequenceMatcher(None, first_name, second_name).ratio()
    for first_name, second_name in cross_product_names
]

# If the matched names are, on average, more than 50% similar to one another,
# treat them as near-duplicates and keep only the first match. With no pairs to
# compare, keep every match; the explicit emptiness check avoids np.mean([]),
# which would emit a "Mean of empty slice" RuntimeWarning and evaluate to nan.
if similarity_ratios and np.mean(similarity_ratios) > 0.5:
    filtered_matches_cleared = [filtered_matches[0]]
else:
    filtered_matches_cleared = filtered_matches

In [49]:
def average_item_vectors(cluster_model, product_ids_matches):
    """Combine the vectors of the given product ids into a single vector.

    Args:
        cluster_model: Object whose ``wv`` attribute can be indexed by product id.
        product_ids_matches: Sequence of product ids to look up in ``cluster_model.wv``.

    Returns:
        For exactly one id, the looked-up vector itself (no averaging, no copy made
        here). Otherwise the element-wise mean of all looked-up vectors.
    """
    keyed_vectors = cluster_model.wv

    # Single item: hand back its vector directly instead of averaging.
    if len(product_ids_matches) == 1:
        return keyed_vectors[product_ids_matches[0]]

    return np.mean([keyed_vectors[pid] for pid in product_ids_matches], axis=0)

In [50]:
# Single vector representing the whole (de-duplicated) basket in cluster 0's space.
basket_vector = average_item_vectors(
    cluster_model=cluster_item_models[0],
    product_ids_matches=filtered_matches_cleared,
)

In [51]:
def retrieve_most_similar_products(cluster_model, product_lookup, basket_vector, n_matches):
    """Return the names of the products most similar to ``basket_vector``.

    Args:
        cluster_model: Object whose ``wv`` attribute provides
            ``similar_by_vector(vector, topn=...)`` returning a ranked sequence
            whose items carry the product id at index 0.
        product_lookup: Mapping from product id to product name.
        basket_vector: Query vector passed to ``similar_by_vector``.
        n_matches: Number of neighbours requested from the model.

    Returns:
        list: Product names for the ranked neighbours with the top hit removed,
        i.e. at most ``n_matches - 1`` names.

    NOTE(review): the first neighbour is always discarded, presumably on the
    assumption that it is the query item itself — confirm this is intended for
    averaged basket vectors.
    """
    ranked_neighbours = cluster_model.wv.similar_by_vector(basket_vector, topn=n_matches)

    # Skip the top-ranked neighbour and map the remaining ids to names.
    return [
        product_lookup[neighbour[0]]
        for neighbour in ranked_neighbours[1:n_matches]
    ]

In [53]:
# Query cluster 0's model for the products nearest to the basket vector.
recommendations = retrieve_most_similar_products(
    cluster_model=cluster_item_models[0],
    product_lookup=product_lookup,
    basket_vector=basket_vector,
    n_matches=5,
)
recommendations

['Avocado',
 'Super Simple Sprouted Flax Snax',
 'Organic Limes',
 'Organic Red Leaf Lettuce']

In [54]:
# Remove recommendations whose name exactly equals one of the basket items.
filtered_recommendations = [
    name for name in recommendations if name not in basket_items
]

In [55]:
# Show the final recommendation list as the cell's output.
filtered_recommendations

['Super Simple Sprouted Flax Snax',
 'Organic Limes',
 'Organic Red Leaf Lettuce']