In [130]:
import torch
import torch.nn.functional as F
import pandas as pd
import utils
import numpy as np

In [131]:
# Raw (unprocessed) Office Products review and metadata dumps. They are used
# further down to show the original review text and product descriptions.
# NOTE(review): utils.getDF is project-local; presumably it returns a pandas
# DataFrame (the results are filtered with boolean masks later) - confirm.
raw_reviews_path = 'data/Office_Products_5.json.gz'
raw_metadata_path = 'data/meta_Office_Products.json.gz'

original_reviews = utils.getDF(raw_reviews_path)
original_products = utils.getDF(raw_metadata_path)

In [132]:
# Pre-processed, down-sampled product and review tables.
all_products = pd.read_csv('data/products_sampled_processed.csv')
all_reviews = pd.read_csv('data/reviews_sampled_processed.csv')

# Number of whitespace-separated tokens in each cleaned review.
# read_csv turns empty text cells into NaN (a float), on which a plain
# `x.split()` would raise AttributeError; treat missing text as a zero-word
# review instead. The vectorised .str accessor also avoids a Python-level
# lambda call per row.
all_reviews['review_length'] = (
    all_reviews['reviewText']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

all_reviews.head(1)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview,review_length
0,4.0,"November 07, 2017",A2NIJTYWADLK57,140503528,cotton clay,kid like story really wanted board book one sm...,good story small size book though,0.967635,11


In [133]:
def _load_embeddings(field):
    """Load the saved embedding tensor for one text field from the data/ folder."""
    return torch.load(f'data/{field}_embeddings.pt')


# Review-side text fields.
rev_embeddings = _load_embeddings('review')
summ_embeddings = _load_embeddings('summary')

# Product-side text fields.
desc_embeddings = _load_embeddings('description')
titles_embeddings = _load_embeddings('title')
features_embeddings = _load_embeddings('feature')
brand_embeddings = _load_embeddings('brand')

In [134]:
# One vector per product row: element-wise mean over the four product text
# fields (description, title, features, brand).
product_embeddings_average = torch.stack(
    [desc_embeddings, titles_embeddings, features_embeddings, brand_embeddings]
).mean(dim=0)

# One vector per *review* row (not yet per reviewer): element-wise mean of the
# review-text and summary embeddings.
user_embeddings_average = torch.stack(
    [rev_embeddings, summ_embeddings]
).mean(dim=0)


In [135]:
# asin -> averaged product vector. The DataFrame index label is used as the row
# position in the embedding matrix, so this relies on all_products keeping the
# default 0..n-1 index produced by read_csv.
# If an asin appears more than once, the last occurrence wins.
product_embeddings_map = {
    asin: product_embeddings_average[row_label]
    for row_label, asin in all_products['asin'].items()
}

In [136]:
def calculate_weight(row, max_rating=5, reference_year=2021, length_scale=20):
    """Score one review as the mean of its scaled rating, recency and length.

    Parameters
    ----------
    row : int
        Position (as understood by ``DataFrame.iloc``) of the review inside the
        module-level ``all_reviews`` frame.
    max_rating : number, default 5
        Divisor applied to the ``overall`` star rating.
    reference_year : number, default 2021
        Divisor applied to the review year parsed from ``reviewTime``.
    length_scale : number, default 20
        Divisor applied to ``review_length`` (word count).

    Returns
    -------
    float
        ``(overall / max_rating + year / reference_year
        + review_length / length_scale) / 3``. The result is not clamped, so a
        review longer than ``length_scale`` words contributes more than 1 to
        the sum.

    Raises
    ------
    ValueError
        If the last token of ``reviewTime`` is not an integer year.
    """
    review = all_reviews.iloc[row]

    rating_score = review['overall'] / max_rating

    # reviewTime looks like "November 07, 2017": the year is the last token.
    # split() without arguments tolerates trailing or repeated whitespace,
    # where split(' ') would yield an empty last token and crash int().
    review_year = int(str(review['reviewTime']).split()[-1])
    recency_score = review_year / reference_year

    length_score = review['review_length'] / length_scale

    return (rating_score + recency_score + length_score) / 3

In [137]:
# Group the review-level vectors by the reviewer who wrote them. As with the
# product map, the DataFrame index label doubles as the row position in
# user_embeddings_average.
vectors_by_reviewer = {}
for row_label, reviewer_id in all_reviews['reviewerID'].items():
    vectors_by_reviewer.setdefault(reviewer_id, []).append(user_embeddings_average[row_label])

# Collapse each reviewer's vectors into a single mean vector.
user_embeddings_map = {
    reviewer_id: torch.stack(vectors).mean(dim=0)
    for reviewer_id, vectors in vectors_by_reviewer.items()
}

# The per-review lists are no longer needed once the means are computed.
del vectors_by_reviewer


In [138]:
# Attach the per-reviewer mean vector to every review row; each cell of the new
# column holds a tensor object. A reviewerID missing from user_embeddings_map
# raises KeyError.
all_reviews['user_embeddings'] = all_reviews['reviewerID'].apply(lambda x: user_embeddings_map[x])
# Attach the reviewed product's vector to every review row; raises KeyError if
# a review references an asin that is not a key of product_embeddings_map.
all_reviews['product_embeddings'] = all_reviews['asin'].apply(lambda x: product_embeddings_map[x])
# Same lookup on the product table itself, so each product row carries its vector.
all_products['product_embeddings'] = all_products['asin'].apply(lambda x: product_embeddings_map[x])


In [139]:
# Pick a random query reviewer and fetch their per-reviewer vector.
user = all_reviews['reviewerID'].sample(1).values[0]

reviews_sample = all_reviews.sample(frac=0.5)
user_embeddings = all_reviews[all_reviews['reviewerID'] == user]['user_embeddings'].values[0]

# Every review row of a reviewer carries the same per-reviewer vector, so one
# row per reviewer is enough; this avoids recomputing the same similarity once
# per review.
candidate_users = reviews_sample.drop_duplicates(subset='reviewerID')
# Leave the query reviewer out, mirroring the product-similarity cell below.
# Otherwise they rank as their own nearest neighbour with similarity ~1.0.
candidate_users = candidate_users[candidate_users['reviewerID'] != user]

other_users_similaties = {
    other_user: F.cosine_similarity(user_embeddings, other_user_embedding, dim=0).item()
    for other_user, other_user_embedding in zip(
        candidate_users['reviewerID'], candidate_users['user_embeddings']
    )
}

least_similar_user = min(other_users_similaties, key=other_users_similaties.get)
print("Minimum user similarity:", least_similar_user, other_users_similaties[least_similar_user])

# Most similar reviewers first.
sorted_users = sorted(other_users_similaties.items(), key=lambda x: x[1], reverse=True)

# Show up to five raw reviews for each of the five most similar reviewers.
top_users = sorted_users[:5]
top_users_reviews = {}
for u, _ in top_users:
    user_reviews = original_reviews[original_reviews['reviewerID'] == u]
    # sample(5) raises ValueError when fewer than five rows are available.
    n_shown = min(5, len(user_reviews))
    top_users_reviews[u] = user_reviews.sample(n_shown)['reviewText'].values

top_users_reviews


Minimum user similarity: AN1JBPWLTNJQQ 0.8226282596588135


{'A10HH1WCT8AW4G': array(['I sell books on Amazon and needed a box that fit well for single book shipments.  This is the second time I have ordered these boxes. There are two fold lines to adjust the box to fit the book.  This time it was a little harder to find one of the fold lines but overall the box works perfectly for my use. I would recommend this box and will order it again.',
        'I sell books on Amazon and needed a box that fit well for single book shipments.  This is the second time I have ordered these boxes. There are two fold lines to adjust the box to fit the book.  This time it was a little harder to find one of the fold lines but overall the box works perfectly for my use. I would recommend this box and will order it again.',
        'If I purchase HP901 Tri-Color ink, it lasts a lot longer. My package arrive torn and partially open but that probably happened during shipment because it has never happened with this vendor.  The cartridge was not damaged.  My last car

In [140]:
# Pick a random query product and fetch its averaged vector.
product = all_products['asin'].sample(1).values[0]
product_embeddings = all_products.loc[all_products['asin'] == product, 'product_embeddings'].iloc[0]

# Cosine similarity between the query product and every other product.
other_products_similaties = {
    other_product: F.cosine_similarity(product_embeddings, other_product_embedding, dim=0).item()
    for other_product, other_product_embedding in zip(
        all_products['asin'], all_products['product_embeddings']
    )
    if other_product != product
}

# Most similar first; keep the five nearest neighbours.
sorted_products = list(other_products_similaties.items())
sorted_products.sort(key=lambda pair: pair[1], reverse=True)

top_products = sorted_products[:5]


In [141]:
# Show the query ASIN, then the (asin, cosine similarity) pairs currently held
# in top_products, most similar first.
print(product)
top_products

B001U885CQ


[('B001U885CG', 0.9997738599777222),
 ('B001U885DU', 0.9956530332565308),
 ('B00136523K', 0.9932079911231995),
 ('B007PV3Z0E', 0.9931053519248962),
 ('B00IXQ2LGY', 0.9919257760047913)]

In [142]:
# Raw (unprocessed) description of each neighbour, taken from the original
# metadata dump; the first matching row is used.
top_products_descriptions = {
    p: original_products.loc[original_products['asin'] == p, 'description'].iloc[0]
    for p, _ in top_products
}

# The query product's description comes from the processed table instead.
query_description = all_products.loc[all_products['asin'] == product, 'description'].iloc[0]
print(product, query_description)
top_products_descriptions

B001U885CQ igh quality 2roller system maximum laminating width 13 accepts hot film 35 mil thick laminate speed 118 per min ready time 5 minute patented cooling guide bar hot laminated item lay flat cool photo lamination possible hot shoe heating method easy use button control temperature easy read led displa


{'B001U885CG': ['High quality 2-roller system. Maximum laminating width of 9" .Accepts hot film 3-5 mil thick .Laminates at a speed of 1.18\' per min Ready time of about 5 minutes . Patented cooling guide bars all hot laminated items to lay flat and cool .Photo lamination possible .Hot shoe heating method ~ Easy to use buttons control temperature .Easy to read LED display'],
 'B001U885DU': ['4-roller system helps provide bubble and wrinkle-free lamination .Maximum laminating width of 12" .Accepts hot film 3-10 mil thick .Laminates at a speed of 1.3\' per min .Quick ready time of about 3 minutes .Easy to use soft touch controls to adjust temperature .Heat plate heating method. Easy to read LED display'],
 'B00136523K': ["Royal Sovereign's APL-330U 13&quot; laminator is a great solution for photo and document laminating applications found for the home and office environment. The APL-330U utilizes a quality and reliable 4-roller system that provides professional and clean lamination finis

In [143]:
# Rank every product against the query reviewer's vector (user_embeddings,
# set in the reviewer-similarity cell). This reuses and overwrites the
# names from the product-to-product cell.
other_products_similaties = {
    other_product: F.cosine_similarity(user_embeddings, other_product_embedding, dim=0).item()
    for other_product, other_product_embedding in zip(
        all_products['asin'], all_products['product_embeddings']
    )
}

# Most similar first; keep the five best matches.
sorted_products = list(other_products_similaties.items())
sorted_products.sort(key=lambda pair: pair[1], reverse=True)

top_products = sorted_products[:5]

top_products


[('B009GAOOF2', 0.9907962679862976),
 ('B002W6ZVD6', 0.989658772945404),
 ('B002A109NA', 0.9887592196464539),
 ('B0037B91NQ', 0.9885908961296082),
 ('B00O9HABPE', 0.9883636236190796)]

In [144]:
# Raw description of each product recommended for the query reviewer, taken
# from the original metadata dump; the first matching row is used.
top_products_descriptions = {
    p: original_products.loc[original_products['asin'] == p, 'description'].iloc[0]
    for p, _ in top_products
}

top_products_descriptions

{'B009GAOOF2': ['You\'ll be able to carry all you\'ll ever need when you\'re out on the town: ID, credit card, business cards, and a little cash. This Id case measures approximately 4" x 2 7/8" x 7/8".'],
 'B002W6ZVD6': ['This sophisticated set comes with'],
 'B002A109NA': ['"I need more money and power and less shit from you people". Retro design humor will bring a smile to your face. She is only saying what we are all thinking anyway. Very nice quality.'],
 'B0037B91NQ': ["Never forget another appointment againadd a littlegifts sticky note to your agenda book and you'll not only be on time but going in style."],
 'B00O9HABPE': ['Multipurpose']}

In [145]:
# Sanity check: print the shape of each embedding matrix, product-side fields
# first, then review-side fields.
for embedding_matrix in (
    desc_embeddings,
    titles_embeddings,
    features_embeddings,
    brand_embeddings,
    rev_embeddings,
    summ_embeddings,
):
    print(embedding_matrix.shape)


torch.Size([233462, 768])
torch.Size([233462, 768])
torch.Size([233462, 768])
torch.Size([233462, 768])
torch.Size([661468, 768])
torch.Size([661468, 768])


In [146]:
from sklearn.cluster import KMeans

# Cluster the per-reviewer vectors into five groups.
users = list(user_embeddings_map)
users_embeddings = torch.stack([user_embeddings_map[u] for u in users])
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(users_embeddings)
clusters = kmeans.predict(users_embeddings)

# cluster label -> reviewer ids assigned to it, in the order of `users`.
users_clusters = {}
for member, label in zip(users, clusters):
    users_clusters.setdefault(label, []).append(member)

# Keep the first two members of every cluster as a spot-check pair.
users_clusters_instances = {
    label: members[:2] for label, members in users_clusters.items()
}

# Cosine similarity of that pair, per cluster. Indexing position 1 raises
# IndexError if a cluster ended up with a single member.
users_similarities = {
    label: F.cosine_similarity(
        user_embeddings_map[pair[0]], user_embeddings_map[pair[1]], dim=0
    ).item()
    for label, pair in users_clusters_instances.items()
}

users_similarities



{3: 0.995256781578064,
 0: 0.9751606583595276,
 2: 0.9869819283485413,
 1: 0.9920946359634399,
 4: 0.9921297430992126}

In [147]:
# Cluster the per-product vectors into five groups (same procedure as for
# reviewers; `kmeans` and `clusters` are overwritten).
products = list(product_embeddings_map)
products_embeddings = torch.stack([product_embeddings_map[p] for p in products])
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(products_embeddings)

clusters = kmeans.predict(products_embeddings)

# cluster label -> product ids assigned to it, in the order of `products`.
products_clusters = {}
for member, label in zip(products, clusters):
    products_clusters.setdefault(label, []).append(member)

# Keep the first two members of every cluster as a spot-check pair.
products_clusters_instances = {
    label: members[:2] for label, members in products_clusters.items()
}

# Cosine similarity of that pair, per cluster. Indexing position 1 raises
# IndexError if a cluster ended up with a single member.
products_similarities = {
    label: F.cosine_similarity(
        product_embeddings_map[pair[0]], product_embeddings_map[pair[1]], dim=0
    ).item()
    for label, pair in products_clusters_instances.items()
}

products_similarities
        

{2: 0.9625016450881958,
 1: 0.9685355424880981,
 3: 0.9877108931541443,
 4: 0.9732388854026794,
 0: 0.9583519697189331}