In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from collections import defaultdict
from datetime import datetime
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

In [2]:
rating_path = './data/All_Beauty.csv'

In [3]:
custom_header = ['item', 'user', 'rating', 'timestamp']
ratings_df = pd.read_csv(rating_path, header = None, names = custom_header)
print(ratings_df.shape)
ratings_df.head()

(371345, 4)


Unnamed: 0,item,user,rating,timestamp
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


In [4]:
rating_df = ratings_df.sort_values(by=['user', 'timestamp'])
rating_df = rating_df.reset_index(drop = True)
print(rating_df.shape)
rating_df.head()

(371345, 4)


Unnamed: 0,item,user,rating,timestamp
0,B01FHDYGQ8,A0010876CNE3ILIM9HV0,5.0,1476230400
1,B01FCW88D6,A001170867ZBE9FORRQL,5.0,1476576000
2,B01B18T01Y,A001170867ZBE9FORRQL,5.0,1487203200
3,B00TZ8XK1E,A0028738FYF1SKPPC7B1,5.0,1440028800
4,B018H0EJI8,A0045356A23634W7RI4M,3.0,1463961600


In [5]:
ratings_6core = rating_df.groupby('user').filter(lambda x: len(x) >= 6)
print(ratings_6core.shape)
ratings_6core.head()

(4239, 4)


Unnamed: 0,item,user,rating,timestamp
538,B000CR4ER6,A105A034ZG9EHO,4.0,1155081600
539,B000EIOAFY,A105A034ZG9EHO,5.0,1268697600
540,B0009RF9DW,A105A034ZG9EHO,5.0,1404604800
541,B000FI4S1E,A105A034ZG9EHO,5.0,1404604800
542,B000URXP6E,A105A034ZG9EHO,5.0,1404604800


In [6]:
print("Unique items:", ratings_6core['item'].nunique())
print("Unique users:", ratings_6core['user'].nunique())

Unique items: 1220
Unique users: 624


In [7]:
ratings_6core.rating.value_counts(normalize = True)

rating
5.0    0.795235
4.0    0.122906
3.0    0.043406
2.0    0.023119
1.0    0.015334
Name: proportion, dtype: float64

In [8]:
reviews_path = './data/All_Beauty.json'
reviews = []
with open(reviews_path, 'r') as f:
    for line in f:
        reviews.append(json.loads(line))
print(len(reviews))

371345


In [9]:
reviews[0]

{'overall': 1.0,
 'verified': True,
 'reviewTime': '02 19, 2015',
 'reviewerID': 'A1V6B6TNIC10QE',
 'asin': '0143026860',
 'reviewerName': 'theodore j bigham',
 'reviewText': 'great',
 'summary': 'One Star',
 'unixReviewTime': 1424304000}

### Creating user and item mapping

In [10]:
ratings_6core = ratings_6core.reset_index(drop = True)
ratings_6core.head()

Unnamed: 0,item,user,rating,timestamp
0,B000CR4ER6,A105A034ZG9EHO,4.0,1155081600
1,B000EIOAFY,A105A034ZG9EHO,5.0,1268697600
2,B0009RF9DW,A105A034ZG9EHO,5.0,1404604800
3,B000FI4S1E,A105A034ZG9EHO,5.0,1404604800
4,B000URXP6E,A105A034ZG9EHO,5.0,1404604800


In [11]:
item_mapping = {item: idx for idx, item in enumerate(ratings_6core['item'].unique())}
user_mapping = {user: idx for idx, user in enumerate(ratings_6core['user'].unique())}

In [12]:
print(len(item_mapping), len(user_mapping))

1220 624


In [13]:
reverse_item_mapping = {v:k for k,v in item_mapping.items()}
reverse_user_mapping = {v:k for k,v in user_mapping.items()}

In [14]:
print(len(reverse_item_mapping), len(reverse_user_mapping))

1220 624


In [15]:
id = 2
print(id, reverse_user_mapping[id], user_mapping[reverse_user_mapping[id]])

2 A10ZBR6O8S8OCY 2


In [16]:
id = 374
print(id, reverse_item_mapping[id], item_mapping[reverse_item_mapping[id]])

374 B015VKZ9R2 374


In [18]:
with open('./data/item_idx_mapping.json', 'w+') as f:
    json.dump(item_mapping, f)
with open('./data/idx_item_mapping.json', 'w+') as f:
    json.dump(reverse_item_mapping, f)
with open('./data/user_idx_mapping.json', 'w+') as f:
    json.dump(user_mapping, f)
with open('./data/idx_user_mapping.json', 'w+') as f:
    json.dump(reverse_user_mapping, f)

### Creating train, test, valid dataset

In [19]:
ratings_6core['item'] = ratings_6core['item'].map(item_mapping)
ratings_6core['user'] = ratings_6core['user'].map(user_mapping)
ratings_6core.head()

Unnamed: 0,item,user,rating,timestamp
0,0,0,4.0,1155081600
1,1,0,5.0,1268697600
2,2,0,5.0,1404604800
3,3,0,5.0,1404604800
4,4,0,5.0,1404604800


In [20]:
ratings_6core.to_csv('./data/ratings.csv', index = False)

In [21]:
user_item_interactions = dict()
for user in list(ratings_6core['user'].unique()):
    user_item_interactions[user] = ratings_6core[ratings_6core['user']==user]['item'].tolist()
print(len(user_item_interactions))

624


In [22]:
user_item_interactions[0]

[0, 1, 2, 3, 4, 5, 5, 6]

In [29]:
user_train = dict()
user_valid = dict()
user_test = dict()
user_profile = dict()
for user, item_list in user_item_interactions.items():
    user_profile[int(user)] = item_list[:-6]
    user_train[int(user)] = item_list[-8:-2]
    user_valid[int(user)] = item_list[-7:-1]
    user_test[int(user)] = item_list[-6:]
    # break

In [34]:
id = 54
print(user_profile[id])
print(user_train[id])
print(user_valid[id])
print(user_test[id])

[28, 173, 174]
[173, 174, 175, 176, 177, 178]
[174, 175, 176, 177, 178, 179]
[175, 176, 177, 178, 179, 180]


In [35]:
with open('./data/user_profile.json', 'w+') as f:
    json.dump(user_profile, f)
with open('./data/user_train.json', 'w+') as f:
    json.dump(user_train, f)
with open('./data/user_valid.json', 'w+') as f:
    json.dump(user_valid, f)
with open('./data/user_test.json', 'w+') as f:
    json.dump(user_test, f)

### Creating item descriptions

Idea is to create the item description from meta file and reviews file
- Use title, brand and price when available from the meta file
- Use user reviews from the reviews file
    - Identify how to keep the reviews in limit and need to use llm for summarizing

In [36]:
meta_fashion_path = './data/meta_All_Beauty.json'
meta_fashion = []
with open(meta_fashion_path, 'r') as f:
    for line in f:
        meta_fashion.append(json.loads(line))
print(len(meta_fashion))

32892


In [37]:
all_keys = []
for item in meta_fashion:
    item_keys = list(item.keys())
    all_keys.extend(item_keys)

In [38]:
all_keys = list(set(all_keys))
print(all_keys)

['brand', 'tech2', 'category', 'also_buy', 'similar_item', 'date', 'details', 'description', 'tech1', 'title', 'imageURLHighRes', 'fit', 'imageURL', 'also_view', 'main_cat', 'price', 'feature', 'rank', 'asin']


In [39]:
count_desc = 0
total_len = 0
for item in meta_fashion:
    if 'description' not in item:
        continue
    if item['description']=='' or item['description']==' ':
        continue
    if len(item['description']) == 0:
        continue
    count_desc += 1
    cur_len = len(item['description'][0].strip().split(' '))
    total_len += cur_len
    if cur_len > 100:
        print(item['description'][0])
print(count_desc/len(meta_fashion))
print(f"Avg len: {total_len/count_desc}")

If you can design one thing, you can design everything. The Italian architect and designer Lella Vignelli has turned her hand to every kind of project, from furniture, interiors, showrooms and exhibitions to product design, silverware and clothing. In the beginning of the 1960s she established the Vignelli Office of Design and Architecture in Milan together with her husband Massimo Vignelli. In the end of the decade the pair settled in New York and launched one of the worlds biggest design firms at the time, Unimark International. Lella Vignelli received AIGAs Gold Medal in 1983.

Hall of Femmes: Lella Vignelli includes an introductionary essay  by Martha Scotford, Professor of Graphic Design and author of Cipe Pineles: a Life of Design. It is richly illustrated, much of it never before published.
Finale Whitening Cream for Armpit/Inner thigh/Elbow/Knee Size: 30 grams / 1.05 oz. It helps brightening underarm, groin area. Lightens dark spot while removing dead skin cell gradually within

In [41]:
required_keys = ['brand', 'price', 'description']

In [42]:
item_information = dict()

cnt = 0
for item in meta_fashion:
    if item['asin'] in item_mapping:
        item_id = item_mapping[item['asin']]
        # print("item_id:", item_id)
        item_information[item_id] = dict()
        cnt += 1
        ### meta information
        if 'title' in item:
            item_information[item_id]['title'] = item['title']
        for key in required_keys:
            if key in item:
                if key == 'brand':
                    item_information[item_id]['brand'] = item[key]
                elif key == 'description' and len(item['description']) > 0:
                    item_information[item_id]['description'] = ' '.join(word for word in item['description'][0].strip().split(' ')[:100])
                elif key == 'price':
                    item_information[item_id]['price'] = item[key]
        ### review information
    # if cnt > 10:
    #     break
# print(item_information)

In [43]:
print(len(item_information))

1216


In [44]:
reviews[0]

{'overall': 1.0,
 'verified': True,
 'reviewTime': '02 19, 2015',
 'reviewerID': 'A1V6B6TNIC10QE',
 'asin': '0143026860',
 'reviewerName': 'theodore j bigham',
 'reviewText': 'great',
 'summary': 'One Star',
 'unixReviewTime': 1424304000}

In [45]:
from collections import defaultdict
item_reviews = dict()
for item in meta_fashion:
    if item['asin'] in item_mapping:
        item_id = item_mapping[item['asin']]
        item_reviews[item_id] = defaultdict(list)
        for review in reviews:
            if review['asin'] == item['asin']:
                if 'verified' in review:
                    if review['verified'] == True:
                        if 'reviewText' in review:
                            if review['reviewText'] != '' or review['reviewText'] != ' ':
                                if 'overall' in review:
                                    if review['overall'] != '' or review['overall'] != ' ':
                                        item_reviews[item_id][review['overall']].append(review['reviewText'])
        # break

In [46]:
len(item_reviews)

1216

In [47]:
item_reviews[657]

defaultdict(list,
            {3.0: ["ouch...like rubbing needles on your scalp....I'll try it for a week and see if I can take it....will write an update"],
             5.0: ['Big difference in my scalp - it is looser - has movement - new hair are beginning to sprout',
              'i think the benefits of using this will help with hair growth to early to tell i only just got it. I think it will be good',
              'I ordered a Magnaroller and I must say Im pleasantly surprised with the quality of this product. Its durable, easy to use and feels good on my scalp. Its a great value for the money I think. Im excited about using it.',
              'I got my Magnaroller about 4 months ago and I can honestly say my hair loss has dropped considerably, I am not bald but a hair line that has receded about an inch or maybe a little more. Of I still have some hair fall, but everyone does and this was extremely painful for me to use at first but it does work for me and I can see some smal

In [48]:
len(item_information)

1216

In [50]:
for item in item_information:
    item_information[item]['reviews'] = item_reviews[item]

In [51]:
with open('./data/item_information.json', 'w+') as f:
    json.dump(item_information, f)

### Selecting Reviews for Item Description

In [52]:
item_information_path = './data/item_information.json'
with open(item_information_path, 'r') as f:
    item_information = json.load(f)

In [53]:
len(item_information)

1216

In [54]:
item_information['0']

{'title': 'Fresh Eau De Parfum EDP - Fig Apricot 3.4oz (100ml)',
 'brand': 'Fresh',
 'price': '',
 'description': 'FIG APRICOT EAU DE PARFUM: a rich fruity floral with green tea, musc, and petitgrain. NOTES: - Top: Turkish apricot, peach skin, lychee - Heart: fig leaf, petitgrain, dandelion - Base: green tea, musc, marine notes',
 'reviews': {'4.0': ['This is the second time I have purchased this scent and still love it. I love the fresh line because it is one of the few products I can wear on my skin without an allergic reaction. The scent never fails to generate compliments. KMRN']}}

In [55]:
for item, information in item_information.items():
    print(item, [len(review.split(' ')) for review in information['reviews']])

882 [1, 1, 1, 1, 1]
17 [1, 1, 1, 1, 1]
257 [1, 1, 1]
555 [1, 1]
1016 [1, 1, 1, 1, 1]
239 []
119 [1, 1, 1, 1, 1]
441 [1, 1, 1, 1]
241 [1, 1, 1, 1, 1]
672 []
519 [1, 1, 1, 1]
18 [1, 1, 1, 1, 1]
240 []
994 [1, 1, 1, 1, 1]
1012 [1, 1, 1, 1, 1]
725 [1, 1, 1, 1, 1]
643 [1, 1, 1, 1, 1]
1031 [1, 1, 1, 1]
23 [1, 1, 1, 1, 1]
742 [1, 1, 1, 1, 1]
572 [1, 1]
547 [1, 1, 1, 1, 1]
983 [1, 1, 1]
351 [1, 1, 1, 1, 1]
518 [1, 1, 1, 1, 1]
254 [1, 1, 1]
436 [1, 1, 1, 1, 1]
647 [1, 1, 1, 1, 1]
449 []
376 [1, 1, 1, 1, 1]
645 [1, 1]
256 [1, 1, 1, 1, 1]
567 [1, 1, 1, 1, 1]
522 [1, 1, 1, 1, 1]
24 [1]
935 [1, 1, 1, 1]
1115 [1, 1, 1, 1]
784 [1, 1, 1, 1, 1]
636 [1, 1, 1]
1095 [1, 1, 1, 1, 1]
1032 [1, 1, 1, 1, 1]
1037 [1, 1, 1, 1]
774 [1]
2 [1, 1, 1, 1, 1]
319 [1, 1, 1, 1, 1]
1084 [1, 1, 1]
548 [1, 1, 1, 1, 1]
487 [1, 1, 1, 1, 1]
1144 [1]
161 [1, 1, 1]
131 [1, 1, 1]
1108 [1]
0 [1]
22 [1, 1, 1, 1, 1]
810 [1, 1, 1, 1, 1]
812 [1, 1, 1, 1, 1]
1 [1, 1, 1, 1]
709 [1, 1, 1, 1]
824 [1, 1, 1, 1, 1]
995 [1, 1, 1, 1, 1]
726 [1

In [56]:
import random

def select_reviews(item_reviews):
    random.seed(42)
    total_reviews = 0
    rating_wise_reviews = {'1.0': 0 , '2.0': 0, '3.0': 0, '4.0': 0, '5.0': 0}
    for rating, reviews in item_reviews.items():
        total_reviews += len(reviews)
        rating_wise_reviews[rating] += len(reviews)
    # print("total_reviews:", total_reviews)
    # print("rating_wise_reviews:", rating_wise_reviews)

    ### Take 10 reviews accoridng to distribution if num of total reviews > 10 else take all reviews
    if total_reviews <= 10:
        reviews_for_description = []
        for rating, reviews in item_reviews.items():
            reviews_for_description.extend(reviews)
        print(reviews_for_description)
        print([len(review.split(' ')) for review in reviews_for_description])
        reviews_for_description = [' '.join(review.strip().split(' ')[:100]) for review in reviews_for_description] ### For longer reviews keeping the first 50 words
        print(reviews_for_description)
        print([len(review.split(' ')) for review in reviews_for_description])
        return reviews_for_description
    else:
        rating_distribution = {k: v/total_reviews for k, v in rating_wise_reviews.items()}
        rating_count = {k: round(v*10) for k, v in rating_distribution.items()}
        rating_wise_reviews = dict(sorted(rating_wise_reviews.items(), key = lambda item: item[1], reverse = True))
        # print("Sorted rating_wise_reviews:", rating_wise_reviews)
        ### Starting from the highest present rating assign 1 each
        current_num_reviews = sum(list(rating_count.values()))
        while current_num_reviews < 10:
            for rating, num_reviews in rating_wise_reviews.items():
                if current_num_reviews < 10:
                    if num_reviews > rating_count[rating]:
                        rating_count[rating] += 1
                        current_num_reviews += 1
        # print("rating_distribution:", rating_distribution)
        # print("rating_count:", rating_count)
        reviews_for_description = []
        '''
        Randomly sample the reviews by weights (length of reviews)
        '''
        for rating, reviews in item_reviews.items():
            # weights = [len(review) for review in reviews]
            sampled_reviews = random.choices(reviews, k=rating_count[rating])
            print(sampled_reviews)
            reviews_for_description.extend(sampled_reviews)
        print(len(review.split(' ')) for review in reviews_for_description)
        reviews_for_description = [' '.join(review.strip().split(' ')[:50]) for review in reviews_for_description] ### For longer reviews keeping the first 50 words
        print(len(review.split(' ')) for review in reviews_for_description)
        return reviews_for_description

In [57]:
for item in item_information:
    # print(item)
    # print(item_information[item])
    item_information[item]['reviews_for_description'] = select_reviews(item_information[item]['reviews'])
    # select_reviews(item_information[item]['reviews'])
    # break

[]
['Does not fit perfectly with the shaver unit - leaves a gap at base.  Still able to produce a good shave.']
['Perfect fit.', 'Quality brand', 'The Braun razor gives a good close shave.  I have only had to change the heads about once a year.', 'The product arrived on time and in perfect condition. I am very satisfied.\nArt', "Excellent - I'm so glad I replaced my old broken parts with new ones from this vendor. Product arrived as promised, brand new in the package, at a great price.", 'OME Braun shaver head is the only way to go to keep your razor working like new. Recommended.', 'was a gift', 'Works great!']
['none']
[]
<generator object select_reviews.<locals>.<genexpr> at 0x7f8bca4b3dd0>
<generator object select_reviews.<locals>.<genexpr> at 0x7f8bca4b3dd0>
['Work as expected.', "Works like it's supposed to. Easy to change and keeps your razor clean.", 'Good', 'Exactly what was needed', 'Excellent', 'I have been using the same razor now for 2 years.  I use it for my face and to s

In [58]:
for item, information in item_information.items():
    print(item, [len(review.split(' ')) for review in information['reviews_for_description']])

882 [22, 2, 2, 21, 13, 30, 18, 3, 2, 1]
17 [3, 13, 1, 4, 1, 45, 6, 43, 5, 7]
257 [50, 4, 3, 3, 15, 50, 2, 4, 6, 6, 1]
555 [54, 90, 100, 7, 73, 100, 100, 35]
1016 [16, 17, 3, 7, 50, 20, 20, 12, 5, 2]
239 []
119 [7, 16, 5, 2, 15, 8, 26, 1, 3, 2, 48]
441 [28, 39, 34, 50, 23, 20, 18, 50, 18, 50]
241 [50, 50, 17, 50, 31, 50, 50, 50, 50, 50]
672 []
519 [25, 50, 42, 26, 29, 29, 47, 32, 28, 31]
18 [4, 46, 27, 11, 2, 50, 27, 46, 11, 50]
240 []
994 [50, 50, 19, 19, 40, 29, 24, 50, 50, 50]
1012 [29, 8, 26, 20, 50, 25, 48, 6, 30, 2]
725 [50, 15, 20, 20, 50, 19, 2, 50, 50, 34]
643 [4, 16, 49, 11, 45, 25, 33, 44, 14, 50]
1031 [50, 6, 2, 10, 50, 28, 44, 22, 40, 10]
23 [50, 50, 3, 2, 21, 25, 23, 11, 50, 30]
742 [17, 2, 9, 50, 50, 37, 29, 5, 11, 13, 50]
572 [100, 42]
547 [50, 50, 50, 50, 50, 50, 43, 46, 50, 50]
983 [50, 50, 50, 20, 39, 50, 50, 50, 50, 50]
351 [9, 12, 2, 42, 33, 8, 35, 14, 49, 50]
518 [26, 50, 28, 45, 50, 26, 50, 18, 41, 24]
254 [11, 22, 10, 16, 41, 3, 28, 10, 7, 8]
436 [50, 50, 8, 50, 

In [59]:
item_information['0']['reviews_for_description']

['This is the second time I have purchased this scent and still love it. I love the fresh line because it is one of the few products I can wear on my skin without an allergic reaction. The scent never fails to generate compliments. KMRN']

In [60]:
with open('./data/item_information.json', 'w+') as f:
    json.dump(item_information, f)

### Create item description using chatgpt api - gpt3.5-instruct

In [59]:
openai.api_key = os.environ.get('OPENAI_API_KEY')
# print(openai.api_key)

In [12]:
text = """You are given the title of an fashion product and the list of reviews about the product -  
Title: Allegra K Lady Long Sleeve Letter Pattern Pullover Knit Shirt Black S
List of Reviews: 'Cute top. I am 5\'9" and 175 lbs with DDD bra size. This top is very flattering, covers my belly which I love. I usually order an XL, but went with a large, and am pleased with that. Will order another.',
 'Everybody LOVES this top, fits perfectly, so flattering and unique. Very happy with this purchase!!',
 'Love this shirt! The combination of stretchy and flowy is so hard to find! I\'m adapting to my transitioning "mom bod" after having my second child. Needless to say, I didn\'t bounce back. I feel so great wearing this with some leggings! It\'s flattering and doesn\'t overly emphasize the fact that I\'m carrying some extra weight. I love that it has a conservative neckline, too. The medium fits me great, if not a little big. I\'m on the large end of being able to fit in medium shirts, if that helps. I am a true size 8, and am 5\'8" tall and it fits me like the model picture in length.',
 'Looks even better in person.  Love it!',
 'Got a lot of compliments wearing this. I am a size 14 and this fit perfectly. I love all of my tops from Allegra K',
 "I LOVE THIS DRESS!! I wear it as a dress with sheer pantyhoses and pumps, and it looks amazing! I wish I had a picture of me in it so y'all could see how cute and sexy it looks while it being modest on the top. Also, it's super soft!",
 "Great product. Very good quality for the price- maybe a little thin but just as good as anything you'd get at Walmart or something. The size seems to run a bit large. I'm 5'2, 120 pounds and the small fits like a short dress, but I think its supposed to be like that. Also I like the little smiley faces and the tapering (is that what its called?) on the sides",
 "If you ever happen to shop at a retailer like Ross Dress for Less, this is the exact quality you would get from there.  Specifically from the junior section where they have a bunch of no name brands that look cute but wasn't designed to last long.  Personally it looks cute but I know this blouse was not meant to worn for years on end and in some regards I am okay with that.  This is a thin, lightweight sweater.  So if you are self-conscious about your bra showing, just wear a nude color or wear a camisole underneath.  Personally I am shameless so I don't care unless I am cold, then I would throw this over a shirt.",
 "It does run big.  Buy a size down.  I usually wear a large.  And I purchased the medium.  It was a great fit.  It's even better looking in person.",
 "Very casual! Nothing special, not my favorite, but could be worse, meaning it doesn't look good quality or stylish, very simple. Maybe for a colder day to go out to nature!:)"
As an expert fashion product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the reviews. Give a 25 word description of the item from its title and summarized features. You should only give the 25 word product description."""
response = openai.completions.create(
  model="gpt-3.5-turbo-instruct",
  prompt = text,
  temperature=0,
  max_tokens=50,
  # top_p=0.3,
  # frequency_penalty=0.5,
  # presence_penalty=0.5
)
print(response.choices[0].text)



Description: This Allegra K knit shirt is a flattering and unique top with a combination of stretchy and flowy fabric. It has a conservative neckline and is perfect for transitioning "mom bods."
