In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from collections import defaultdict
from datetime import datetime
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

In [2]:
rating_path = './Fashion/AMAZON_FASHION.csv'

In [3]:
custom_header = ['item', 'user', 'rating', 'timestamp']
ratings_df = pd.read_csv(rating_path, header = None, names = custom_header)
print(ratings_df.shape)
ratings_df.head()

(883636, 4)


Unnamed: 0,item,user,rating,timestamp
0,7106116521,A1D4G1SNUZWQOT,5.0,1413763200
1,7106116521,A3DDWDH9PX2YX2,2.0,1411862400
2,7106116521,A2MWC41EW7XL15,4.0,1408924800
3,7106116521,A2UH2QQ275NV45,2.0,1408838400
4,7106116521,A89F3LQADZBS5,3.0,1406419200


In [4]:
rating_df = ratings_df.sort_values(by=['user', 'timestamp'])
rating_df = rating_df.reset_index(drop = True)
print(rating_df.shape)
rating_df.head()

(883636, 4)


Unnamed: 0,item,user,rating,timestamp
0,B00L8J2RF8,A0007604Q2582KFW7N4B,5.0,1426377600
1,B0121M1AJE,A0010606488RW7ZH6EP7,1.0,1507852800
2,B00IGAOE3U,A001152055E9KVRHH96L,1.0,1434585600
3,B01GK6XQFI,A00181966XZUX7KEGYLH,3.0,1468022400
4,B00RLSCLJM,A00205926S885DTOYMX6,5.0,1441497600


In [5]:
ratings_7core = rating_df.groupby('user').filter(lambda x: len(x) >= 7)
print(ratings_7core.shape)
ratings_7core.head()

(11136, 4)


Unnamed: 0,item,user,rating,timestamp
3238,B00HFKN2IU,A10G10SU7SAUG1,4.0,1452816000
3239,B009PMILQE,A10G10SU7SAUG1,4.0,1453766400
3240,B00D83TNRM,A10G10SU7SAUG1,3.0,1453766400
3241,B00E1L9QOU,A10G10SU7SAUG1,4.0,1453766400
3242,B00AW83HM8,A10G10SU7SAUG1,4.0,1453766400


In [6]:
print("Unique items:", ratings_7core['item'].nunique())
print("Unique users:", ratings_7core['user'].nunique())

Unique items: 6089
Unique users: 1273


In [7]:
ratings_7core.rating.value_counts(normalize = True)

rating
5.0    0.577137
4.0    0.193786
3.0    0.116110
1.0    0.058818
2.0    0.054149
Name: proportion, dtype: float64

In [8]:
reviews_path = './Fashion/AMAZON_FASHION.json'
reviews = []
with open(reviews_path, 'r') as f:
    for line in f:
        reviews.append(json.loads(line))
print(len(reviews))

883636


In [9]:
reviews[0]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '10 20, 2014',
 'reviewerID': 'A1D4G1SNUZWQOT',
 'asin': '7106116521',
 'reviewerName': 'Tracy',
 'reviewText': 'Exactly what I needed.',
 'summary': 'perfect replacements!!',
 'unixReviewTime': 1413763200}

### Creating user and item mapping

In [10]:
ratings_7core = ratings_7core.reset_index(drop = True)
ratings_7core.head()

Unnamed: 0,item,user,rating,timestamp
0,B00HFKN2IU,A10G10SU7SAUG1,4.0,1452816000
1,B009PMILQE,A10G10SU7SAUG1,4.0,1453766400
2,B00D83TNRM,A10G10SU7SAUG1,3.0,1453766400
3,B00E1L9QOU,A10G10SU7SAUG1,4.0,1453766400
4,B00AW83HM8,A10G10SU7SAUG1,4.0,1453766400


In [11]:
item_mapping = {item: idx for idx, item in enumerate(ratings_7core['item'].unique())}
user_mapping = {user: idx for idx, user in enumerate(ratings_7core['user'].unique())}

In [12]:
print(len(item_mapping), len(user_mapping))

6089 1273


In [13]:
reverse_item_mapping = {v:k for k,v in item_mapping.items()}
reverse_user_mapping = {v:k for k,v in user_mapping.items()}

In [14]:
print(len(reverse_item_mapping), len(reverse_user_mapping))

6089 1273


In [15]:
id = 2
print(id, reverse_user_mapping[id], user_mapping[reverse_user_mapping[id]])

2 A10RXRZE0TAKPU 2


In [16]:
id = 374
print(id, reverse_item_mapping[id], item_mapping[reverse_item_mapping[id]])

374 B01201P2GS 374


In [17]:
# with open('./Fashion/data/item_idx_mapping.json', 'w+') as f:
#     json.dump(item_mapping, f)
# with open('./Fashion/data/idx_item_mapping.json', 'w+') as f:
#     json.dump(reverse_item_mapping, f)
# with open('./Fashion/data/user_idx_mapping.json', 'w+') as f:
#     json.dump(user_mapping, f)
# with open('./Fashion/data/idx_user_mapping.json', 'w+') as f:
#     json.dump(reverse_user_mapping, f)

### Creating train, test, valid dataset

In [18]:
ratings_7core['item'] = ratings_7core['item'].map(item_mapping)
ratings_7core['user'] = ratings_7core['user'].map(user_mapping)
ratings_7core.head()

Unnamed: 0,item,user,rating,timestamp
0,0,0,4.0,1452816000
1,1,0,4.0,1453766400
2,2,0,3.0,1453766400
3,3,0,4.0,1453766400
4,4,0,4.0,1453766400


In [19]:
# ratings_7core.to_csv('./Fashion/data/ratings.csv', index = False)

In [20]:
user_item_interactions = dict()
for user in list(ratings_7core['user'].unique()):
    user_item_interactions[user] = ratings_7core[ratings_7core['user']==user]['item'].tolist()
print(len(user_item_interactions))

1273


In [21]:
user_item_interactions[0]

[0, 1, 2, 3, 4, 5, 6, 7]

In [22]:
user_train = dict()
user_valid = dict()
user_test = dict()
user_profile = dict()
for user, item_list in user_item_interactions.items():
    user_profile[int(user)] = item_list[:-7]
    user_train[int(user)] = item_list[-7:-2]
    user_valid[int(user)] = item_list[-6:-1]
    user_test[int(user)] = item_list[-5:]
    # break

In [23]:
id = 1200
print(user_profile[id])
print(user_train[id])
print(user_valid[id])
print(user_test[id])

[974, 4266]
[5780, 2332, 5781, 5782, 5783]
[2332, 5781, 5782, 5783, 5784]
[5781, 5782, 5783, 5784, 5785]


In [24]:
# with open('./Fashion/data/user_profile.json', 'w+') as f:
#     json.dump(user_profile, f)
# with open('./Fashion/data/user_train.json', 'w+') as f:
#     json.dump(user_train, f)
# with open('./Fashion/data/user_valid.json', 'w+') as f:
#     json.dump(user_valid, f)
# with open('./Fashion/data/user_test.json', 'w+') as f:
#     json.dump(user_test, f)

### Creating item descriptions

Idea is to create the item description from meta file and reviews file
- Use title, brand and price when available from the meta file
- Use user reviews from the reviews file
    - Identify how to keep the reviews in limit and need to use llm for summarizing

In [25]:
meta_fashion_path = './Fashion/meta_AMAZON_FASHION.json'
meta_fashion = []
with open(meta_fashion_path, 'r') as f:
    for line in f:
        meta_fashion.append(json.loads(line))
print(len(meta_fashion))

186637


In [26]:
all_keys = []
for item in meta_fashion:
    item_keys = list(item.keys())
    all_keys.extend(item_keys)

In [27]:
all_keys = list(set(all_keys))
print(all_keys)

['also_buy', 'feature', 'rank', 'title', 'imageURL', 'also_view', 'details', 'description', 'imageURLHighRes', 'fit', 'date', 'asin', 'brand', 'similar_item', 'price', 'tech1']


In [28]:
count_desc = 0
for item in meta_fashion:
    if 'description' not in item:
        continue
    if item['description']=='' or item['description']==' ':
        continue
    count_desc += 1
print(count_desc/len(meta_fashion))

0.08502601306279034


In [38]:
required_keys = ['brand', 'price']
### description is present in only 8% of the items and many times they are just generic text

In [39]:
item_information = dict()

cnt = 0
for item in meta_fashion:
    if item['asin'] in item_mapping:
        item_id = item_mapping[item['asin']]
        # print("item_id:", item_id)
        item_information[item_id] = dict()
        cnt += 1
        ### meta information
        if 'title' in item:
            item_information[item_id]['title'] = item['title']
        for key in required_keys:
            if key in item:
                if key == 'brand':
                    item_information[item_id]['brand'] = item[key]
                elif key == 'price':
                    item_information[item_id]['price'] = item[key]
        ### review information
    # if cnt > 10:
    #     break
# print(item_information)

In [40]:
print(len(item_information))

6089


In [41]:
reviews[0]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '10 20, 2014',
 'reviewerID': 'A1D4G1SNUZWQOT',
 'asin': '7106116521',
 'reviewerName': 'Tracy',
 'reviewText': 'Exactly what I needed.',
 'summary': 'perfect replacements!!',
 'unixReviewTime': 1413763200}

In [48]:
from collections import defaultdict
item_reviews = dict()
for item in meta_fashion:
    if item['asin'] in item_mapping:
        item_id = item_mapping[item['asin']]
        item_reviews[item_id] = defaultdict(list)
        for review in reviews:
            if review['asin'] == item['asin']:
                if 'verified' in review:
                    if review['verified'] == True:
                        if 'reviewText' in review:
                            if review['reviewText'] != '' or review['reviewText'] != ' ':
                                if 'overall' in review:
                                    if review['overall'] != '' or review['overall'] != ' ':
                                        item_reviews[item_id][review['overall']].append(review['reviewText'])
        # break

In [49]:
len(item_reviews)

6089

In [50]:
item_reviews[4426]

defaultdict(list,
            {5.0: ['Hello good evening, excellent product quality, comfortable, nice, good wreck 100% recommend them very beautiful, I gusta.soy of Venezuela v'],
             3.0: ["This is a beautiful watch, bigger than I thought it would be, which made me happy. The back light works very well, the shine/glitter it has is beautiful, it doesn't look like too much. It is easy to put on and take off, perfect for workouts. The only bad thing I have to say about this product is that the lifespan of the rubber pieces at the end of the band is Very short and you have no way of getting a replacement, because they are not sold separately. I took mine off and I wear the watch with no rubbers at the ends, its not that ugly you cant even tell actually. All the same Im happy with my purchase, I would recommend it."]})

In [51]:
len(item_information)

6089

In [52]:
for item in item_information:
    item_information[item]['reviews'] = item_reviews[item]

In [53]:
with open('./Fashion/data/item_information.json', 'w+') as f:
    json.dump(item_information, f)

### Loading the item_information

In [54]:
item_information_path = './Fashion/data/item_information.json'
with open(item_information_path, 'r') as f:
    item_information = json.load(f)

In [55]:
len(item_information)

6089

In [56]:
item_information['0']

{'title': 'Allegra K Lady Long Sleeve Letter Pattern Pullover Knit Shirt Black S',
 'reviews': {'5.0': ["I love this shirt. I bought an M, but since I'm a shorty it is a bit long on me. Otherwise , the material is fine and is perfect for the spring/summer.\nHowever, I've a question to all the other reviewers, have any of you been able to read what it says on the entire shirt?\nThanks.",
   'Everybody LOVES this top, fits perfectly, so flattering and unique. Very happy with this purchase!!',
   'Very cute and comfy :-)',
   'I got it for my wife and she really liked it',
   'Love Allegra K Blouses.  I own several!!!',
   "The top was very good fit perfect on me I'm 6 feet tall I weigh 220 pounds I order more",
   'Excellent top, fits like it should.',
   'GOOD PRODUCT.  BEAUTIFUL DESIGNS.',
   'Fit as expected. Love it!',
   'I love this shirt. It fits perfectly and you can wear it with anything. I have worn it with slacks and tights so far.  I am around size 8-10. So i got the medium a

In [127]:
import random

def select_reviews(item_reviews):
    random.seed(42)
    total_reviews = 0
    rating_wise_reviews = {'1.0': 0 , '2.0': 0, '3.0': 0, '4.0': 0, '5.0': 0}
    for rating, reviews in item_reviews.items():
        total_reviews += len(reviews)
        rating_wise_reviews[rating] += len(reviews)
    # print("total_reviews:", total_reviews)
    # print("rating_wise_reviews:", rating_wise_reviews)

    ### Take 10 reviews accoridng to distribution if num of total reviews > 10 else take all reviews
    if total_reviews <= 10:
        reviews_for_description = []
        for rating, reviews in item_reviews.items():
            reviews_for_description.extend(reviews)
        # print(reviews_for_description)
        return reviews_for_description
    else:
        rating_distribution = {k: v/total_reviews for k, v in rating_wise_reviews.items()}
        rating_count = {k: round(v*10) for k, v in rating_distribution.items()}
        rating_wise_reviews = dict(sorted(rating_wise_reviews.items(), key = lambda item: item[1], reverse = True))
        # print("Sorted rating_wise_reviews:", rating_wise_reviews)
        ### Starting from the highest present rating assign 1 each
        current_num_reviews = sum(list(rating_count.values()))
        while current_num_reviews < 10:
            for rating, num_reviews in rating_wise_reviews.items():
                if current_num_reviews < 10:
                    if num_reviews > rating_count[rating]:
                        rating_count[rating] += 1
                        current_num_reviews += 1
        # print("rating_distribution:", rating_distribution)
        # print("rating_count:", rating_count)
        reviews_for_description = []
        '''
        Randomly sample the reviews by weights (length of reviews)
        '''
        for rating, reviews in item_reviews.items():
            weights = [len(review) for review in reviews]
            sampled_reviews = random.choices(reviews, weights=weights, k=rating_count[rating])
            # print(sampled_reviews)
            reviews_for_description.extend(sampled_reviews)
        return reviews_for_description

In [128]:
for item in item_information:
    # print(item)
    # print(item_information[item])
    item_information[item]['reviews_for_description'] = select_reviews(item_information[item]['reviews'])
    # break

In [133]:
item_information['0']['reviews_for_description']

['Cute top. I am 5\'9" and 175 lbs with DDD bra size. This top is very flattering, covers my belly which I love. I usually order an XL, but went with a large, and am pleased with that. Will order another.',
 'Everybody LOVES this top, fits perfectly, so flattering and unique. Very happy with this purchase!!',
 'Love this shirt! The combination of stretchy and flowy is so hard to find! I\'m adapting to my transitioning "mom bod" after having my second child. Needless to say, I didn\'t bounce back. I feel so great wearing this with some leggings! It\'s flattering and doesn\'t overly emphasize the fact that I\'m carrying some extra weight. I love that it has a conservative neckline, too. The medium fits me great, if not a little big. I\'m on the large end of being able to fit in medium shirts, if that helps. I am a true size 8, and am 5\'8" tall and it fits me like the model picture in length.',
 'Looks even better in person.  Love it!',
 'Got a lot of compliments wearing this. I am a siz

In [134]:
with open('./Fashion/data/item_information.json', 'w+') as f:
    json.dump(item_information, f)

### Create item description using chatgpt api - gpt3.5-instruct

In [4]:
openai.api_key = os.environ.get('OPENAI_API_KEY')
# print(openai.api_key)

In [12]:
text = """You are given the title of an fashion product and the list of reviews about the product -  
Title: Allegra K Lady Long Sleeve Letter Pattern Pullover Knit Shirt Black S
List of Reviews: 'Cute top. I am 5\'9" and 175 lbs with DDD bra size. This top is very flattering, covers my belly which I love. I usually order an XL, but went with a large, and am pleased with that. Will order another.',
 'Everybody LOVES this top, fits perfectly, so flattering and unique. Very happy with this purchase!!',
 'Love this shirt! The combination of stretchy and flowy is so hard to find! I\'m adapting to my transitioning "mom bod" after having my second child. Needless to say, I didn\'t bounce back. I feel so great wearing this with some leggings! It\'s flattering and doesn\'t overly emphasize the fact that I\'m carrying some extra weight. I love that it has a conservative neckline, too. The medium fits me great, if not a little big. I\'m on the large end of being able to fit in medium shirts, if that helps. I am a true size 8, and am 5\'8" tall and it fits me like the model picture in length.',
 'Looks even better in person.  Love it!',
 'Got a lot of compliments wearing this. I am a size 14 and this fit perfectly. I love all of my tops from Allegra K',
 "I LOVE THIS DRESS!! I wear it as a dress with sheer pantyhoses and pumps, and it looks amazing! I wish I had a picture of me in it so y'all could see how cute and sexy it looks while it being modest on the top. Also, it's super soft!",
 "Great product. Very good quality for the price- maybe a little thin but just as good as anything you'd get at Walmart or something. The size seems to run a bit large. I'm 5'2, 120 pounds and the small fits like a short dress, but I think its supposed to be like that. Also I like the little smiley faces and the tapering (is that what its called?) on the sides",
 "If you ever happen to shop at a retailer like Ross Dress for Less, this is the exact quality you would get from there.  Specifically from the junior section where they have a bunch of no name brands that look cute but wasn't designed to last long.  Personally it looks cute but I know this blouse was not meant to worn for years on end and in some regards I am okay with that.  This is a thin, lightweight sweater.  So if you are self-conscious about your bra showing, just wear a nude color or wear a camisole underneath.  Personally I am shameless so I don't care unless I am cold, then I would throw this over a shirt.",
 "It does run big.  Buy a size down.  I usually wear a large.  And I purchased the medium.  It was a great fit.  It's even better looking in person.",
 "Very casual! Nothing special, not my favorite, but could be worse, meaning it doesn't look good quality or stylish, very simple. Maybe for a colder day to go out to nature!:)"
As an expert fashion product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the reviews. Give a 25 word description of the item from its title and summarized features. You should only give the 25 word product description."""
response = openai.completions.create(
  model="gpt-3.5-turbo-instruct",
  prompt = text,
  temperature=0,
  max_tokens=50,
  # top_p=0.3,
  # frequency_penalty=0.5,
  # presence_penalty=0.5
)
print(response.choices[0].text)



Description: This Allegra K knit shirt is a flattering and unique top with a combination of stretchy and flowy fabric. It has a conservative neckline and is perfect for transitioning "mom bods."
