In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from collections import defaultdict
from datetime import datetime
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

In [2]:
item_information_path = './data/item_information.json'
with open(item_information_path, 'r') as f:
    item_information = json.load(f)
print(len(item_information))

1216


In [3]:
print(f"title: {item_information['87']['title']}")
print(f"brand: {item_information['87']['brand']}")
print(f"price: {item_information['87']['price']}")
print(f"reviews: {item_information['87']['reviews_for_description']}")

title: VIIcode T2O2.5 Oxygen Eye Cream For Dark Circles and Wrinkles - Reduces Puffiness, Crow's Feet, Fine Lines and Bags
brand: VIIcode
price: $199.00
reviews: ["It's the first time to be opened and that's what my mom said. Only half inside. I don't know it was used or that's what it is supposed to be", "This product doesn't work at all for me. Too expensive for nothing."]


In [4]:
for item, information in item_information.items():
    print(item, [len(review.split(' ')) for review in information['reviews_for_description']])

882 [22, 2, 2, 21, 13, 30, 18, 3, 2, 1]
17 [3, 13, 1, 4, 1, 45, 6, 43, 5, 7]
257 [50, 4, 3, 3, 15, 50, 2, 4, 6, 6, 1]
555 [54, 90, 100, 7, 73, 100, 100, 35]
1016 [16, 17, 3, 7, 50, 20, 20, 12, 5, 2]
239 []
119 [7, 16, 5, 2, 15, 8, 26, 1, 3, 2, 48]
441 [28, 39, 34, 50, 23, 20, 18, 50, 18, 50]
241 [50, 50, 17, 50, 31, 50, 50, 50, 50, 50]
672 []
519 [25, 50, 42, 26, 29, 29, 47, 32, 28, 31]
18 [4, 46, 27, 11, 2, 50, 27, 46, 11, 50]
240 []
994 [50, 50, 19, 19, 40, 29, 24, 50, 50, 50]
1012 [29, 8, 26, 20, 50, 25, 48, 6, 30, 2]
725 [50, 15, 20, 20, 50, 19, 2, 50, 50, 34]
643 [4, 16, 49, 11, 45, 25, 33, 44, 14, 50]
1031 [50, 6, 2, 10, 50, 28, 44, 22, 40, 10]
23 [50, 50, 3, 2, 21, 25, 23, 11, 50, 30]
742 [17, 2, 9, 50, 50, 37, 29, 5, 11, 13, 50]
572 [100, 42]
547 [50, 50, 50, 50, 50, 50, 43, 46, 50, 50]
983 [50, 50, 50, 20, 39, 50, 50, 50, 50, 50]
351 [9, 12, 2, 42, 33, 8, 35, 14, 49, 50]
518 [26, 50, 28, 45, 50, 26, 50, 18, 41, 24]
254 [11, 22, 10, 16, 41, 3, 28, 10, 7, 8]
436 [50, 50, 8, 50, 

In [14]:
prompt = f"""As an expert beauty product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the given title and reviews. You are given the title of a fashion product and list of reviews about the product -
{item_information['87']['title']}
Reviews -
{item_information['87']['reviews_for_description']}
Give a 25 word concise product description mentioning strong and weak features of the product."""
print(prompt)

As an expert beauty product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the given title and reviews. You are given the title of a fashion product and list of reviews about the product -
VIIcode T2O2.5 Oxygen Eye Cream For Dark Circles and Wrinkles - Reduces Puffiness, Crow's Feet, Fine Lines and Bags
Reviews -
["It's the first time to be opened and that's what my mom said. Only half inside. I don't know it was used or that's what it is supposed to be", "This product doesn't work at all for me. Too expensive for nothing."]
Give a 25 word concise product description mentioning strong and weak features of the product.


In [15]:
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [16]:
response = openai.completions.create(
    # model = "gpt-3.5-turbo-0125",
    model="gpt-3.5-turbo-instruct",
    prompt = prompt,
    temperature=0.3,
    max_tokens=50,
    # top_p=0.3,
    # frequency_penalty=0.5,
    # presence_penalty=0.5
)
print(response.choices[0].text)



The VIIcode T2O2.5 Oxygen Eye Cream effectively reduces dark circles, wrinkles, and puffiness, but some customers have received partially used products and found it to be expensive.


### Getting Product Descriptions

In [17]:
def generate_prompt(title, reviews):
    prompt = f"""As an expert beauty product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the given reviews.
    You are given a 
    You are given a list of reviews about the product -
    {reviews}
    Give a 25 word concise product description mentioning strong and weak features of the product."""
    return prompt

In [18]:
items_no_title = []
items_no_reviews = []
content_to_summarize = dict()
for item_id, item_values in item_information.items():
    if 'title' in item_values:
        if len(item_values['reviews_for_description']) > 0:
            content_to_summarize[item_id] = generate_prompt(item_values['title'], item_values['reviews_for_description'])
        else:
            items_no_reviews.append(item_id)
            continue
    else:
        items_no_title.append(item_id)
    # break

In [19]:
print(len(content_to_summarize), content_to_summarize['123'])

1169 As an expert beauty product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the given reviews.
    You are given a 
    You are given a list of reviews about the product -
    ["You need medium to thick hair to work the best.  Hardly use them and I've had these for quite awhile..", "These are just a gimmick and don't work. Don't waste your money. You are better off using the tiny hair clips and then teasing over the top of them.", "My hair has been kind of thin all my life and if you are like me there is no way you'll be able to hide these in your hair. They are not worth it.", "There's a reason why these are sold so cheaply. Save your time and money. These are a total waste.\nSpray and Tease."]
    Give a 25 word concise product description mentioning strong and weak features of the product.


In [12]:
print(items_no_title)

[]


In [13]:
print(len(items_no_reviews))

24


In [14]:
content_list = list(content_to_summarize.values())
print(len(content_list))

758


In [15]:
print(content_list[0])
print(content_to_summarize[list(content_to_summarize.keys())[0]])

As an expert beauty product recommender and advertiser, extract the strong (positive) and weak (negative) features or characteristics of the product from the given reviews. You are given the list of reviews about the product -
            ['Does not fit perfectly with the shaver unit - leaves a gap at base.  Still able to produce a good shave.', 'Perfect fit.', 'Quality brand', 'The Braun razor gives a good close shave.  I have only had to change the heads about once a year.', 'The product arrived on time and in perfect condition. I am very satisfied.\nArt', "Excellent - I'm so glad I replaced my old broken parts with new ones from this vendor. Product arrived as promised, brand new in the package, at a great price.", 'OME Braun shaver head is the only way to go to keep your razor working like new. Recommended.', 'was a gift', 'Works great!', 'none']
            Give a 25 word concise product description mentioning strong and weak features of the product.
As an expert beauty product re

### FIlling in the item_descriptions

In [5]:
item_description_path = './data/item_description.json'
with open(item_description_path, 'r') as f:
    item_description = json.load(f)
print(len(item_description))

1168


In [6]:
items_with_no_description = list(set(list(item_information.keys())) - set(list(item_description.keys())))
print(len(items_with_no_description))

48


In [7]:
processed_item_information = defaultdict(dict)
for item, information in item_information.items():
    # print(item, information)
    if 'title' in information:
        processed_item_information[item]['title'] = information['title']
    if 'brand' in information:
        processed_item_information[item]['brand'] = information['brand']
    if 'price' in information:
        processed_item_information[item]['price'] = information['price']
    if item in item_description:
        processed_item_information[item]['description'] = item_description[item]
    else:
        processed_item_information[item]['description'] = ''
    # break

In [8]:
processed_item_information['362']

{'title': 'Crest + Oral-B Professional Gingivitis Kit, 1 Count',
 'brand': 'Crest',
 'price': '',
 'description': '\nAns - "This product is a 5-in-1 toothbrush with customizable settings, but its complexity and clock feature are unnecessary."'}

In [9]:
print(len(processed_item_information))

1216


In [10]:
with open('./processed_data/processed_item_information.json', 'w+') as f:
    json.dump(processed_item_information, f)

In [11]:
### Remove items which are not present in processed_item_information from ratings dict

In [14]:
all_items = list(processed_item_information.keys())
all_items = [int(item) for item in all_items]
print(len(all_items))

1216


In [15]:
ratings_df = pd.read_csv("./data/ratings.csv")
print(ratings_df.shape)
ratings_df_filtered = ratings_df[ratings_df['item'].isin(all_items)]
print(ratings_df_filtered.shape)

(4239, 4)
(4231, 4)


In [16]:
ratings_df_filtered.to_csv('./data/ratings.csv', index = False)
ratings_df_filtered.to_csv('./processed_data/ratings.csv', index = False)
