In [9]:
import numpy as np
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import os
import openai
from dotenv import load_dotenv
# from sqlalchemy import create_engine

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt

GPT_MODEL = "gpt-3.5-turbo-0613"

OPENAI_API_KEY is ready


# Obiectiv Fisier
- identific topics, apoi clusterizez si denumesc din nou, daca e nevoie
- identific atribute asociate cu topics, apoi le clusterizez si denumesc din nou
- fiecare topic si atribut trebuie sa aibe asociate rating-ul, ID-ul review-ului si asin-ul, sentimentele asociate.
- plec la drum cu un fisier de reivews redus la minimul necesar. Acelasi fisier de reviews data va fi extins (exploded) astfel incat atributele sa fie specifice unei baze de date:

"Attribute" (exemplu: when)
si value

In [10]:
interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews = pd.read_csv(interim_reviews_path)

In [11]:
reviews

Unnamed: 0,Date,Author,Verified,Helpful,Title,review,Rating,Images,Videos,URL,...,Quality,Durability,Ease of Use,Setup and Instructions,Noise and Smell,Colors,Size and Fit,Danger Appraisal,Design and Appearance,Parts and Components
0,2023-04-07,Amazon Customer,yes,-,On time,It was very good easy to put up Iam very happy,5,-,-,https://www.amazon.com/gp/customer-reviews/R3L...,...,,,,,,,,,,
1,2023-03-01,Dan,yes,-,Beautiful,Absolutely inlove with these Very durable Stro...,5,-,-,https://www.amazon.com/gp/customer-reviews/R3L...,...,great quality,very durable,,,,,fits well,,,
2,2023-04-06,Karen Robertson,yes,-,Love it,My daughter and her husband put these together...,5,-,-,https://www.amazon.com/gp/customer-reviews/RKV...,...,Good quality,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Good size,Not mentioned,Not mentioned,Not mentioned
3,2023-02-22,Andrew,-,-,A great purchase when on sale,First I want to say that I got this for 119 I ...,3,-,-,https://www.amazon.com/gp/customer-reviews/R3I...,...,"Thick and heavy steel, but not painted",Expected to last a few years,Not specified,Not specified,Not specified,Not specified,24 inches is a good height for gardening,Not specified,Not specified,Not specified
4,2023-05-20,Jose Perez,-,-,Easy to setup and this thing is massive!,Took a few mins to setup so wasnt terrible Its...,5,-,-,https://www.amazon.com/gp/customer-reviews/R2W...,...,,,,Took a few mins to setup,,,,,,measurements were correct
5,2023-03-27,DeSanne Fluitt,yes,-,Great Raised Garden Bed,These beds are perfect for vegetable beds Save...,5,-,-,https://www.amazon.com/gp/customer-reviews/R1Y...,...,Great quality,Durable,Easy to use,,,,,Safe,,
6,2023-03-27,Md,yes,-,Best raised garden bed.,So far I am very pleased with my raised bed Th...,5,-,-,https://www.amazon.com/gp/customer-reviews/R1X...,...,Well built,Durable,Easy to put together,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified
7,2023-03-18,zlajoie,yes,-,Easy setup and looks great,The garden bed was very easy to setup and look...,5,-,-,https://www.amazon.com/gp/customer-reviews/RKJ...,...,,,Easy to use,Easy to setup,,,,,Looks awesome,
8,2023-05-16,Danese CameronDanese Cameron,yes,-,Perfect fit,I like the square design rather than the round...,5,https://m.media-amazon.com/images/I/81jHLvy7t6...,-,https://www.amazon.com/gp/customer-reviews/R35...,...,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,"Square design, fits well next to a garden shed...",Not specified
9,2023-06-02,penny van eck,yes,-,poor quality,My husband is a mechanic and very handy We str...,2,-,-,https://www.amazon.com/gp/customer-reviews/R2M...,...,Not mentioned,Not mentioned,Not mentioned,Difficult to set up,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned


In [12]:
reviews.drop(columns = [ 'Verified', 'Helpful', 'Title', 'review','Videos','Variation', 'Style', 'num_tokens', 'review_num_tokens','initial_response', 'eval_response'], inplace = True)

In [13]:
data_cols = ["Review Summary","Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used", "Price", "Quality", "Durability", "Ease of Use", "Setup and Instructions", "Noise and Smell", "Colors", "Size and Fit", "Danger Appraisal", "Design and Appearance", "Parts and Components"]
for col in data_cols:
    reviews[col] = reviews[col].fillna('')
    reviews[col].replace(['\n', 'not mentioned',np.nan, '',' ', 'NA', 'N/A', 'missing', 'NaN', 'unknown', 'Not mentioned','not specified','Not specified'], 'unknown', inplace = True)

In [14]:
columns_to_pivot = ["Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used", "Price", "Quality", "Durability", "Ease of Use", "Setup and Instructions", "Noise and Smell", "Colors", "Size and Fit", "Danger Appraisal", "Design and Appearance", "Parts and Components"]

# assume 'df' is your DataFrame
reviews_data_df = reviews.melt(id_vars=[col for col in reviews.columns if col not in columns_to_pivot], 
                    value_vars=columns_to_pivot, 
                    var_name='Attribute', 
                    value_name='Value')

In [15]:
reviews_data_df = reviews_data_df[reviews_data_df['Value'] != 'unknown']

# Clustering

In [16]:
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)
    
def get_text_from_embedding(embedding):
    return openai.Embedding.retrieve(embedding, model="text-embedding-ada-002")["data"][0]["text"]


In [17]:
df = reviews_data_df

In [18]:
# omit reviews that are too long to embed
df["n_tokens"] = df['Value'].apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

# Get embeddings
df["embedding"] = df['Value'].apply(lambda x: get_embedding(x, engine=embedding_model))
df["embedding"] = df["embedding"].apply(np.array)  # convert string to numpy array

In [19]:
max_n_clusters = 7
df["cluster"] = np.nan

types_list = list(reviews_data_df['Attribute'].unique())

for type in types_list:
    print(type)
    df_type = df[df['Attribute'] == type]
    n_clusters = min(max_n_clusters, len(df_type['Value'].unique()))
    if n_clusters > 2:
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
        matrix = np.vstack(df_type["embedding"].values)
        labels = clustering.fit_predict(matrix)
        df_type["cluster"] = labels
        df.loc[df['Attribute'] == type, "cluster"] = df_type["cluster"]
    else:
        df.loc[df['Attribute'] == type, "cluster"] = 0

df['cluster'] = df['cluster'].astype(int)

Buyer Motivation
Customer Expectations
How the product is used
Where the product is used
User Description
Season
When the product is used
Price
Quality
Durability
Ease of Use
Setup and Instructions
Size and Fit
Danger Appraisal
Design and Appearance
Parts and Components


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [20]:
cluster_df  = df[['Attribute', 'cluster','Value']].drop_duplicates()

In [21]:
cluster_df.drop_duplicates()
cluster_df

Unnamed: 0,Attribute,cluster,Value
2,Buyer Motivation,5,To get things growing
3,Buyer Motivation,6,To have a raised bed for gardening
5,Buyer Motivation,2,To save my back
6,Buyer Motivation,4,To grow crops with my granddaughter
8,Buyer Motivation,3,To have a convenient gardening space
9,Buyer Motivation,1,To have a handy tool
10,Buyer Motivation,0,To plant perennials
13,Customer Expectations,3,To have a good size and sturdy product
14,Customer Expectations,6,To have a durable bed that will last for a few...
16,Customer Expectations,5,To help tremendously with gardening


# Get label for the clusters

In [22]:
labeling_function = [
    {
        "name": "cluster_label",
        "description": "Provide a single label for the topic represented in the list of values.",
        "parameters": {
            "type": "object",
            "properties": {
                "cluster_label": {
                    "type": "string",
                    "description": "Provide a single label for the topic represented in the list of values. [7 words max]. Example: 'Low perceived quality versus competitors', 'Breaks easily and often', 'low sound quality','better than expected'',' "
                },
            },
            "required": ["cluster_label"]
        },
    }
]

In [23]:
import asyncio
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}

class ProgressLog:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def increment(self):
        self.done = self.done + 1

    def __repr__(self):
        return f"Done runs {self.done}/{self.total}."

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20), before_sleep=print, retry_error_callback=lambda _: None)
async def get_completion(content, session, semaphore, progress_log, functions=None, function_call=None):
    async with semaphore:
        json_data = {
            "model": GPT_MODEL,
            "messages": content,
            "temperature": 0
        }
        
        if functions is not None:
            json_data.update({"functions": functions})
        if function_call is not None:
            json_data.update({"function_call": function_call})

        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=json_data) as resp:
            response_json = await resp.json()
            progress_log.increment()
            print(progress_log)
            return response_json["choices"][0]['message']

async def get_completion_list(content_list, max_parallel_calls, timeout, functions=None, function_call=None):
    semaphore = asyncio.Semaphore(value=max_parallel_calls)
    progress_log = ProgressLog(len(content_list))

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(timeout)) as session:
        return await asyncio.gather(*[get_completion(content, session, semaphore, progress_log, functions, function_call) for content in content_list])



In [24]:
# Define maximum parallel calls and timeout
max_parallel_calls = 100  # Adjust based on how many requests you want to make concurrently
timeout = 60  # Adjust timeout as per your needs

# Define functions and function call
functions = labeling_function  # Replace with your functions
function_call = {"name": "cluster_label"}

# Initialize 'content_list' if it's not already defined
content_list = []

# Loop through the unique types in the 'Attribute' column of 'cluster_df'
for type in cluster_df['Attribute'].unique():
    # Filter 'cluster_df' to get only rows with the current 'type' and loop through the clusters for that type
    for cluster in cluster_df[cluster_df['Attribute'] == type]['cluster'].unique():
        # Get the unique values for the current 'type' and 'cluster'
        values = cluster_df[(cluster_df['Attribute'] == type) & (cluster_df['cluster'] == cluster)]['Value'].unique()
        # Create the message dictionary
        messages = [{"role": "user", "content": f"{type} : {values}"}]
        content_list.append(messages)

# Wrap your main coroutine invocation in another async function.
async def main():
    responses = await get_completion_list(content_list, max_parallel_calls, timeout, functions, function_call)
    return responses

# Now you can run your code using an await expression:
responses = await main()

Done runs 1/53.
Done runs 2/53.
Done runs 3/53.
Done runs 4/53.
Done runs 5/53.
Done runs 6/53.
Done runs 7/53.
Done runs 8/53.
Done runs 9/53.
Done runs 10/53.
Done runs 11/53.
Done runs 12/53.
Done runs 13/53.
Done runs 14/53.
Done runs 15/53.
Done runs 16/53.
Done runs 17/53.
Done runs 18/53.
Done runs 19/53.
Done runs 20/53.
Done runs 21/53.
Done runs 22/53.
Done runs 23/53.
Done runs 24/53.
Done runs 25/53.
Done runs 26/53.
Done runs 27/53.
Done runs 28/53.
Done runs 29/53.
Done runs 30/53.
Done runs 31/53.
Done runs 32/53.
Done runs 33/53.
Done runs 34/53.
Done runs 35/53.
Done runs 36/53.
Done runs 37/53.
Done runs 38/53.
Done runs 39/53.
Done runs 40/53.
Done runs 41/53.
Done runs 42/53.
Done runs 43/53.
Done runs 44/53.
Done runs 45/53.
Done runs 46/53.
Done runs 47/53.
Done runs 48/53.
Done runs 49/53.
Done runs 50/53.
Done runs 51/53.
Done runs 52/53.
<RetryCallState 6323291376: attempt #1; slept for 0.12; last result: failed (TimeoutError )>
Done runs 53/53.


In [25]:
eval_responses = []
for item in responses:
    data = item['function_call']['arguments']
    eval_data = eval(data)
    eval_responses.append(eval_data['cluster_label'])

In [26]:
cluster_response_df= cluster_df.drop(columns = ['Value']).drop_duplicates()
cluster_response_df['cluster_label'] = eval_responses

In [27]:
df_with_clusters = df.merge(cluster_response_df, on = ['Attribute', 'cluster'], how = 'left')
df_with_clusters.drop(columns = ['n_tokens', 'embedding','Date', 'Author','Images'], inplace = True)

In [28]:
interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews = pd.read_csv(interim_reviews_path)

In [31]:
df_with_clusters.columns

Index(['Rating', 'URL', 'positive_sentiment', 'negative_sentiment', 'asin',
       'id', 'Review Summary', 'Attribute', 'Value', 'cluster',
       'cluster_label'],
      dtype='object')

In [33]:
reviews_with_clusters = df_with_clusters.merge(reviews[['URL', 'Date', 'Author','Verified', 'Helpful', 'Title', 'review',  'Images', 'Videos','Variation', 'Style' ]], on = ['URL'], how = 'left')

In [34]:
reviews_with_clusters_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_with_clusters.csv'
reviews_with_clusters.to_csv(reviews_with_clusters_path)

# Quantify observations

In [35]:
df_with_clusters['positive_sentiment'] = df_with_clusters['positive_sentiment'].astype(float)
df_with_clusters['negative_sentiment'] = df_with_clusters['negative_sentiment'].astype(float)


agg_result = df_with_clusters.groupby(['Attribute', 'cluster_label']).agg({
    'positive_sentiment': 'mean', 
    'negative_sentiment': 'mean',
    'Rating': lambda x: list(x),
    'id': lambda x: list(x),
    'asin': lambda x: list(x),
    'URL': lambda x: list(x),
    }).reset_index()

# Aggregate the count separately
count_result = df_with_clusters.groupby(['Attribute', 'cluster_label']).size().reset_index(name='observation_count')
attribute_clusters_with_percentage = pd.merge(agg_result, count_result, on=['Attribute', 'cluster_label'])


# Calculate the average rating
m = []
for e in attribute_clusters_with_percentage['Rating']:
    f =[]
    for r in e:
        f.append(int(r))
    m.append(np.mean(f))
k = []
for e in m:
    f = round(e,0)
    f = int(f)
    k.append(f)

attribute_clusters_with_percentage['rating_avg'] = k


total_observations_per_attribute = df_with_clusters.groupby('Attribute').size()

attribute_clusters_with_percentage = attribute_clusters_with_percentage.set_index('Attribute')  # set 'Attribute' as the index to allow for division
attribute_clusters_with_percentage['attribute_percentage'] = attribute_clusters_with_percentage['observation_count'] / total_observations_per_attribute * 100
attribute_clusters_with_percentage = attribute_clusters_with_percentage.reset_index()  # reset the index if desired


In [36]:
attribute_clusters_with_percentage.head(3)

Unnamed: 0,Attribute,cluster_label,positive_sentiment,negative_sentiment,Rating,id,asin,URL,observation_count,rating_avg,attribute_percentage
0,Buyer Motivation,Back pain relief,0.993381,0.002647,[5],[5],[B0BGND2M55],[https://www.amazon.com/gp/customer-reviews/R1...,1,5,14.285714
1,Buyer Motivation,Convenient gardening space,0.943908,0.004142,[5],[8],[B0BM5ZJWQP],[https://www.amazon.com/gp/customer-reviews/R3...,1,5,14.285714
2,Buyer Motivation,Emotional bonding with family,0.989524,0.001497,[5],[6],[B09D7QDLYC],[https://www.amazon.com/gp/customer-reviews/R1...,1,5,14.285714


In [37]:
attribute_clusters_with_percentage.columns

Index(['Attribute', 'cluster_label', 'positive_sentiment',
       'negative_sentiment', 'Rating', 'id', 'asin', 'URL',
       'observation_count', 'rating_avg', 'attribute_percentage'],
      dtype='object')

In [38]:
attribute_clusters_with_percentage_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/attribute_clusters_with_percentage.csv'
attribute_clusters_with_percentage.to_csv(attribute_clusters_with_percentage_path)