In [53]:
import numpy as np
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import os
import openai
from dotenv import load_dotenv
# from sqlalchemy import create_engine

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt

GPT_MODEL = "gpt-3.5-turbo-0613"

OPENAI_API_KEY is ready


# Obiectiv Fisier
- identific topics, apoi clusterizez si denumesc din nou, daca e nevoie
- identific atribute asociate cu topics, apoi le clusterizez si denumesc din nou
- fiecare topic si atribut trebuie sa aibe asociate rating-ul, ID-ul review-ului si asin-ul, sentimentele asociate.
- plec la drum cu un fisier de reivews redus la minimul necesar. Acelasi fisier de reviews data va fi extins (exploded) astfel incat atributele sa fie specifice unei baze de date:

"Attribute" (exemplu: when)
si value

In [57]:
interim_reviews_path = interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews = pd.read_csv(interim_reviews_path)

In [58]:
reviews

Unnamed: 0,Date,Author,Verified,Helpful,Title,review,Rating,Images,Videos,URL,...,Durability,Ease of Use,Setup and Instructions,Noise and Smell,Colors,Size and Fit,Danger Appraisal,Design and Appearance,Parts and Components,Issues
0,2023-04-13,Michael Neiswender,yes,-,Well made raised beds.,Well made with good finish Great looking beds ...,5,-,-,https://www.amazon.com/gp/customer-reviews/RVX...,...,,,,,,,,Great looking beds,,It does take a lot of fill
1,2023-04-18,Knope,yes,-,Good value,Nice size Good value,5,-,-,https://www.amazon.com/gp/customer-reviews/R2Z...,...,,,,,,,,,,
2,2023-03-25,A. Meek,yes,-,Simply requires a lot of patience and wrist ac...,I like the product I own several raised bed pl...,5,-,-,https://www.amazon.com/gp/customer-reviews/R1K...,...,Not specified,"Challenging to assemble, but works well once e...",Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified
3,2023-03-17,Amazon Customer,yes,-,Functional and easy to assemble,Very easy to assemble The customer service is...,5,-,-,https://www.amazon.com/gp/customer-reviews/R2E...,...,,,,,,,,,,
4,2023-03-16,melodie woodall,yes,-,Perfect for a raised garden .,Its a great for a vegy garden,5,-,-,https://www.amazon.com/gp/customer-reviews/R2R...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,2022-08-06,Chari,yes,-,Simple and good.,Perfect for what I need,5,-,-,https://www.amazon.com/gp/customer-reviews/RCW...,...,,,,,,,,,,
263,2023-04-12,Helen,yes,-,Thin and flimsy,I got these gardens beds thinking they are thi...,1,-,-,https://www.amazon.com/gp/customer-reviews/R1I...,...,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,Not specified,The garden beds are flimsy and require additio...
264,2023-05-13,Diane Forsell MooneyDiane Forsell Mooney,-,-,Run away from this,This item is overpriced for the flimsy materia...,2,https://m.media-amazon.com/images/I/61l6wtqikW...,-,https://www.amazon.com/gp/customer-reviews/RZ1...,...,,Not easy to assemble,Difficult to assemble,,,,,,,"Difficult to assemble, flimsy materials"
265,2022-08-23,Amazon Customer,yes,-,Great for the price,Easy to assemble and use,5,-,-,https://www.amazon.com/gp/customer-reviews/RRH...,...,,Easy to use,Easy to assemble,,,,,,,


In [60]:
reviews.drop(columns = [ 'Verified', 'Helpful', 'Title', 'review','Videos','Variation', 'Style', 'num_tokens', 'review_num_tokens', 'eval_response'], inplace = True)

In [61]:
data_cols = ["Review Summary","Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used", "Price", "Quality", "Durability", "Ease of Use", "Setup and Instructions", "Noise and Smell", "Colors", "Size and Fit", "Danger Appraisal", "Design and Appearance", "Parts and Components", "Issues"]
for col in data_cols:
    reviews[col] = reviews[col].fillna('')
    reviews[col].replace(['\n', 'not mentioned',np.nan, '',' ', 'NA', 'N/A', 'missing', 'NaN', 'unknown', 'Not mentioned','not specified','Not specified'], 'unknown', inplace = True)

In [62]:
columns_to_pivot = ["Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used", "Price", "Quality", "Durability", "Ease of Use", "Setup and Instructions", "Noise and Smell", "Colors", "Size and Fit", "Danger Appraisal", "Design and Appearance", "Parts and Components", "Issues"]

# assume 'df' is your DataFrame
reviews_data_df = reviews.melt(id_vars=[col for col in reviews.columns if col not in columns_to_pivot], 
                    value_vars=columns_to_pivot, 
                    var_name='Attribute', 
                    value_name='Value')

In [63]:
reviews_data_df = reviews_data_df[reviews_data_df['Value'] != 'unknown']

# Clustering

In [67]:
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]


In [65]:
df = reviews_data_df

In [66]:
# omit reviews that are too long to embed
df["n_tokens"] = df['Value'].apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

# Get embeddings
df["embedding"] = df['Value'].apply(lambda x: get_embedding(x, model=embedding_model))
df["embedding"] = df["embedding"].apply(np.array)  # convert string to numpy array

In [68]:
max_n_clusters = 7
df["cluster"] = np.nan

types_list = list(reviews_data_df['Attribute'].unique())

for type in types_list:
    print(type)
    df_type = df[df['Attribute'] == type]
    n_clusters = min(max_n_clusters, len(df_type['Value'].unique()))
    if n_clusters > 2:
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
        matrix = np.vstack(df_type["embedding"].values)
        labels = clustering.fit_predict(matrix)
        df_type["cluster"] = labels
        df.loc[df['Attribute'] == type, "cluster"] = df_type["cluster"]
    else:
        df.loc[df['Attribute'] == type, "cluster"] = 0

df['cluster'] = df['cluster'].astype(int)

Buyer Motivation
Customer Expectations
How the product is used
Where the product is used
User Description
Packaging
Season
When the product is used
Price
Quality
Durability
Ease of Use
Setup and Instructions
Noise and Smell
Colors
Size and Fit
Danger Appraisal
Design and Appearance
Parts and Components
Issues


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [69]:
cluster_df  = df[['Attribute', 'cluster','Value']].drop_duplicates()

In [70]:
cluster_df.drop_duplicates()
cluster_df

Unnamed: 0,Attribute,cluster,Value
1,Buyer Motivation,4,Good value
2,Buyer Motivation,2,To have a convenient way to garden
6,Buyer Motivation,4,Great reviews
9,Buyer Motivation,0,To assist my mother with her mobility limitations
12,Buyer Motivation,1,Im bored at prepping it for spring planting
...,...,...,...
5331,Issues,0,"Missing nuts and bolts, sharp edges"
5333,Issues,2,"Defective product, flimsy support brackets, mi..."
5334,Issues,1,There were many tiny screws that made it time ...
5336,Issues,4,The garden beds are flimsy and require additio...


# Get label for the clusters

In [71]:
labeling_function = [
    {
        "name": "cluster_label",
        "description": "Provide a single label for the topic represented in the list of values.",
        "parameters": {
            "type": "object",
            "properties": {
                "cluster_label": {
                    "type": "string",
                    "description": "Provide a single label for the topic represented in the list of values. [7 words max]. Example: 'Low perceived quality versus competitors', 'Breaks easily and often', 'low sound quality','better than expected'',' "
                },
            },
            "required": ["cluster_label"]
        },
    }
]

In [72]:
import asyncio
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}

class ProgressLog:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def increment(self):
        self.done = self.done + 1

    def __repr__(self):
        return f"Done runs {self.done}/{self.total}."

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20), before_sleep=print, retry_error_callback=lambda _: None)
async def get_completion(content, session, semaphore, progress_log, functions=None, function_call=None):
    async with semaphore:
        json_data = {
            "model": GPT_MODEL,
            "messages": content,
            "temperature": 0
        }
        
        if functions is not None:
            json_data.update({"functions": functions})
        if function_call is not None:
            json_data.update({"function_call": function_call})

        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=json_data) as resp:
            response_json = await resp.json()
            progress_log.increment()
            print(progress_log)
            return response_json["choices"][0]['message']

async def get_completion_list(content_list, max_parallel_calls, timeout, functions=None, function_call=None):
    semaphore = asyncio.Semaphore(value=max_parallel_calls)
    progress_log = ProgressLog(len(content_list))

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(timeout)) as session:
        return await asyncio.gather(*[get_completion(content, session, semaphore, progress_log, functions, function_call) for content in content_list])



In [73]:
# Define maximum parallel calls and timeout
max_parallel_calls = 100  # Adjust based on how many requests you want to make concurrently
timeout = 60  # Adjust timeout as per your needs

# Define functions and function call
functions = labeling_function  # Replace with your functions
function_call = {"name": "cluster_label"}

# Initialize 'content_list' if it's not already defined
content_list = []

# Loop through the unique types in the 'Attribute' column of 'cluster_df'
for type in cluster_df['Attribute'].unique():
    # Filter 'cluster_df' to get only rows with the current 'type' and loop through the clusters for that type
    for cluster in cluster_df[cluster_df['Attribute'] == type]['cluster'].unique():
        # Get the unique values for the current 'type' and 'cluster'
        values = cluster_df[(cluster_df['Attribute'] == type) & (cluster_df['cluster'] == cluster)]['Value'].unique()
        # Create the message dictionary
        messages = [{"role": "user", "content": f"{type} : {values}"}]
        content_list.append(messages)

# Wrap your main coroutine invocation in another async function.
async def main():
    responses = await get_completion_list(content_list, max_parallel_calls, timeout, functions, function_call)
    return responses

# Now you can run your code using an await expression:
responses = await main()

Done runs 1/140.
Done runs 2/140.
Done runs 3/140.
Done runs 4/140.
Done runs 5/140.
Done runs 6/140.
Done runs 7/140.
Done runs 8/140.
Done runs 9/140.
Done runs 10/140.
Done runs 11/140.
Done runs 12/140.
Done runs 13/140.
Done runs 14/140.
Done runs 15/140.
Done runs 16/140.
Done runs 17/140.
Done runs 18/140.
Done runs 19/140.
Done runs 20/140.
Done runs 21/140.
Done runs 22/140.
Done runs 23/140.
Done runs 24/140.
Done runs 25/140.
Done runs 26/140.
Done runs 27/140.
Done runs 28/140.
Done runs 29/140.
Done runs 30/140.
Done runs 31/140.
Done runs 32/140.
Done runs 33/140.
Done runs 34/140.
Done runs 35/140.
Done runs 36/140.
Done runs 37/140.
Done runs 38/140.
Done runs 39/140.
Done runs 40/140.
Done runs 41/140.
Done runs 42/140.
Done runs 43/140.
Done runs 44/140.
Done runs 45/140.
Done runs 46/140.
Done runs 47/140.
Done runs 48/140.
Done runs 49/140.
Done runs 50/140.
Done runs 51/140.
Done runs 52/140.
Done runs 53/140.
Done runs 54/140.
Done runs 55/140.
Done runs 56/140.
D

In [74]:
eval_responses = []
for item in responses:
    data = item['function_call']['arguments']
    eval_data = eval(data)
    eval_responses.append(eval_data['cluster_label'])

In [75]:
cluster_response_df= cluster_df.drop(columns = ['Value']).drop_duplicates()
cluster_response_df['cluster_label'] = eval_responses

In [76]:
df_with_clusters = df.merge(cluster_response_df, on = ['Attribute', 'cluster'], how = 'left')
df_with_clusters.drop(columns = ['n_tokens', 'embedding','Date', 'Author','Images'], inplace = True)

In [77]:
interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews = pd.read_csv(interim_reviews_path)

In [78]:
df_with_clusters.columns

Index(['Rating', 'URL', 'positive_sentiment', 'negative_sentiment', 'asin',
       'id', 'Review Summary', 'Attribute', 'Value', 'cluster',
       'cluster_label'],
      dtype='object')

In [79]:
reviews_with_clusters = df_with_clusters.merge(reviews[['URL', 'Date', 'Author','Verified', 'Helpful', 'Title', 'review',  'Images', 'Videos','Variation', 'Style' ]], on = ['URL'], how = 'left')

In [80]:
reviews_with_clusters_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_with_clusters.csv'
reviews_with_clusters.to_csv(reviews_with_clusters_path)

# Quantify observations

In [81]:
df_with_clusters['positive_sentiment'] = df_with_clusters['positive_sentiment'].astype(float)
df_with_clusters['negative_sentiment'] = df_with_clusters['negative_sentiment'].astype(float)


agg_result = df_with_clusters.groupby(['Attribute', 'cluster_label']).agg({
    'positive_sentiment': 'mean', 
    'negative_sentiment': 'mean',
    'Rating': lambda x: list(x),
    'id': lambda x: list(x),
    'asin': lambda x: list(x),
    'URL': lambda x: list(x),
    }).reset_index()

# Aggregate the count separately
count_result = df_with_clusters.groupby(['Attribute', 'cluster_label']).size().reset_index(name='observation_count')
attribute_clusters_with_percentage = pd.merge(agg_result, count_result, on=['Attribute', 'cluster_label'])


# Calculate the average rating
m = []
for e in attribute_clusters_with_percentage['Rating']:
    f =[]
    for r in e:
        f.append(int(r))
    m.append(np.mean(f))
k = []
for e in m:
    f = round(e,0)
    f = int(f)
    k.append(f)

attribute_clusters_with_percentage['rating_avg'] = k


total_observations_per_attribute = df_with_clusters.groupby('Attribute').size()

attribute_clusters_with_percentage = attribute_clusters_with_percentage.set_index('Attribute')  # set 'Attribute' as the index to allow for division
attribute_clusters_with_percentage['attribute_percentage'] = attribute_clusters_with_percentage['observation_count'] / total_observations_per_attribute * 100
attribute_clusters_with_percentage = attribute_clusters_with_percentage.reset_index()  # reset the index if desired


In [82]:
attribute_clusters_with_percentage.head(3)

Unnamed: 0,Attribute,cluster_label,positive_sentiment,negative_sentiment,Rating,id,asin,URL,observation_count,rating_avg,attribute_percentage
0,Buyer Motivation,Bed replacement,0.662187,0.128551,"[4, 5, 3, 5, 5, 5]","[38, 40, 129, 160, 167, 233]","[B0BDLH5M3F, B0BDLH5M3F, B0BGND2M55, B09D7QDLY...",[https://www.amazon.com/gp/customer-reviews/R2...,6,4,4.615385
1,Buyer Motivation,Buyer Motivation,0.730204,0.143931,"[5, 2, 5, 5, 5, 4, 5, 5, 4, 5, 4, 1, 5, 5, 5, ...","[1, 6, 12, 13, 16, 24, 28, 34, 37, 56, 66, 84,...","[B09SQZJV67, B09SQZJV67, B09SQZJV67, B09SQZJV6...",[https://www.amazon.com/gp/customer-reviews/R2...,40,4,30.769231
2,Buyer Motivation,Motivation: Raised Bed Gardening,0.553676,0.240755,"[5, 4, 5, 3, 4, 1, 3, 5, 4, 3, 3, 4, 4, 5, 2, ...","[23, 31, 32, 49, 62, 79, 85, 88, 91, 95, 96, 9...","[B09SQZJV67, B0BDLH5M3F, B0BDLH5M3F, B0BDLH5M3...",[https://www.amazon.com/gp/customer-reviews/R3...,25,4,19.230769


In [83]:
attribute_clusters_with_percentage.columns

Index(['Attribute', 'cluster_label', 'positive_sentiment',
       'negative_sentiment', 'Rating', 'id', 'asin', 'URL',
       'observation_count', 'rating_avg', 'attribute_percentage'],
      dtype='object')

In [84]:
attribute_clusters_with_percentage_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/attribute_clusters_with_percentage.csv'
attribute_clusters_with_percentage.to_csv(attribute_clusters_with_percentage_path)