In [81]:
import numpy as np
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import os
import openai
from dotenv import load_dotenv
# from sqlalchemy import create_engine

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt

GPT_MODEL = "gpt-3.5-turbo-0613"

OPENAI_API_KEY is ready


# Obiectiv Fisier
- identific topics, apoi clusterizez si denumesc din nou, daca e nevoie
- identific atribute asociate cu topics, apoi le clusterizez si denumesc din nou
- fiecare topic si atribut trebuie sa aibe asociate rating-ul, ID-ul review-ului si asin-ul, sentimentele asociate.
- plec la drum cu un fisier de reivews redus la minimul necesar. Acelasi fisier de reviews data va fi extins (exploded) astfel incat atributele sa fie specifice unei baze de date:

"Attribute" (exemplu: when)
si value

In [30]:
interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews = pd.read_csv(interim_reviews_path)

In [31]:
reviews.drop(columns = [ 'Verified', 'Helpful', 'Title', 'review','Videos','Variation', 'Style', 'num_tokens', 'review_num_tokens','initial_response', 'eval_response'], inplace = True)

In [32]:
data_cols = ["Review Summary","Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used", "Price", "Quality", "Durability", "Ease of Use", "Setup and Instructions", "Noise and Smell", "Colors", "Size and Fit", "Danger Appraisal", "Design and Appearance", "Parts and Components"]
for col in data_cols:
    reviews[col] = reviews[col].fillna('')
    reviews[col].replace(['\n', 'not mentioned',np.nan, '',' ', 'NA', 'N/A', 'missing', 'NaN', 'unknown', 'Not mentioned','not specified','Not specified'], 'unknown', inplace = True)

In [33]:
columns_to_pivot = data_cols

# assume 'df' is your DataFrame
reviews_data_df = reviews.melt(id_vars=[col for col in reviews.columns if col not in columns_to_pivot], 
                    value_vars=columns_to_pivot, 
                    var_name='Attribute', 
                    value_name='Value')

In [34]:
reviews_data_df = reviews_data_df[reviews_data_df['Value'] != 'unknown']

In [35]:
reviews_data_df

Unnamed: 0,Date,Author,Rating,Images,URL,positive_sentiment,negative_sentiment,asin,id,Attribute,Value
0,2023-04-07,Amazon Customer,5,-,https://www.amazon.com/gp/customer-reviews/R3L...,0.969480,0.004882,B09SQZJV67,0,Review Summary,Very good and easy to put up
1,2023-03-01,Dan,5,-,https://www.amazon.com/gp/customer-reviews/R3L...,0.994441,0.002020,B0BDLH5M3F,1,Review Summary,Absolutely in love with these
2,2023-04-06,Karen Robertson,5,-,https://www.amazon.com/gp/customer-reviews/RKV...,0.986625,0.001518,B09K3557Q8,2,Review Summary,Good size and sturdy
3,2023-02-22,Andrew,3,-,https://www.amazon.com/gp/customer-reviews/R3I...,0.774938,0.053037,B0BG2HP1PJ,3,Review Summary,"Great bed for the price, but better options av..."
4,2023-05-20,Jose Perez,5,-,https://www.amazon.com/gp/customer-reviews/R2W...,0.203367,0.178510,B0BRXBH6L2,4,Review Summary,"Took a few mins to setup, but measurements wer..."
...,...,...,...,...,...,...,...,...,...,...,...
179,2023-02-22,Andrew,3,-,https://www.amazon.com/gp/customer-reviews/R3I...,0.774938,0.053037,B0BG2HP1PJ,3,Size and Fit,24 inches is a good height for gardening
192,2023-03-27,DeSanne Fluitt,5,-,https://www.amazon.com/gp/customer-reviews/R1Y...,0.993381,0.002647,B0BGND2M55,5,Danger Appraisal,Safe
205,2023-03-18,zlajoie,5,-,https://www.amazon.com/gp/customer-reviews/RKJ...,0.990466,0.001663,B0BJZ9J4HD,7,Design and Appearance,Looks awesome
206,2023-05-16,Danese CameronDanese Cameron,5,https://m.media-amazon.com/images/I/81jHLvy7t6...,https://www.amazon.com/gp/customer-reviews/R35...,0.943908,0.004142,B0BM5ZJWQP,8,Design and Appearance,"Square design, fits well next to a garden shed..."


# Clustering

In [38]:
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)
    
def get_text_from_embedding(embedding):
    return openai.Embedding.retrieve(embedding, model="text-embedding-ada-002")["data"][0]["text"]


In [60]:
df = reviews_data_df

In [61]:
# omit reviews that are too long to embed
df["n_tokens"] = df['Value'].apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

# Get embeddings
df["embedding"] = df['Value'].apply(lambda x: get_embedding(x, engine=embedding_model))
df["embedding"] = df["embedding"].apply(np.array)  # convert string to numpy array
matrix = np.vstack(df.embedding.values)

In [77]:
max_n_clusters = 7
df["cluster"] = np.nan

types_list = list(reviews_data_df['Attribute'].unique())

for type in types_list:
    print(type)
    df_type = df[df['Attribute'] == type]
    n_clusters = min(max_n_clusters, len(df_type['Value'].unique()))
    if n_clusters > 2:
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
        matrix = np.vstack(df_type["embedding"].values)
        labels = clustering.fit_predict(matrix)
        df_type["cluster"] = labels
        df.loc[df['Attribute'] == type, "cluster"] = df_type["cluster"]
    else:
        df.loc[df['Attribute'] == type, "cluster"] = 0
df['cluster'] = df['cluster'].astype(int)

Review Summary
Buyer Motivation
Customer Expectations
How the product is used
Where the product is used
User Description
Season
When the product is used
Price
Quality
Durability
Ease of Use
Setup and Instructions
Size and Fit
Danger Appraisal
Design and Appearance
Parts and Components


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_type["cluster"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [78]:
# Get values for each cluster, write in a dictionary
cluster_dict = {}
for type in types_list:
    cluster_dict[type] = {}
    for cluster in df[df['Attribute'] == type]['cluster'].unique():
        cluster_dict[type][cluster] = list(df[(df['Attribute'] == type) & (df['cluster'] == cluster)]['Value'].unique())

cluster_dict

{'Review Summary': {0: ['Very good and easy to put up',
   'Good size and sturdy',
   'Easy to setup and looks awesome'],
  5: ['Absolutely in love with these'],
  3: ['Great bed for the price, but better options available at higher prices'],
  6: ['Took a few mins to setup, but measurements were correct'],
  1: ['Perfect for vegetable beds',
   'Very pleased with the raised bed',
   'Great for gardening'],
  2: ['Difficult to assemble'],
  4: ['Wonderful flower boxes that survived hurricane winds']},
 'Buyer Motivation': {5: ['To get things growing'],
  6: ['To have a raised bed for gardening'],
  2: ['To save my back'],
  4: ['To grow crops with my granddaughter'],
  3: ['To have a convenient gardening space'],
  1: ['To have a handy tool'],
  0: ['To plant perennials']},
 'Customer Expectations': {3: ['To have a good size and sturdy product'],
  6: ['To have a durable bed that will last for a few years'],
  5: ['To help tremendously with gardening'],
  4: ['To have a durable and eas

# Get label for the clusters

In [53]:
labeling_function = [
    {
        "name": "cluster_label",
        "description": "Provide a single label for the topic represented in the list of values.",
        "parameters": {
            "type": "object",
            "properties": {
                "cluster_label": {
                    "type": "string",
                    "description": "Provide a single label for the topic represented in the list of values. [7 words max]. Example: 'Low perceived quality versus competitors', 'Breaks easily and often', 'low sound quality','better than expected'',' "
                },
            },
            "required": ["cluster_label"]
        },
    }
]

In [79]:
import asyncio
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}

class ProgressLog:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def increment(self):
        self.done = self.done + 1

    def __repr__(self):
        return f"Done runs {self.done}/{self.total}."

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20), before_sleep=print, retry_error_callback=lambda _: None)
async def get_completion(content, session, semaphore, progress_log, functions=None, function_call=None):
    async with semaphore:
        json_data = {
            "model": GPT_MODEL,
            "messages": content,
            "temperature": 0
        }
        
        if functions is not None:
            json_data.update({"functions": functions})
        if function_call is not None:
            json_data.update({"function_call": function_call})

        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=json_data) as resp:
            response_json = await resp.json()
            progress_log.increment()
            print(progress_log)
            return response_json["choices"][0]['message']

async def get_completion_list(content_list, max_parallel_calls, timeout, functions=None, function_call=None):
    semaphore = asyncio.Semaphore(value=max_parallel_calls)
    progress_log = ProgressLog(len(content_list))

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(timeout)) as session:
        return await asyncio.gather(*[get_completion(content, session, semaphore, progress_log, functions, function_call) for content in content_list])



In [82]:
# Define maximum parallel calls and timeout
max_parallel_calls = 100  # Adjust based on how many requests you want to make concurrently
timeout = 60  # Adjust timeout as per your needs

# Define functions and function call
functions = labeling_function  # Replace with your functions
function_call = {"name": "cluster_label"}

# Create a list of messages for all reviews
content_list = []

# iterate in cluster dict cluster_dict
for type in cluster_dict:
    for cluster in cluster_dict[type]:
        messages = [
            {"role": "user", "content": f"{type} {cluster_dict[type][cluster]}"},
        ]
        content_list.append(messages)

# Wrap your main coroutine invocation in another async function.
async def main():
    responses = await get_completion_list(content_list, max_parallel_calls, timeout, functions, function_call)
    return responses

# Now you can run your code using an await expression:
responses = await main()

Done runs 1/60.
Done runs 2/60.
Done runs 3/60.
Done runs 4/60.
Done runs 5/60.
Done runs 6/60.
Done runs 7/60.
Done runs 8/60.
Done runs 9/60.
Done runs 10/60.
Done runs 11/60.
Done runs 12/60.
Done runs 13/60.
Done runs 14/60.
Done runs 15/60.
Done runs 16/60.
Done runs 17/60.
Done runs 18/60.
Done runs 19/60.
Done runs 20/60.
Done runs 21/60.
Done runs 22/60.
Done runs 23/60.
Done runs 24/60.
Done runs 25/60.
Done runs 26/60.
Done runs 27/60.
Done runs 28/60.
Done runs 29/60.
Done runs 30/60.
Done runs 31/60.
Done runs 32/60.
Done runs 33/60.
Done runs 34/60.
Done runs 35/60.
Done runs 36/60.
Done runs 37/60.
Done runs 38/60.
Done runs 39/60.
Done runs 40/60.
Done runs 41/60.
Done runs 42/60.
Done runs 43/60.
Done runs 44/60.
Done runs 45/60.
Done runs 46/60.
Done runs 47/60.
Done runs 48/60.
Done runs 49/60.
Done runs 50/60.
Done runs 51/60.
Done runs 52/60.
Done runs 53/60.
Done runs 54/60.
Done runs 55/60.
Done runs 56/60.
Done runs 57/60.
Done runs 58/60.
Done runs 59/60.
Done r

In [83]:
responses

[{'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Positive reviews"\n}'}},
 {'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Absolutely in love with these"\n}'}},
 {'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Mixed reviews on value for price"\n}'}},
 {'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Accurate measurements after setup"\n}'}},
 {'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Positive reviews for vegetable beds"\n}'}},
 {'role': 'assistant',
  'content': None,
  'function_call': {'name': 'cluster_label',
   'arguments': '{\n  "cluster_label": "Difficult to assemble"\n}'}},
 {'role': 'assista

In [49]:
def get_chatbot_trait_labels(clusters_dict, temperature=0.2, api_key=OPENAI_API_KEY):
    
    User_Prompt_1 = """
    I have a list of phrases, each related to a specific theme. Provide a single label for the theme represented in the list.
    List of phrases: {99: ['improved magnet pushability', ' improved magnet strength and functionality', 'improved magnet strength and quality']}
    """

    AI_Prompt_1 = """
    Magnet Strength and Functionality
    """

    chatbot_responses = {}

    for key, data_list in clusters_dict.items():
        User_Prompt_2 = f"List of phrases: {{ {key}: {data_list} }}"

        # Send the prompt to the chatbot and get the response
        response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": User_Prompt_1},
                        {"role": "assistant", "content": AI_Prompt_1},
                        {"role": "user", "content": User_Prompt_2} ],
                    temperature=temperature,
                    api_key=api_key
        )
    
        # Process the response and store in the dictionary
        chatbot_responses[key] = response["choices"][0]["message"]["content"]
        print(chatbot_responses[key])
    
    return chatbot_responses

#%%


Unnamed: 0,Date,Rating,Images,URL,positive_sentiment,negative_sentiment,asin,id,Attribute,Value,n_tokens,embedding,cluster
12,2023-03-01,5,-,https://www.amazon.com/gp/customer-reviews/R3L...,0.994441,0.00202,B0BDLH5M3F,1,Topics,"{""durability"": ""Very durable"", ""size"": ""Strong...",16,"[0.026560181751847267, 0.017014682292938232, -...",0
14,2023-02-22,3,-,https://www.amazon.com/gp/customer-reviews/R3I...,0.774938,0.053037,B0BG2HP1PJ,3,Topics,"{""price"": ""fair"", ""durability"": ""thick and hea...",50,"[0.01312297210097313, 0.013830996118485928, -0...",6
15,2023-05-20,5,-,https://www.amazon.com/gp/customer-reviews/R2W...,0.203367,0.17851,B0BRXBH6L2,4,Topics,"{""setup"": ""took a few mins"", ""size"": ""tall"", ""...",35,"[-0.006674978882074356, 0.013674555346369743, ...",4
16,2023-03-27,5,-,https://www.amazon.com/gp/customer-reviews/R1Y...,0.993381,0.002647,B0BGND2M55,5,Topics,"back support, durability",4,"[0.025533046573400497, 0.002915936056524515, -...",3
17,2023-03-27,5,-,https://www.amazon.com/gp/customer-reviews/R1X...,0.989524,0.001497,B09D7QDLYC,6,Topics,"{""build quality"": ""well built"", ""ease of assem...",61,"[-0.004094177857041359, 0.01320925634354353, -...",5
19,2023-05-16,5,https://m.media-amazon.com/images/I/81jHLvy7t6...,https://www.amazon.com/gp/customer-reviews/R35...,0.943908,0.004142,B0BM5ZJWQP,8,Topics,"{""design"": ""square design"", ""fit"": ""fit perfec...",47,"[0.004543479532003403, 0.022863510996103287, -...",1
20,2023-06-02,2,-,https://www.amazon.com/gp/customer-reviews/R2M...,0.003215,0.937854,B0BHS6YL2F,9,Topics,"{""assembly"": ""difficult""}",7,"[-0.025849051773548126, 0.027015499770641327, ...",2
21,2022-12-13,5,-,https://www.amazon.com/gp/customer-reviews/R3T...,0.994206,0.001794,B0BNPLKNJB,10,Topics,"{""durability"": ""survived hurricane winds""}",11,"[-0.007426663301885128, 0.017074916511774063, ...",0


In [None]:
trait_labels = get_chatbot_trait_labels(clusters_dict,temperature=0.2)
df['cluster_label'] = df['cluster'].map(trait_labels)
write_cluster_labels(df, type = type, data_table = 'weighted_trait_graph' )
write_cluster_labels(df, type = type, data_table = 'weighted_trait_heatmap' )