In [1]:
import numpy as np
import pandas as pd
#plotly
import plotly.express as px
import plotly.graph_objects as go
import json
import boto3
from utils import bedrock, print_ww
import os
from anthropic_bedrock import AnthropicBedrock
import anthropic_bedrock


In [2]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

In [None]:
system_prompt = """
    You are built in order to identify common issues in product comments.
    You will output a json of the following format:
    ```json {
        "issues": [
            {
                "issue": "issue name",
                "confidence": 0.5,
                "severity": "low",  # low, medium, high
                "comment": "comment text"
            },
            {
                "issue": "issue name",
                "confidence": 0.5,
                "severity": "high",
                "comment": "comment text"
            }
        ]
    }```
    
    If there are no issues give an empty list. Make the issue names general so that they can be clustered later.
    Make the comment text as short as possible while still being informative and rephrase the initial comment.
    The answer must be a valid json and only json. Include the ```json``` tag at the beginning of the answer.
    The issues must be about the product and not about the customer.
    Very import! Always use exactly 2 words for the issue name.
    """

In [None]:
def get_json_from_comment_analysis(comment_analysis):
    try:
        start = comment_analysis.find("```")
        if start == -1:
            raise ValueError("No code block found")

        end = comment_analysis.find("```", start + 3)
        if end == -1:
            raise ValueError("No closing code block found")

        json_string = comment_analysis[start + 3:end].strip()

        if json_string.startswith("json"):
            json_string = json_string[4:].strip()

        return json.loads(json_string)

    except ValueError as ve:
        raise ve
    except Exception as e:
        raise ValueError("Invalid or not JSON format") from e

In [88]:
import time
client = AnthropicBedrock(
    aws_access_key=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)

def get_json_for_comment(product_name, comment, iter = 0):
    try:
    
        completion = llm_wrapper(f"{anthropic_bedrock.HUMAN_PROMPT} {system_prompt}\n\n This is the product name: {product_name}\n\n This is the comment: {comment} {anthropic_bedrock.AI_PROMPT}")
    
        return get_json_from_comment_analysis(completion)
    except ValueError as ve:
        if iter > 5:
            raise ve
        # sleep for 5 seconds
        time.sleep(5)
        return get_json_for_comment(comment, iter + 1)
    
def llm_wrapper(text):
    return client.completions.create(
        model="anthropic.claude-v2:1",
        max_tokens_to_sample=1024,
        prompt=f"{text}"
    ).completion


In [89]:
reviews_df = pd.read_csv("Amazon_Unlocked_Mobile.csv")

min_reviews = 10
exclude_5_star = True
min_review_length = 20
max_review_length = 5000
# Reviews only of products with more than 10 reviews
reviews_df = reviews_df.groupby("Product Name").filter(lambda x: len(x) > min_reviews)
# We also ignore reviews with 5 stars
if exclude_5_star:
    reviews_df = reviews_df[reviews_df["Rating"] != 5]
# The review must be more than > 20 characters
reviews_df = reviews_df[reviews_df["Reviews"].str.len() > min_review_length]
reviews_df = reviews_df[reviews_df["Reviews"].str.len() < max_review_length]

products = reviews_df["Product Name"].unique()

In [90]:
# We do not want products with too many reviews. 
def get_number_of_reviews_to_sample(actual_reviews_number):
    if actual_reviews_number == 0:
        return 0
    if actual_reviews_number < 50:
        return actual_reviews_number
    return min(actual_reviews_number, int(np.log(actual_reviews_number) * 10))

In [91]:
def pareto_frontier(df, n):
    # Sort the DataFrame by likes (descending) and length (descending)
    sorted_df = df.sort_values(by=['Review Votes', 'Length of Reviews'], ascending=[False, False])

    # Initialize an empty list to store the indices of Pareto optimal points
    pareto_frontier_indices = []

    # Iterate through the sorted DataFrame
    for i in range(len(sorted_df)):
        # Get the current row
        row = sorted_df.iloc[i]

        # Check if the current row is dominated by any of the selected rows
        dominated = False
        for j in pareto_frontier_indices:
            selected_row = sorted_df.iloc[j]
            if selected_row['Review Votes'] >= row['Review Votes'] and selected_row['Length of Reviews'] >= row['Length of Reviews']:
                dominated = True
                break

        # If not dominated, add it to the Pareto frontier
        if not dominated:
            pareto_frontier_indices.append(i)

        # If we have enough Pareto points, break
        if len(pareto_frontier_indices) == n:
            break

    # If the Pareto frontier does not have enough points, add the next best points
    if len(pareto_frontier_indices) < n:
        additional_indices = set(range(len(sorted_df))) - set(pareto_frontier_indices)
        additional_indices = sorted(additional_indices)[:n - len(pareto_frontier_indices)]
        pareto_frontier_indices.extend(additional_indices)

    # Return the rows corresponding to the Pareto frontier and additional points
    return sorted_df.iloc[pareto_frontier_indices]

In [92]:
import warnings
warnings.filterwarnings("ignore")
def process_product_reviews(reviews_df, products):
    final_df = pd.DataFrame()
    expected_reviews = 0
    for product in products:
        # Create a copy to avoid setting values on a slice of the original DataFrame
        product_reviews = reviews_df[reviews_df["Product Name"] == product].copy()

        number_of_reviews_to_sample = get_number_of_reviews_to_sample(len(product_reviews))
        expected_reviews += number_of_reviews_to_sample
        if number_of_reviews_to_sample == 0:
            continue

        # Use .loc for safe assignment
        product_reviews.loc[:, "Length of Reviews"] = product_reviews["Reviews"].str.len()
        product_reviews.loc[:, "Review Votes"] = product_reviews["Review Votes"].fillna(0).astype(int)

        # Apply the Pareto frontier function
        selected_reviews = pareto_frontier(product_reviews, number_of_reviews_to_sample)

        # Concatenate the results
        final_df = pd.concat([final_df, selected_reviews])
        print("Expected reviews: ", expected_reviews, "Actual reviews: ", len(final_df))

    return final_df
final_df = process_product_reviews(reviews_df, products)

Expected reviews:  21 Actual reviews:  21
Expected reviews:  46 Actual reviews:  46
Expected reviews:  54 Actual reviews:  54
Expected reviews:  72 Actual reviews:  72
Expected reviews:  115 Actual reviews:  115
Expected reviews:  126 Actual reviews:  126
Expected reviews:  135 Actual reviews:  135
Expected reviews:  179 Actual reviews:  179
Expected reviews:  189 Actual reviews:  189
Expected reviews:  195 Actual reviews:  195
Expected reviews:  212 Actual reviews:  212
Expected reviews:  214 Actual reviews:  214
Expected reviews:  220 Actual reviews:  220
Expected reviews:  234 Actual reviews:  234
Expected reviews:  240 Actual reviews:  240
Expected reviews:  250 Actual reviews:  250
Expected reviews:  260 Actual reviews:  260
Expected reviews:  270 Actual reviews:  270
Expected reviews:  282 Actual reviews:  282
Expected reviews:  332 Actual reviews:  332
Expected reviews:  344 Actual reviews:  344
Expected reviews:  387 Actual reviews:  387
Expected reviews:  395 Actual reviews:  

In [93]:
# Sample a random product
product = np.random.choice(products)
product_reviews = reviews_df[reviews_df["Product Name"] == product]
print("Product:", product, "Number of reviews:", len(product_reviews))
# We pass it to the model


def get_json_for_comment_parallel(product_name, review):
    return get_json_for_comment(product_name, review)

def apply_parallel(df, func, max_workers=100):
    # Create a list to store the futures
    futures = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit a future for each row of the DataFrame
        for row in df.itertuples():
            future = executor.submit(func, row[1], row[5])  # Adjust indices as per your DataFrame structure
            futures.append(future)

        # Collect the results in the order they were submitted
        results = [future.result() for future in futures]

    return results


product_reviews["json"] =  apply_parallel(product_reviews, get_json_for_comment_parallel)

Product: HTC EVO Shift 4G Android Smartphone Blue - Sprint Number of reviews: 182


In [95]:
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=boto3_bedrock)

# Assuming bedrock_embeddings.embed_query is your embedding function

@lru_cache(maxsize=None)  # Or choose a maxsize that suits your needs
def cached_embed_query(issue):
    return bedrock_embeddings.embed_query(issue)

def generate_embeddings_parallel(issues):
    with ThreadPoolExecutor() as executor:
        embeddings = list(executor.map(cached_embed_query, issues))

    return np.array(embeddings)

In [96]:
issues_for_product = []
for review in product_reviews["json"]:
    issues_for_product.extend(review["issues"])
    
# Get all issue names and comment and combine them into one text
combined_issues_names = [issue["issue"] for issue in issues_for_product]
combined_issues_comments = [issue["comment"] for issue in issues_for_product]
embeddings_names = generate_embeddings_parallel(combined_issues_names)
embeddings_comments = generate_embeddings_parallel(combined_issues_comments)

In [97]:
# tsne 
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
tsne.fit(embeddings_names)

# Plot the results plotly do not show text on top only on hover
fig = px.scatter(x=tsne.embedding_[:, 0], y=tsne.embedding_[:, 1], hover_name=combined_issues_names)
fig.update_layout(
    height=800,
    title_text='TSNE for comments'
)
fig.show()

In [105]:
# OPTICS Clustering
from sklearn.cluster import OPTICS
clustering = OPTICS(metric='cosine').fit(embeddings_names)
labels = clustering.labels_

# Plot the results
fig = px.scatter(x=tsne.embedding_[:, 0], y=tsne.embedding_[:, 1], color=labels, hover_name=combined_issues_names)
fig.update_layout(
    height=800,
    title_text='OPTICS for comments'
)
fig.show()

In [123]:
# Get all clusters and choose a representative of the cluster which is the closest to the center of the cluster
clusters = {}
for i, label in enumerate(labels):
    if label == -1:
        continue
    if label not in clusters:
        clusters[label] = []
    clusters[label].append((combined_issues_names[i], embeddings_names[i], combined_issues_comments[i], embeddings_comments[i]))

# Get the center of the cluster
def get_cluster_center(cluster):
    return np.mean([x[1] for x in cluster], axis=0)

# Get the closest point to the center of the cluster
def get_closest_point_to_center(cluster):
    center = get_cluster_center(cluster)
    closest_point = None
    closest_distance = None
    for issue, embedding, comment, comment_embedding in cluster:
        distance = np.linalg.norm(center - embedding)
        if closest_distance is None or distance < closest_distance:
            closest_distance = distance
            closest_point = (issue, embedding, comment, comment_embedding)
    return closest_point


# Get representative issues
representative_issues = {}
for label, cluster in clusters.items():
    representative_issues[label] = get_closest_point_to_center(cluster)

print("Representative issues:")
for label, issue in representative_issues.items():
    print(issue[0], "Number of issues in cluster:", clusters[label])


Representative issues:
screen malfunction Number of issues in cluster: [('screen failure', array([-0.30664062,  0.04272461, -0.16796875, ...,  0.3359375 ,
       -0.59765625, -0.578125  ]), 'Screen went out after a week', array([ 0.47070312,  0.109375  , -0.24804688, ...,  0.56640625,
       -0.71875   , -0.8125    ])), ('screen malfunction', array([-0.33789062,  0.39453125, -0.578125  , ...,  0.5703125 ,
       -0.8046875 , -0.40234375]), 'A portion of the touch screen was unresponsive and some keyboard buttons were sticking.', array([ 0.27734375,  0.17871094, -0.359375  , ...,  0.29492188,
       -0.42578125, -0.61328125])), ('screen failure', array([-0.30664062,  0.04272461, -0.16796875, ...,  0.3359375 ,
       -0.59765625, -0.578125  ]), 'Phone screen stopped working when shifting keyboard after 3 months of use', array([ 1.515625  ,  0.20996094, -0.72265625, ...,  0.8359375 ,
       -0.41796875, -0.609375  ])), ('screen malfunction', array([-0.33789062,  0.39453125, -0.578125  , .

In [15]:
product_reviews.to_csv(f"1.csv", index=False)

In [22]:
issues_for_product = []
for review in product_reviews["json"]:
    issues_for_product.extend(review["issues"])
sorted([issue['issue'] for issue in issues_for_product])

['Battery life',
 'Battery swollen shortly after purchase',
 'Camera quality',
 'Camera quality lower than advertised',
 'Case design',
 'Phone broke easily from drop',
 'Product does not match description',
 'battery and charging problems',
 'battery life',
 'bluetooth connectivity issues',
 'call audio issues with provided headphones',
 'call audio quality',
 'camera quality',
 'camera quality',
 'connectivity issues',
 'connectivity issues',
 'connectivity problems',
 'cracked screen',
 'data connectivity',
 'data connectivity',
 'device component malfunction',
 'device not powering on',
 'inaccurate GPS location',
 'insufficient storage capacity',
 'long battery charging time',
 'long charging time',
 'long charging time',
 'manual quality',
 'no customer support response',
 'not worth the money',
 'performance',
 'performance',
 'phone freezing',
 'phone had issues',
 'phone stopped working',
 'poor camera quality',
 'poor customer service',
 'poor quality screen protector',
 'rep

In [156]:
# plot get_number_of_reviews_to_sample
x = np.arange(0, 1000)
y = [get_number_of_reviews_to_sample(i) for i in x]
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y))
fig.update_layout(title="Number of reviews to sample", xaxis_title="Number of reviews", yaxis_title="Number of reviews to sample")
fig.show()

In [None]:
lengths = reviews_df['Reviews'].apply(lambda x: len(str(x)))

# kde plot
fig = px.histogram(lengths, x="Reviews", marginal="box", nbins=100, title="Distribution of review lengths")

fig.show()

In [109]:
# Sample 10 reviews
sample_reviews = reviews_df.sample(10)
sample_reviews['analysis'] = sample_reviews.apply(lambda x: get_json_for_comment(x['Product Name'], x['Reviews']), axis=1)
sample_reviews

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,analysis
37240,Apple iPhone 5c a1532 8GB Blue Smartphone for ...,Apple,94.0,5,THE PHONE IS GREAT LOVE IT,0.0,{'issues': []}
230001,"LG Nexus 5X Unlocked Smart Phone, 5.2"" Ice Blu...",LG Electronics,325.0,5,"I love this phone, it replaced my Nexus 4.",0.0,{'issues': []}
347187,Samsung Galaxy S4 SGH-I337 Unlocked GSM Smartp...,Samsung,183.99,5,"Good seller, good cellphone",0.0,{'issues': []}
375609,"Samsung Galaxy S7, 32GB, Gold Platinum, Unlock...",Samsung,519.98,1,Phone was not unlocked as advertised. I spent ...,20.0,{'issues': [{'issue': 'product not as describe...
147653,BLU Studio C 5+5 Smartphone with Lollipop OS- ...,BLU,81.58,4,good phone,0.0,{'issues': []}
217332,"LG G3 Beat 3G, D724, 8GB, Dual Sim, (Titanium)",,399.99,5,I am pleased with this phone it has exceeded m...,0.0,{'issues': []}
234817,LG OPTIMUS L7 II DUAL P715 Factory Unlocked In...,LG,109.72,5,very good,0.0,{'issues': []}
156692,BLU Studio XL Android Smartphone - GSM Unlocke...,BLU,99.99,5,I am really enjoying my blue phone still getti...,0.0,{'issues': []}
310048,Samsung Galaxy G920F-S6 32GB Factory Unlocked ...,Samsung,434.99,3,"I give this purchase just 3 stars, the phone s...",1.0,{'issues': [{'issue': 'Phone randomly shutting...
181119,ECOOPRO Rugged Unlocked GSM Cell Phone (Gold) ...,,68.99,4,Love the phone. I have had no problems with it...,0.0,{'issues': []}


In [111]:
sample_reviews.to_csv("sample_reviews.csv", index=False)

In [70]:
# Read all files from folder comments all .txt files
import os

comments = []
# list all files in comments
for file in os.listdir("comments"):
    if file.endswith(".txt"):
        with open(os.path.join("comments", file), "r") as f:
            comments.append(f.read())
            

comment_analysis = []
for comment_sample in comments:
    comment_analysis.append(get_json_for_comment(comment_sample))

In [105]:
[print(i) for i in comment_analysis]

{'issues': [{'issue': 'audio issues', 'confidence': 0.9, 'severity': 'high', 'comment': 'Had echo issue initially and later issues with right headphone not working properly. Determined to be wiring and hardware problem with volume control module.'}, {'issue': 'durability concerns', 'confidence': 0.8, 'severity': 'medium', 'comment': 'Stopped working properly after only 2-3 months of use. Cable and hardware components seem to wear out quickly.'}]}
{'issues': [{'issue': 'Microphone not working', 'confidence': 0.9, 'severity': 'high', 'comment': "Microphone doesn't work when plugged in."}, {'issue': 'Product quality', 'confidence': 0.8, 'severity': 'medium', 'comment': 'Commenter describes product as cheaply made garbage.'}, {'issue': 'Past return window', 'confidence': 0.7, 'severity': 'medium', 'comment': 'Can no longer return the product.'}]}


[None, None]