# Architecture

ASIN Input

API Reviews - 30 Reviews
API Product Info
v
Template of Product Information and Reviews. 
v
Problem Statement
v
Atomization Engine
v
Solutions Engine
v
Prototypes

In [118]:
import pandas as pd
import numpy as np
import promptlayer
import re
import requests
import json
import csv

import tiktoken
from typing import Dict


from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chat_models import PromptLayerChatOpenAI

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from rich.console import Console
from rich.table import Table
console = Console()

from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment,
# then read the API credentials this notebook uses.
load_dotenv()
# os.getenv returns None for any variable that is not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
WOLFRAM_ALPHA_APPID = os.getenv('WOLFRAM_ALPHA_APPID')
PROMPTLAYER_API_KEY = os.getenv('PROMPTLAYER_API_KEY')

In [119]:
def get_amazon_reviews(product_id, api_key="70201ee0c8ed29661bc6ae00a84341fb"):
    """Fetch the reviews of one Amazon product and save them next to the notebook.

    Args:
        product_id: Product identifier placed in the request path (the notebook
            passes an ASIN).
        api_key: Value sent as the ``api_key`` query parameter.

    Returns:
        pandas.DataFrame built from the ``reviews`` entry of the JSON response.
        The same frame is also written to ``<product_id>_reviews_sample.csv``.

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status.
        requests.Timeout: No answer within 30 seconds.
        KeyError: The JSON payload has no ``reviews`` entry.
    """
    # SECURITY(review): the default ``api_key`` above and the RapidAPI key below
    # are credentials committed in the notebook. Rotate them and supply them via
    # the environment; the literals are kept only so existing calls keep working.
    rapidapi_key = os.getenv(
        "RAPIDAPI_KEY", "4da31a08e5mshaca05d98a3d9d6ep1fffb1jsn019717508cc8"
    )

    url = f"https://h-amazon-data-scraper2.p.rapidapi.com/products/{product_id}/reviews"
    headers = {
        "X-RapidAPI-Key": rapidapi_key,
        "X-RapidAPI-Host": "h-amazon-data-scraper2.p.rapidapi.com"
    }

    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, headers=headers, params={"api_key": api_key}, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    json_data = response.json()

    if 'reviews' not in json_data:
        raise KeyError(f"'reviews' missing from API response for product {product_id}")

    # Extract the reviews from the JSON data and convert it to a DataFrame
    reviews = pd.DataFrame(json_data['reviews'])

    # Export reviews to a CSV file
    reviews.to_csv(f"{product_id}_reviews_sample.csv", index=False)

    return reviews


In [120]:
class ReviewsProcessor:
    """Load a CSV of product reviews and pack the texts into token-bounded chunks.

    ``get_data`` reads a ``Body`` column (used as the review text) and a ``URL``
    column (searched for ``ASIN=<10 word characters>``) from the CSV.
    """

    def __init__(self, source, limit=3000):
        """
        Args:
            source: Path or buffer handed to ``pandas.read_csv``.
            limit: Token count above which a single review is truncated.
        """
        self.source = source
        self.limit = limit

    @staticmethod
    def num_tokens_from_string(string: str, encoding_name: str = 'cl100k_base') -> int:
        """Return the number of tokens ``string`` encodes to under ``encoding_name``."""
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(string))

    @staticmethod
    def extract_asin(url):
        """Return the ASIN found after ``ASIN=`` in ``url``, or None.

        Non-string input (for example NaN from an empty CSV cell) yields None
        instead of raising.
        """
        if not isinstance(url, str):
            return None
        match = re.search(r'ASIN=(\w{10})', url)
        return match.group(1) if match else None

    @staticmethod
    def clean_review(review):
        """Remove every character that is not an ASCII letter, digit or whitespace."""
        return re.sub(r'[^a-zA-Z0-9\s]+', '', review)

    def get_data(self):
        """Read the CSV and return ``(reviews_frame, asin)``.

        Returns:
            reviews_frame: New DataFrame with columns ``review`` (text, cut to
                ``limit * 3`` characters when it exceeds ``limit`` tokens) and
                ``review_num_tokens`` (token count of that text). Rows whose
                ``Body`` is missing are dropped.
            asin: First ASIN that could be extracted from ``URL``, or None when
                no row provides one.
        """
        df = pd.read_csv(self.source)

        # Take the first URL that actually carries an ASIN; an empty file or a
        # first row without one no longer raises or yields None prematurely.
        asins = df['URL'].apply(self.extract_asin).dropna()
        asin = asins.iloc[0] if not asins.empty else None

        # Empty cells come back as NaN (a float) and cannot be tokenized.
        body = df['Body'].dropna().astype(str)
        num_tokens = body.apply(self.num_tokens_from_string)

        # Character-based cut: 3 characters per allowed token. This is a
        # heuristic, so the recomputed count below may still exceed ``limit``.
        char_cap = self.limit * 3
        review = body.where(num_tokens <= self.limit, body.str[:char_cap])

        # Build a fresh frame (not a slice of ``df``) so that later in-place
        # edits such as process_reviews do not trigger SettingWithCopyWarning.
        reviews_frame = pd.DataFrame({
            'review': review,
            'review_num_tokens': review.apply(self.num_tokens_from_string),
        }).reset_index(drop=True)
        return reviews_frame, asin

    def process_reviews(self, df):
        """Apply ``clean_review`` to the ``review`` column in place and return ``df``."""
        df['review'] = df['review'].apply(self.clean_review)
        return df

    def create_review_dict(self, df: pd.DataFrame, column_name: str, encoding_name: str = 'cl100k_base', max_tokens: int = 3000) -> Dict[int, str]:
        """Greedily pack consecutive reviews into chunks of at most ``max_tokens``.

        Args:
            df: Frame holding the review texts.
            column_name: Column of ``df`` to read the texts from.
            encoding_name: tiktoken encoding used to count tokens.
            max_tokens: Token budget per chunk.

        Returns:
            Dict mapping a 0-based chunk number to the reviews of that chunk
            joined by a blank line. Blank reviews are skipped. A single review
            larger than ``max_tokens`` becomes a chunk of its own, and the
            joining blank lines are not counted against the budget.
        """
        review_dict = {}
        chunk_parts = []
        chunk_tokens = 0

        for review in df[column_name]:
            if not review.strip():
                continue  # nothing to add; avoids stray separators in the chunk
            token_count = self.num_tokens_from_string(review, encoding_name)

            # Only close a chunk that has content: an oversized first review
            # must not produce an empty chunk 0.
            if chunk_parts and chunk_tokens + token_count > max_tokens:
                review_dict[len(review_dict)] = "\n\n".join(chunk_parts)
                chunk_parts = []
                chunk_tokens = 0

            chunk_parts.append(review)
            chunk_tokens += token_count

        if chunk_parts:
            review_dict[len(review_dict)] = "\n\n".join(chunk_parts)

        return review_dict

In [121]:
def extract_clean_text(df_input, column='Assistant Reply', category=None):
    """Collect the numbered list items found in a column of model replies.

    Args:
        df_input: DataFrame holding the replies.
        column: Column to read. Cells may be message objects exposing
            ``.content``, plain strings, or the string form of a message
            (``content="..." additional_kwargs={}``) as produced when such an
            object is written to CSV.
        category: When given and a ``Category`` column exists, only rows of
            that category are processed.

    Returns:
        DataFrame with one ``Text`` column, one row per line that starts with
        ``<number>. ``; the numbering and a trailing comma are removed.
    """
    # Filter DataFrame by category if category is provided and the category column exists
    if category and 'Category' in df_input.columns:
        df_input = df_input[df_input['Category'] == category]

    clean_text = []
    # Iterate over the values themselves: after filtering, the index labels are
    # no longer 0..n-1, so looking rows up by position as a label would fail.
    for text in df_input[column]:
        if hasattr(text, 'content'):
            text = text.content
        elif isinstance(text, str):
            # Unwrap `content=<quote>...<quote>` optionally followed by
            # ` additional_kwargs=...`, keeping only the quoted text.
            repr_match = re.match(
                r'content=(["\'])(.*)\1(?:\s+additional_kwargs=.*)?\Z', text, re.DOTALL
            )
            if repr_match:
                text = repr_match.group(2)
            elif text.startswith('content="'):
                text = text[len('content="'):-1]

        if not isinstance(text, str):
            continue  # e.g. NaN from an empty CSV cell

        # Replace '\\n' with '\n' and split the text into snippets
        for snippet in text.replace('\\n', '\n').split('\n'):
            # Keep only snippets that start with a number, a period and a space
            numbered = re.match(r'\d+\. +', snippet)
            if numbered:
                # Cut exactly the matched prefix so an item that itself starts
                # with a digit ("2. 3 gems missing") keeps its text intact.
                cleaned_snippet = snippet[numbered.end():].rstrip(',')
                if cleaned_snippet:
                    clean_text.append(cleaned_snippet)

    # Create a new DataFrame with the extracted text
    return pd.DataFrame(clean_text, columns=['Text'])

In [122]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Count the tokens ``string`` encodes to under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Count the tokens consumed by a list of chat messages.

    Args:
        messages: Iterable of mappings; every value of every mapping is
            encoded and counted.
        model: Model name. ``gpt-3.5-turbo`` and ``gpt-4`` are counted as
            ``gpt-3.5-turbo-0301`` and ``gpt-4-0314`` respectively (a warning
            is printed).

    Returns:
        Total token count, including the fixed per-message overhead and the
        3 tokens that prime the assistant reply.

    Raises:
        NotImplementedError: For any other model name.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Unpinned model names are delegated to a pinned snapshot.
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    if model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")

    # (tokens_per_message, tokens_per_name) for each supported snapshot.
    # 0301: every message follows <|start|>{role/name}\n{content}<|end|>\n and,
    # if there's a name, the role is omitted (hence the -1).
    overhead_by_model = {
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }
    overhead = overhead_by_model.get(model)
    if overhead is None:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    tokens_per_message, tokens_per_name = overhead

    total = 0
    for message in messages:
        total += tokens_per_message
        for field, text in message.items():
            total += len(encoding.encode(text))
            if field == "name":
                total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [None]:
def generate_batches(improvements, max_tokens=1500, token_counter=None):
    """
    This function takes a list of improvements and groups them into batches. Each batch has a token count
    that doesn't exceed the specified max_tokens limit. It returns a list of batches, where each batch is a
    list of improvements.

    Every improvement is charged its own token count plus one token for the
    separator that joins it to its neighbours. An improvement that is larger
    than max_tokens on its own still gets a batch to itself.

    Args:
        improvements (iterable): Improvement texts. Float NaN entries are skipped.
        max_tokens (int): The maximum number of tokens allowed per batch.
        token_counter (callable, optional): Function mapping a text to its token
            count. Defaults to num_tokens_from_string with the "cl100k_base" encoding.

    Returns:
        batches (list): A list of lists, where each inner list represents a batch of improvements.
    """
    if token_counter is None:
        token_counter = lambda text: num_tokens_from_string(text, encoding_name="cl100k_base")

    batches = []
    current_batch = []
    current_tokens = 0

    for imp in improvements:
        if isinstance(imp, float) and np.isnan(imp):  # Skip NaN values
            continue

        print(f"Processing improvement: {imp}")  # Add print statement
        imp_cost = token_counter(imp) + 1  # +1 for the separator, charged to every item

        # Close the running batch only if it holds something; otherwise an
        # oversized first item would push an empty batch into the result.
        if current_batch and current_tokens + imp_cost > max_tokens:
            print(f"Batch size: {len(current_batch)}, Tokens: {current_tokens}")  # Print tokens in the current batch
            batches.append(current_batch)
            current_batch = []
            current_tokens = 0

        current_batch.append(imp)
        current_tokens += imp_cost

    if current_batch:
        print(f"Batch size: {len(current_batch)}, Tokens: {current_tokens}")  # Print tokens in the last batch
        batches.append(current_batch)

    print(f"Number of batches: {len(batches)}")  # Print the number of batches
    return batches

In [None]:
def read_problem_statements(problem_statement_file):
    """Read problem statements from the first column of a CSV file.

    The first row is treated as a header and skipped. Blank rows and rows whose
    first cell is empty are ignored, and an empty file yields an empty list.

    Args:
        problem_statement_file: Path of the CSV file to read.

    Returns:
        list[str]: First-column values in file order.
    """
    print("Reading problem statements")
    # newline='' is what the csv module expects so that quoted multi-line cells
    # are parsed correctly; the encoding is pinned so the result does not depend
    # on the platform default.
    with open(problem_statement_file, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (no error on an empty file)
        return [row[0] for row in reader if row and row[0].strip()]

def write_atomized_problem_solution(atomized_problem_solutions, output_file='atomized_solution_statement.csv'):
    """Write (problem statement, solution) pairs to a CSV file.

    Args:
        atomized_problem_solutions: Iterable of ``(problem, solution)`` pairs.
        output_file: Destination path; defaults to the file name the notebook
            reads back later. An existing file is overwritten.
    """
    print("Writing atomized problem solutions")
    # newline='' lets the csv writer control line endings itself (otherwise
    # Windows output gets an empty line after every row); utf-8 keeps non-ASCII
    # text in model replies writable on every platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Problem Statement", "Solution"])  # Add a header row
        writer.writerows([problem, solution] for problem, solution in atomized_problem_solutions)

def generate_openai_solution(prompt, chat, chat_prompt):
    """Call ``chat`` with ``prompt`` and return the text of its reply.

    ``chat_prompt`` is accepted but not used by this function.
    """
    return extract_content(chat(prompt))

def extract_content(text):
    """Return the message text carried by a chat response.

    Accepted inputs:
      * an object with a ``content`` attribute -> that attribute;
      * the string form of such an object, ``content="..."`` or
        ``content='...'``, optionally followed by `` additional_kwargs=...``
        -> the quoted text only;
      * any other value -> returned unchanged.
    """
    if hasattr(text, 'content'):
        return text.content
    if not isinstance(text, str):
        return text  # nothing to unwrap (e.g. None or NaN)

    # Match the whole repr so that the trailing ` additional_kwargs={}` is
    # dropped together with the closing quote.
    repr_match = re.match(
        r'content=(["\'])(.*)\1(?:\s+additional_kwargs=.*)?\Z', text, re.DOTALL
    )
    if repr_match:
        return repr_match.group(2)
    if text.startswith('content="'):
        # Prefix present but the tail is not in the expected shape: fall back
        # to cutting the prefix and the last character.
        return text[len('content="'):-1]
    return text

In [None]:
def atomize_problem_solutions(problem_statement_file, product_description, openai_api_key):
    """Ask the chat model for a TRIZ-based solution to every problem statement.

    Args:
        problem_statement_file: CSV file read with read_problem_statements.
        product_description: Product details substituted into the prompt
            (formatted into the template as text).
        openai_api_key: API key handed to ChatOpenAI.

    Returns:
        list of ``(problem_statement, solution)`` tuples. The same pairs are
        written to disk by write_atomized_problem_solution.
    """
    print("Atomizing problem solutions")
    atomized_problem_solutions = []

    # Initialize ChatOpenAI instance
    chat = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=openai_api_key)

    # The template strings deliberately do NOT use the names SystemMessage /
    # HumanMessage, which are message classes imported at the top of the notebook.
    system_template = """You are a TRIZ Engineer and your task is to solve the following problem statement using TRIZ. If the problem cannot be solved using common knowledge, don't solve."""
    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

    human_template = """Problem Statement: {inputProblemStatement}
                    Product Description: {inputProductDescription}"""
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    ai_template = """Solution: """
    ai_message_prompt = AIMessagePromptTemplate.from_template(ai_template)

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt, ai_message_prompt])
    # The template still has unfilled variables here, so it is shown as-is;
    # messages only exist after format_prompt(...) below.
    print("Prompt: ", chat_prompt)

    # Read problem statements from file
    problem_statements = read_problem_statements(problem_statement_file)
    print("Read problem statements from file:", problem_statements)

    # Process each problem statement
    for problem_statement in problem_statements:
        print("Processing problem statement:", problem_statement)
        prompt = chat_prompt.format_prompt(
            inputProblemStatement=problem_statement,
            inputProductDescription=product_description
        ).to_messages()
        # Show the messages actually sent for this problem statement.
        print("Prompt: ", prompt)
        # Generate solution using OpenAI API
        atomized_solution = generate_openai_solution(prompt, chat, chat_prompt)
        print("Generated solution:", atomized_solution)

        # Append problem statement and solution to list
        atomized_problem_solutions.append((problem_statement, atomized_solution))

    # Write atomized problem solutions to file
    write_atomized_problem_solution(atomized_problem_solutions)
    print("Atomized problem solutions written to file")

    return atomized_problem_solutions

In [None]:
# Run the functions
# NOTE(review): the frame returned here is overwritten by the read_csv at the end
# of this cell, so only the CSV that get_amazon_reviews writes survives. The
# product id also differs from the one in the `source` file name — confirm intended.
reviews = get_amazon_reviews("B08TVXQ5S1")
source = ("B07ZKTBGR2 - Blinger Ultimate Set, Glam Collection, Comes with  2023-03-16.csv")
processor = ReviewsProcessor(source)
# Load the reviews CSV, then strip non-alphanumeric characters from each review.
df, asin = processor.get_data()
df = processor.process_reviews(df)
# Pack the cleaned reviews into chunks bounded by processor.limit tokens.
reviews_dict = processor.create_review_dict(df, column_name='review', encoding_name='cl100k_base', max_tokens=processor.limit)

# create a DataFrame from the review_dict
df_reviews = pd.DataFrame.from_dict(reviews_dict, orient='index', columns=['review'])

# save the DataFrame to a CSV file
df_reviews.to_csv('reviews.csv', index_label='id')

# Read the reviews, checkpoint
reviews = pd.read_csv('reviews.csv')

In [None]:
# Initialize the agent
openai = promptlayer.openai
chat = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)

# Questions asked about every chunk of reviews; `category` labels the answer to
# the question stored under the same key.
querry = {0: "Write the top traits that customers appreciate the most about the product. Provide a description of each trait. ",
          1: "Write what customers dislike about the product. Provide a description of each trait. ",
          2: "What are the improvements that can be brought to this product? Provide a description of each trait. "}
category = {
    0: "appreciate",
    1: "dislike",
    2: "improvements",
}

# The template strings are named *_template so they do not overwrite the
# SystemMessage / HumanMessage classes imported from langchain.schema.
system_template = """Answer the question based on the context and reviews below. You will answer with bulletpoints and extra clarity as a profesional developer. If the question cannot be answered using the information provided answer with "I don't know".

Context: You are looking at a product sold on amazon.com. We are a competing product development team. Our scope is to better understand the clients need with our product in order to improve the product. """

system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

human_template = """Question:  {inputQuestion}
                Reviews: {inputReviews} """

human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

ai_template = """Answer: """
ai_message_prompt = AIMessagePromptTemplate.from_template(ai_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt, ai_message_prompt])



In [None]:
# Ask every question about every chunk of reviews and keep one record per
# (chunk, question) pair. Records are collected in a list and turned into a
# DataFrame once, instead of concatenating frames inside the loop.
qa_records = []

for review_text in reviews['review']:
    for key, question in querry.items():
        # PROMPTLAYER
        prompt = chat_prompt.format_prompt(
            inputQuestion=question,
            inputReviews=review_text
        ).to_messages()
        results = chat(prompt)
        print(results)

        qa_records.append({
            'Category': category[key],
            'Reviews': review_text,
            'Human Message': question,
            'Assistant Reply': results,
        })

# `df_main` is the name the following cells read; it was previously used
# without ever being created. `df_problem_statement` is bound to the same frame.
df_main = pd.DataFrame(qa_records, columns=['Category', 'Reviews', 'Human Message', 'Assistant Reply'])
df_problem_statement = df_main

df_main.to_csv('questions_and_answers.csv', index=False)

content="From the reviews provided, it is difficult to determine the top traits that customers appreciate the most about a product. However, some potential traits that customers appreciate are:\n\n- Easy to use\n- Gems stick well to hair and skin\n- Fun and cute addition to outfits and accessories\n- Wide variety of gems\n- Good gift for young girls\n- Can be used on multiple surfaces (hair, clothes, books, phone cases, etc.)\n- Can be removed easily without causing damage or discomfort\n\nAdditionally, some points of improvement mentioned are:\n\n- Adhesive doesn't last very long\n- Some gems are small and don't work very well with the product\n- Packaging and/or delivery was not satisfactory in some cases\n- Product can be messy or difficult to use in some situations\n- Product may not work well or could be a waste of money in some cases" additional_kwargs={}
content="Traits that customers appreciate the least based on the reviews are: \n\n- Difficult to use/Operational Issues: A few

In [None]:
# Reload the question/answer table written by the previous step.
df_main = pd.read_csv('questions_and_answers.csv')
# NOTE(review): `extract_clean_improvements` is not defined in any cell visible
# in this notebook (the helper defined above is `extract_clean_text`) — confirm
# where it comes from; on a fresh kernel this line raises NameError without it.
improvements_df = extract_clean_improvements(df_main)
# A later cell reads this file back and uses its 'Improvements' column.
improvements_df.to_csv('improvements.txt', index=False)

In [None]:
# Tokenizer matching the chat model used in this notebook. Assigned once: a
# generic "cl100k_base" assignment immediately before it was always overwritten.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
improvements_df = pd.read_csv('improvements.txt')
# Empty CSV cells are read back as NaN; normalise them to empty strings.
improvements_df['Improvements'] = improvements_df['Improvements'].fillna('')
# After the fillna above there is nothing left for dropna to remove; the name is
# kept because it is the input of the batching step.
improvements_filtered = improvements_df['Improvements'].dropna()
# Batch once. Batching the unfiltered column first and then discarding that
# result only repeated the tokenization and doubled the log output.
improvement_batches = generate_batches(improvements_filtered)

Processing improvement: Based on the reviews, here are the potential improvements that can be brought to this product:
Processing improvement: Provide clear instructions on how to use the product.
Processing improvement: Ensure that the product matches the pictures shown on the website.
Processing improvement: Increase the quality of the product to avoid breaking or malfunctioning.
Processing improvement: Improve the adhesiveness of the gems to make them last longer.
Processing improvement: Increase the number of gems included in the package.
Processing improvement: Decrease the price to make it more accessible to customers.
Processing improvement: Improve the refilling process for the gems.
Processing improvement: Ensure that the product is eligible for return and provide a refund policy.
Processing improvement: Ensure that the product is delivered in good condition.
Processing improvement: Avoid using harmful materials that can cause harm or ear infections.
Processing improvement: Co

In [None]:
# Sanity check: the batches together should hold as many items as the table has rows.
improvements_df_length = improvements_df.shape[0]
improvements_batches_length = sum(map(len, improvement_batches))

print(f"Length of improvements_df: {improvements_df_length}")
print(f"Length of improvement_batches: {improvements_batches_length}")

Length of improvements_df: 77
Length of improvement_batches: 77


In [None]:
# TODO: the query needs to be improved —
# too much of the problem statement is lost in summarization.

querry = "What are the distinct problem statements that can be solved by an engineering team based on the following improvements?"

# The template strings are named *_template so they do not overwrite the
# SystemMessage / HumanMessage classes imported from langchain.schema.
system_template = """Answer the question based on the context and improvements below. You will answer with bulletpoints and extra clarity as a professional developer. If the question cannot be answered using the information provided, answer with "I don't know".

Context: You are looking at a product sold on amazon.com. We are a competing product development team. Our scope is to better understand the clients' needs with our product in order to improve it. """

system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

human_template = """Question:  {inputQuestion}
                Improvements: {inputImprovements} """

human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

ai_template = """Answer: """
ai_message_prompt = AIMessagePromptTemplate.from_template(ai_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt, ai_message_prompt])

In [None]:
# Send each batch of improvements to the chat model and keep one record per
# batch. Records are gathered in a list and converted to a DataFrame once,
# rather than concatenating a new frame on every iteration (which also
# overwrote the notebook-level `df`).
problem_records = []

for batch in improvement_batches:
    improvements_text = "\n".join(batch)

    # PROMPTLAYER
    prompt = chat_prompt.format_prompt(
        inputQuestion=querry,
        inputImprovements=improvements_text
    ).to_messages()
    results = chat(prompt)
    print(results)

    problem_records.append({
        'Improvements': improvements_text,
        'Assistant Reply': results,
    })

df_small_problem_statement = pd.DataFrame(problem_records, columns=['Improvements', 'Assistant Reply'])

df_small_problem_statement.to_csv('problem_statements.csv', index=False)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).


KeyboardInterrupt: 

In [50]:
# Reload the model replies saved by the previous step.
df_small_problem_statement = pd.read_csv('problem_statements.csv')
# Keep only the numbered list items of each reply, one per row in a 'Text' column.
atomized_problem_statement = extract_clean_text(df_small_problem_statement, column='Assistant Reply')
print(atomized_problem_statement)
# Saved without the index; this file is the input of atomize_problem_solutions below.
atomized_problem_statement.to_csv('atomized_problem_statement.csv', index=False)

                                                 Text
0                          Lack of Clear Instructions
1   Product does not match the pictures shown on t...
2                          Low Quality and Durability
3                                Lack of Adhesiveness
4                        Shortage of Gems and Refills
5                                          High Price
6                               Refill Process Hassle
7                               Lack of Return Policy
8                                      Poor Packaging
9                          Unsuitability for Children
10                                  Harmful Materials
11                    Utilization of Sticky Materials


In [89]:
# Prompt used to obtain the "Assistant Intuition" text below:
# Describe this product:
# - components
# - materials
# - how it's used
# <Product Title>


# Product details passed to atomize_problem_solutions as the product description:
# the listing title ("Name"), the listing bullet points ("Bullets") and a
# free-text description of components, materials and usage ("Assistant Intuition").
ProductDescription = {
"Name": "Blinger Ultimate Set, Glam Collection, Comes with Glam Styling Tool & 225 Gems - Load, Click, Bling! Hair, Fashion, Anything! (Amazon Exclusive)",
"Bullets": [
"Blinger Ultimate Set, Glam Collection, Comes with Glam Styling Tool & 225 Gems - Load, Click, BLING. Hair, Fashion, ANYTHING. (Exclusive)",
"Blinger is the new Glam Styling Tool that allows you to Load, Click, BLING – Hair, Fashion ANYTHING.",
"With 225 adhesive gems included & multiple colors to choose from, Blinger makes it easy for you to glam it up and add sparkle to your life.",
"The Blinger Glam Collection is a fashionista’s dream – finally, a styling tool that allows you add all the BLING to your everyday life and create your own striking looks.",
"It’s totally safe and easy to use, however Blinger is a styling tool and there is always a chance it needs a little tune up or fix. If you experience any issue please contact our customer service and it will be our pleasure to help fix or replace."
],
"Assistant Intuition": "The Blinger Ultimate Set, Glam Collection, is a hair and fashion accessory kit that allows you to add sparkling gems to your hair, clothing, and accessories. The set includes a Glam Styling Tool and 225 gems in various shapes and colors.\n\nThe components of the set include the Glam Styling Tool, which is a handheld device that you load with gems and click to apply them to your hair or clothing. The set also includes 225 gems, which come in different shapes such as stars, hearts, and circles, and in various colors like pink, blue, green, and silver.\n\nThe materials used in the set include plastic for the Glam Styling Tool and metal and rhinestones for the gems. The gems are made with high-quality materials that are durable and long-lasting.\n\nTo use the Blinger Ultimate Set, you load the gems into the Glam Styling Tool and click the button to apply them to your hair or clothing. The set is designed to be used on any type of hair or fabric, making it a versatile accessory that can be used to add some sparkle and glam to any outfit or hairstyle. The Blinger Ultimate Set is an Amazon Exclusive product, which means that it can only be purchased on Amazon's online store."
}

In [116]:
# Inputs: the atomized problem statements saved earlier and the product details dict.
problem_statement_file = "atomized_problem_statement.csv"
product_description = ProductDescription

# Generates one solution per problem statement and writes the pairs to
# atomized_solution_statement.csv (the function prints its own completion message too).
atomized_problem_solutions = atomize_problem_solutions(problem_statement_file, product_description, OPENAI_API_KEY)
print("Atomized problem solutions written to atomized_solution_statement.csv")

Atomizing problem solutions
Reading problem statements
Read problem statements from file: ['Text', 'Low Quality and Durability', 'Lack of Adhesiveness', 'Shortage of Gems and Refills', 'High Price']
Processing problem statement: Text
Prompt:  [SystemMessage(content="You are a TRIZ Engineer and your task is to solve the following problem statement using TRIZ. If the problem cannot be solved using common knowledge, don't solve.", additional_kwargs={}), HumanMessage(content='Problem Statement: Text\n                    Product Description: {\'Name\': \'Blinger Ultimate Set, Glam Collection, Comes with Glam Styling Tool & 225 Gems - Load, Click, Bling! Hair, Fashion, Anything! (Amazon Exclusive)\', \'Bullets\': [\'Blinger Ultimate Set, Glam Collection, Comes with Glam Styling Tool & 225 Gems - Load, Click, BLING. Hair, Fashion, ANYTHING. (Exclusive)\', \'Blinger is the new Glam Styling Tool that allows you to Load, Click, BLING – Hair, Fashion ANYTHING.\', \'With 225 adhesive gems included

NameError: name 'atomized_solution' is not defined

In [106]:
# Load the saved problem/solution pairs; the bare name on the last line makes
# the notebook display the table.
AtomizedSolutions = pd.read_csv('atomized_solution_statement.csv')
AtomizedSolutions