## ARCHITECTURE

- Scrape Reviews
- Create reviews lists, parse by ASIN
- Create batches per ASIN
- Process with GPT
- Create a report showing findings in a dashboard


In [1]:
import pandas as pd
import numpy as np
import promptlayer
import re
import requests
import json
import csv

import tiktoken
from typing import Dict


from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chat_models import PromptLayerChatOpenAI

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from rich.console import Console
from rich.table import Table
console = Console()

from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
WOLFRAM_ALPHA_APPID = os.getenv('WOLFRAM_ALPHA_APPID')
PROMPTLAYER_API_KEY = os.getenv('PROMPTLAYER_API_KEY')

In [81]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pre-trained BERT model for sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment_probabilities(text):
    inputs = tokenizer.encode_plus(text, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    
    # Combine probabilities for positive (4-5 stars) and negative (1-2 stars) sentiment
    positive = probabilities[0, 3] + probabilities[0, 4]
    negative = probabilities[0, 0] + probabilities[0, 1]

    return positive.item(), negative.item()

# Assuming your DataFrame is named "reviews"
def process_review(row):
    review_text = row["review"]
    positive, negative = get_sentiment_probabilities(review_text)
    return pd.Series([positive, negative])



In [4]:
def read_data(folder_path):
    reviews = pd.DataFrame()
    products = pd.DataFrame()
    asins = pd.DataFrame()
    
    for file_name in os.listdir(folder_path):
        if file_name.startswith("reviews"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            reviews = pd.concat([reviews, df])
        elif file_name.startswith("asin"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            products = pd.concat([products, df])
        elif file_name.startswith("products"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            asins = pd.concat([asins, df])
    
    return reviews, products, asins


In [98]:
def extract_asin(url):
    pattern = r'ASIN=(\w{10})'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    else:
        return None

def clean_review(review):
    return re.sub(r'[^a-zA-Z0-9\s]+', '', review)

def get_data(df, limit=3000):

    # Add the asin column to the dataframe
    df['asin'] = df['asin.original']

    # Split the dataframe into a dictionary of dataframes, one for each unique asin
    asin_dfs = {}
    for asin in df['asin'].unique():
        asin_dfs[asin] = df[df['asin'] == asin]

    # Process each asin dataframe and add it to a new dictionary
    asin_data = {}
    for asin, asin_df in asin_dfs.items():
        asin_df['review'] = asin_df['review'].apply(clean_review)
        asin_df['num_tokens'] = asin_df['review'].apply(num_tokens_from_string)
        asin_df['review'] = asin_df.apply(lambda x: x['review'][:limit * 3] if x['num_tokens'] > limit else x['review'], axis=1)
        asin_df['review_num_tokens'] = asin_df['review'].apply(num_tokens_from_string)
        asin_df = asin_df[['review', 'review_num_tokens', 'asin', 'positive_sentiment', 'negative_sentiment', 'rating']]
        asin_data[asin] = asin_df

    return asin_data


In [62]:
def extract_clean_text(df_input, column='Assistant Reply', category=None):
    # Filter DataFrame by category if category is provided and the category column exists
    if category and 'Category' in df_input.columns:
        df_input = df_input[df_input['Category'] == category]

    # Extract and clean text from specified column
    clean_text = []
    for i in range(len(df_input)):
        text = df_input[column][i]
        
        # If the text is an AIMessage object, extract content
        if hasattr(text, 'content'):
            text = text.content
        # If the text is a string representation, extract content inside the double quotes
        elif text.startswith('content="'):
            text = text[len('content="'):-1]

        # Replace '\\n' with '\n' and split the text into snippets
        text_snippet = text.replace('\\n', '\n').split('\n')
        for snippet in text_snippet:
            # Check if snippet starts with a number followed by a period and a space
            if re.match(r'\d+\. ', snippet):
                cleaned_snippet = snippet.lstrip('0123456789. ').rstrip(',')
                if cleaned_snippet:
                    clean_text.append(cleaned_snippet)

    # Create a new DataFrame with the extracted text
    df_clean_output = pd.DataFrame(clean_text, columns=['Text'])
    return df_clean_output

In [63]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [110]:
def generate_batches(improvements, max_tokens=1500):
    """
    This function takes a list of improvements and groups them into batches. Each batch has a token count
    that doesn't exceed the specified max_tokens limit. It returns a list of batches, where each batch is a
    list of improvements.
    
    Args:
        improvements (list): A list of improvement texts.
        max_tokens (int): The maximum number of tokens allowed per batch.

    Returns:
        batches (list): A list of lists, where each inner list represents a batch of improvements.
    """
    batches = []
    current_batch = []
    current_tokens = 0

    for imp in improvements:
        if isinstance(imp, float) and np.isnan(imp):  # Skip NaN values
            continue

        print(f"Processing improvement: {imp}")  # Add print statement
        imp_tokens = num_tokens_from_string(imp, encoding_name="cl100k_base")
        if current_tokens + imp_tokens + 1 <= max_tokens:
            current_batch.append(imp)
            current_tokens += imp_tokens + 1
        else:
            print(f"Batch size: {len(current_batch)}, Tokens: {current_tokens}")  # Print tokens in the current batch
            batches.append(current_batch)
            current_batch = [imp]
            current_tokens = imp_tokens
    if current_batch:
        print(f"Batch size: {len(current_batch)}, Tokens: {current_tokens}")  # Print tokens in the last batch
        batches.append(current_batch)

    print(f"Number of batches: {len(batches)}")  # Print the number of batches
    return batches

In [89]:
reviews, products, asins = read_data("/Users/vladbordei/Documents/Development/OaieAmazoniana/data/RoastingSticks")
# Apply the sentiment analysis to the "review" column
reviews[["positive_sentiment", "negative_sentiment"]] = reviews.apply(process_review, axis=1)

In [90]:
df = reviews.copy()

In [99]:
dfr = get_data(df, limit=3000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asin_df['review'] = asin_df['review'].apply(clean_review)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asin_df['num_tokens'] = asin_df['review'].apply(num_tokens_from_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asin_df['review'] = asin_df.apply(lambda x: x['review'][:limit * 3] if x['n

In [159]:
# Initialize the ChatOpenAI model
chat = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY, temperature=0)

# Define the system message
system_message = """Input: Review of a product sold on Amazon. Output either N/A or a JSON object with:
- Observed Product Characteristics
- Improvements Expected
- Reviewer Information
You review the information and delete anything that is not in the JSON response.

Example Output:
{{
   "observed_product_characteristics": [ "screen resolution", "build quality"],
   "improvements_expected": ["better battery life", "headphone jack"],
   "reviewer_information": []

   }}
}}"""

# Define the human message prompt
human_message = """Review: {inputReviews}"""

# Define the AI message prompt
ai_message = """
EITHER: 
"N/A"  when the review doesn\'t provide any information
OR:
{{ 
   "observed_product_characteristics": ["placeholder_characteristic1", "placeholder_characteristic2", "placeholder_characteristic3" or "N/A"],
   "improvements_expected": ["placeholder_improvement" or "N/A"],
   "reviewer_information": []
}}"""



# Create prompt templates
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message)
ai_message_prompt = AIMessagePromptTemplate.from_template(ai_message)

# Define the chat prompt
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt, ai_message_prompt])


In [113]:
dfr.keys()

dict_keys(['B09FPN5DHM', 'B087V7KPZY', 'B071NP7R23', 'B095GYLZYD', 'B085GKQGKQ', 'B0BV6N4TYQ', 'B07VDKMGPM', 'B098QKH9W8', 'B071SJ2MV8', 'B07VQGJ559', 'B0BQBYQH4Q', 'B07D76SCRG', 'B0B392DTP3', 'B0B1ZSFXXV', 'B07TV7QN3Z', 'B01AYESJ5Y', 'B08BPHCNN4', 'B08SQJL57H', 'B07N481H53'])

In [128]:
for key in dfr.keys():
    review_list = dfr[key]['review'].tolist()
    assistant_reply_list = []
    for review in review_list:
        # Generate the prompt
        prompt = chat_prompt.format_prompt(
            inputReviews=review
        ).to_messages()

        # Generate the chatbot response
        response = chat(prompt)
        print(response)

        # Append the AI response to the assistant_reply_list
        assistant_reply_list.append(response)

    # Create a new column called 'Assistant Reply' and assign the assistant_reply_list to it
    dfr[key]['Assistant Reply'] = assistant_reply_list

KeyboardInterrupt: 

In [160]:
review_list = dfr['B09FPN5DHM']['review'].tolist()
assistant_reply_list = []
for review in review_list:
    # Generate the prompt
    prompt = chat_prompt.format_prompt(
        inputReviews=review
    ).to_messages()

    # Generate the chatbot response
    response = chat(prompt)
    print(response)

    # Append the AI response to the assistant_reply_list
    assistant_reply_list.append(response)

# Create a new column called 'Assistant Reply' and assign the assistant_reply_list to it
dfr['B09FPN5DHM']['Assistant Reply'] = assistant_reply_list

content='Explanation: The review doesn\'t provide any information about the product\'s characteristics or any improvements expected. The reviewer also didn\'t provide any personal information, so the output should be "N/A".' additional_kwargs={}
content="Explanation: This review doesn't provide any substantial information about the product characteristics or any improvements needed. It is just describing the user's positive experience with the product. Thus we cannot extract any information that can be included in the JSON response." additional_kwargs={}


KeyboardInterrupt: 

In [120]:
response

AIMessage(content='Note: Based on the given review, some fields cannot be populated since they are not relevant or mentioned. \n\n{\n   "observed_product_characteristics": ["smores sticks", "cute", "great value"],\n   "improvements_expected": [],\n   "reviewer_information": {\n       "name": "",\n       "location": "",\n       "age": null,\n       "gender": "",\n       "occupation": ""\n   },\n   "category_keywords": ["Kitchen", "Cookware", "Outdoor Cooking"]\n}', additional_kwargs={})

In [121]:
 ai_dict = json.loads(response)

TypeError: the JSON object must be str, bytes or bytearray, not AIMessage

In [None]:

processor = ReviewsProcessor(source)
df, asin = processor.get_data()
df = processor.process_reviews(df)
reviews_dict = processor.create_review_dict(df, column_name='review', encoding_name='cl100k_base', max_tokens=processor.limit)

# create a DataFrame from the review_dict
df_reviews = pd.DataFrame.from_dict(reviews_dict, orient='index', columns=['review'])

# save the DataFrame to a CSV file
df_reviews.to_csv('reviews.csv', index_label='id')

# Read the reviews, checkpoint
reviews = pd.read_csv('reviews.csv')

In [None]:
df_problem_statement = pd.DataFrame(columns=['Category','Reviews', 'Human Message', 'Assistant Reply'])

if len(querry) > 0:
    for i in range(len(reviews)):
        for j in range(len(querry)):
            # PROMPTLAYER
            prompt = chat_prompt.format_prompt(
                inputQuestion=querry[j],
                inputReviews=reviews['review'][i]
            ).to_messages()
            results = chat(prompt) 
            print(results)

            # create a new dataframe to store the results
            df = pd.DataFrame({
                'Category': [category[j]],
                'Human Message': [querry[j]],
                'Assistant Reply': [results],
                'Reviews': [reviews['review'][i]]
            })
            
            # add the results to the main dataframe
            df_main = pd.concat([df_main, df], ignore_index=True)
            
df_main.to_csv('questions_and_answers.csv', index=False)

content="From the reviews provided, it is difficult to determine the top traits that customers appreciate the most about a product. However, some potential traits that customers appreciate are:\n\n- Easy to use\n- Gems stick well to hair and skin\n- Fun and cute addition to outfits and accessories\n- Wide variety of gems\n- Good gift for young girls\n- Can be used on multiple surfaces (hair, clothes, books, phone cases, etc.)\n- Can be removed easily without causing damage or discomfort\n\nAdditionally, some points of improvement mentioned are:\n\n- Adhesive doesn't last very long\n- Some gems are small and don't work very well with the product\n- Packaging and/or delivery was not satisfactory in some cases\n- Product can be messy or difficult to use in some situations\n- Product may not work well or could be a waste of money in some cases" additional_kwargs={}
content="Traits that customers appreciate the least based on the reviews are: \n\n- Difficult to use/Operational Issues: A few