#### STRUCTURE
- Select required review data
    - read csv
    - clean reviews
    - filter and sort reviews
    - select the number of required reviews
- Process review data with GPT
    - describe prompts
    - write tasks
    - process tasks
- Process resulting dictionary
    - map results
    - explode results to dataframe format
    - save data


In [19]:
import pandas as pd
import numpy as np
import re
import requests
import json
import csv
import openai

import tiktoken
from typing import Dict

from rich.console import Console
from rich.table import Table
console = Console()
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os

import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
WOLFRAM_ALPHA_APPID = os.getenv('WOLFRAM_ALPHA_APPID')
PROMPTLAYER_API_KEY = os.getenv('PROMPTLAYER_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# from getpass import getpass
# HUGGINGFACEHUB_API_TOKEN = getpass()


GPT_MODEL = "gpt-3.5-turbo-0613"

OPENAI_API_KEY is ready


In [2]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


In [3]:
def extract_asin(url):
    pattern = r'ASIN=(\w{10})'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    else:
        return None

def clean_review(review):
    try:
        return re.sub(r'[^a-zA-Z0-9\s]+', '', review)
    except TypeError as e:
        print(f"Error cleaning review: {e}")
        return ""



def initial_review_clean_data(df, limit=3000):
    # Add the asin column to the dataframe
    # df['asin'] = df['asin.original']

    # Process the reviews in the dataframe
    df.loc[:, 'review'] = df['review'].apply(clean_review)
    df.loc[:, 'num_tokens'] = df['review'].apply(num_tokens_from_string)
    df.loc[:, 'review'] = df.apply(lambda x: x['review'][:limit * 3] if x['num_tokens'] > limit else x['review'], axis=1)
    df.loc[:, 'review_num_tokens'] = df['review'].apply(num_tokens_from_string)

    return df



In [4]:
#asin_list_path = './data/external/asin_list.csv'
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

In [5]:
# reviews_path = './data/interim/reviews_with_sentiment.csv'
reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_with_sentiment.csv'
reviews = pd.read_csv(reviews_path)

#### THIS PART REDUCES THE REVIEW NUMBERS SO WE CAN TEST AT EASE

#### Select required review data
    - read csv
    - clean reviews
    - filter and sort reviews
    - select the number of required reviews

In [6]:
# reviews.rename(columns={'Variation': 'asin'}, inplace=True)

In [7]:
reviews['asin'] = reviews['URL'].apply(extract_asin)

In [8]:
# Get the value counts for each unique value of 'asin.original'
counts = reviews['asin'].value_counts()

# Keep only the top values
top = counts.head(1000)

# Filter the reviews DataFrame to keep only rows with asin.original in the top 10
reviews_filtered = reviews[reviews['asin'].isin(top.index)]

# Get the datetime object for 12 months ago
date_12_months_ago = datetime.today() - timedelta(days=365)

# Convert the 'Date_initial' column to datetime format
reviews_filtered['Date'] = pd.to_datetime(reviews_filtered['Date'].apply(lambda s: s.split(' on ')[-1]))

# Convert the 'date.date' column to datetime format
reviews_filtered['Date'] = pd.to_datetime(reviews_filtered['Date'])

# Filter the reviews dataframe to only include reviews from the last 12 months
reviews_last_12_months = reviews_filtered[reviews_filtered['Date'] >= date_12_months_ago]

# keep only latest  x reviews
reviews_count_filtered = reviews_last_12_months.groupby('asin').tail(3)

# reset index
reviews_count_filtered = reviews_count_filtered.reset_index(drop=True)
reviews_count_filtered["id"] = reviews_count_filtered.index


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_filtered['Date'] = pd.to_datetime(reviews_filtered['Date'].apply(lambda s: s.split(' on ')[-1]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_filtered['Date'] = pd.to_datetime(reviews_filtered['Date'])


In [9]:
reviews_df = initial_review_clean_data(reviews_count_filtered)

In [10]:
try:
    reviews_df.drop(columns = ["index", "level_0", "Author"], inplace = True)
except:
    pass

#### WRITING DOWN TASKS FOR AI TO PROCESS IN PARALLEL

#### Process review data with GPT
    - review functions


In [15]:
# https://towardsdatascience.com/an-introduction-to-openai-function-calling-e47e7cd7680e
review_functions = [
    {
        "name": "review_data_function",
        "description": "Provide a detailed description of a product",
        "parameters": {
            "type": "object",
            "properties": {
                "Review Summary": {
                    "type": "string",
                    "description": "A brief summary of the review. Example: Good product overall, but improvements can be made in battery life and noise levels."
                },
                "Topics": {
                    "type": "Object",
                    "description": '''"Topics discussed in the review. Separate the topic and the observation on the topic with a :. Example: 
                        {
                        "battery life": "installation",
                        "noise levels": "too high",
                        "ease of use": "had a hard time starting to use it",
                        "durability": "easy to break",
                        "price": "too high" 
                        }'''
                },
                "Buyer Motivation": {
                    "type": "string",
                    "description": "Reasons why the buyer purchased the product. Example: to replace an old product, to try out a new product, to give as a gift"
                },
                "Customer Expectations": {
                    "type": "string",
                    "description": "Expectations the customer had before purchasing the product. Example: to be able to use the product for a long time, to be able to use the product in a variety of situations, to be able to use the product for a specific purpose"
                },
                "How the product is used": {
                    "type": "string",
                    "description": "Information about what the product is used for or about how the product is used. Example: doodling, practicing letters/shapes, playing games"
                },
                "Where the product is used": {
                    "type": "string",
                    "description": "Suggested locations or situations where the product can be used. Example: car, restaurant, garden, public parks"
                },
                "User Description": {
                    "type": "string",
                    "description": "Description of the user for the product. Example: children, preschoolers,  basketball players, mothers, office workers"
                },
                "Packaging": {
                    "type": "string",
                    "description": "Description of the product's packaging. Example: sturdy recyclable box, wrapped in plastic, great for gifting"
                },
                "Season": {
                    "type": "string",
                    "description": "Season or time of year when the product is typically used. Example: fall and winter"
                },
                "When the product is used": {
                    "type": "string",
                    "description": "Time of day or week when the product is typically used. Example: early in the morning, in the weekend"
                }
            },
            "required": ["Review Summary", "Topics", "Buyer Motivation", "Customer Expectations", "How the product is used", "Where the product is used", "User Description", "Packaging", "Season", "When the product is used"]
        },
    }
]


#### Process review data with GPT. Run the model in async mode

In [20]:
import asyncio
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}

class ProgressLog:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def increment(self):
        self.done = self.done + 1

    def __repr__(self):
        return f"Done runs {self.done}/{self.total}."

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20), before_sleep=print, retry_error_callback=lambda _: None)
async def get_completion(content, session, semaphore, progress_log, functions=None, function_call=None):
    async with semaphore:
        json_data = {
            "model": GPT_MODEL,
            "messages": content,
            "temperature": 0
        }
        
        if functions is not None:
            json_data.update({"functions": functions})
        if function_call is not None:
            json_data.update({"function_call": function_call})

        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=json_data) as resp:
            response_json = await resp.json()
            progress_log.increment()
            print(progress_log)
            return response_json["choices"][0]['message']

async def get_completion_list(content_list, max_parallel_calls, timeout, functions=None, function_call=None):
    semaphore = asyncio.Semaphore(value=max_parallel_calls)
    progress_log = ProgressLog(len(content_list))

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(timeout)) as session:
        return await asyncio.gather(*[get_completion(content, session, semaphore, progress_log, functions, function_call) for content in content_list])



In [21]:
# Define maximum parallel calls and timeout
max_parallel_calls = 10  # Adjust based on how many requests you want to make concurrently
timeout = 600  # Adjust timeout as per your needs

# Define functions and function call
functions = review_functions  # Replace with your functions
function_call = {"name": "review_data_function"}

# Create a list of messages for all reviews
content_list = []
for id in reviews_df['id']:
    review = reviews_df[reviews_df['id'] == id]['review'].values[0]
    messages = [
        {"role": "user", "content": f"REVIEW: ```{review}```"},
    ]
    content_list.append(messages)



# Wrap your main coroutine invocation in another async function.
async def main():
    responses = await get_completion_list(content_list, max_parallel_calls, timeout, functions, function_call)
    return responses

# Now you can run your code using an await expression:
responses = await main()

Done runs 1/33.
<RetryCallState 6145814352: attempt #1; slept for 0.55; last result: failed (KeyError 'choices')>
Done runs 2/33.
<RetryCallState 6145814784: attempt #1; slept for 0.67; last result: failed (KeyError 'choices')>
Done runs 3/33.
<RetryCallState 4870410960: attempt #1; slept for 0.62; last result: failed (KeyError 'choices')>
Done runs 4/33.
<RetryCallState 6145814208: attempt #1; slept for 0.54; last result: failed (KeyError 'choices')>
Done runs 5/33.
<RetryCallState 6145815888: attempt #1; slept for 0.14; last result: failed (KeyError 'choices')>
Done runs 6/33.
<RetryCallState 6145805376: attempt #1; slept for 0.34; last result: failed (KeyError 'choices')>
Done runs 7/33.
<RetryCallState 6145812816: attempt #1; slept for 0.45; last result: failed (KeyError 'choices')>
Done runs 8/33.
<RetryCallState 6145813392: attempt #1; slept for 0.58; last result: failed (KeyError 'choices')>
Done runs 9/33.
<RetryCallState 6145815216: attempt #1; slept for 0.7; last result: fail

CancelledError: 

In [None]:
reviews_df['initial_response'] = responses

In [27]:
initial_responses = responses.copy()

In [40]:
eval_responses = []
for item in initial_responses:
    data = item['function_call']['arguments']
    eval_data = eval(data)
    eval_responses.append(eval_data)

In [41]:
reviews_df['eval_response'] = eval_responses

In [47]:
new_cols = list(reviews_df['eval_response'][3].keys())

In [58]:
for col in new_cols:
    reviews_df[col] = np.nan

In [61]:
for i in reviews_df.index:
        for col in new_cols:
                reviews_df[col][i] = reviews_df['eval_response'][i][col]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df[col][i] = reviews_df['eval_response'][i][col]


In [63]:
interim_reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_df_interim.csv'
reviews_df.to_csv(interim_reviews_path, index=False)

NameError: name 'reviews_df' is not defined