#### STRUCTURE
- Select required review data
    - read csv
    - clean reviews
    - filter and sort reviews
    - select the number of required reviews
- Process review data with GPT
    - describe prompts
    - write tasks
    - process tasks
- Process resulting dictionary
    - map results
    - explode results to dataframe format
    - save data


In [1]:
import pandas as pd
import numpy as np
import re
import requests
import json
import csv
import openai

import tiktoken
from typing import Dict

from rich.console import Console
from rich.table import Table
console = Console()
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
WOLFRAM_ALPHA_APPID = os.getenv('WOLFRAM_ALPHA_APPID')
PROMPTLAYER_API_KEY = os.getenv('PROMPTLAYER_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# from getpass import getpass
# HUGGINGFACEHUB_API_TOKEN = getpass()

OPENAI_API_KEY is ready


In [None]:
asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

In [5]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo")
    elif model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [6]:
def extract_asin(url):
    pattern = r'ASIN=(\w{10})'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    else:
        return None

def clean_review(review):
    try:
        return re.sub(r'[^a-zA-Z0-9\s]+', '', review)
    except TypeError as e:
        print(f"Error cleaning review: {e}")
        return ""

def get_data_dict(df, limit=3000):
    # Add the asin column to the dataframe
    df['asin'] = df['asin.original']

    # Split the dataframe into a dictionary of dataframes, one for each unique asin
    asin_dfs = {}
    for asin in df['asin'].unique():
        asin_dfs[asin] = df[df['asin'] == asin]

    # Process each asin dataframe and add it to a new dictionary
    asin_data = {}
    for asin, asin_df in asin_dfs.items():
        asin_df.loc[:, 'review'] = asin_df['review'].apply(clean_review)
        print(asin_df['review'])
        asin_df.loc[:, 'num_tokens'] = asin_df['review'].apply(num_tokens_from_string)
        asin_df.loc[:, 'review'] = asin_df.apply(lambda x: x['review'][:limit * 3] if x['num_tokens'] > limit else x['review'], axis=1)
        asin_df.loc[:, 'review_num_tokens'] = asin_df['review'].apply(num_tokens_from_string)
        asin_df = asin_df.loc[:, ['id', 'review', 'review_num_tokens', 'asin', 'positive_sentiment', 'negative_sentiment', 'rating']].copy()
        asin_data[asin] = asin_df.copy()

    return asin_data

def initial_review_clean_data(df, limit=3000):
    # Add the asin column to the dataframe
    df['asin'] = df['asin.original']

    # Process the reviews in the dataframe
    df.loc[:, 'review'] = df['review'].apply(clean_review)
    df.loc[:, 'num_tokens'] = df['review'].apply(num_tokens_from_string)
    df.loc[:, 'review'] = df.apply(lambda x: x['review'][:limit * 3] if x['num_tokens'] > limit else x['review'], axis=1)
    df.loc[:, 'review_num_tokens'] = df['review'].apply(num_tokens_from_string)

    return df



In [53]:
reviews = pd.read_csv("reviews_with_sentiment.csv")

#### THIS PART REDUCES THE REVIEW NUMBERS SO WE CAN TEST AT EASE

#### Select required review data
    - read csv
    - clean reviews
    - filter and sort reviews
    - select the number of required reviews

In [55]:
# Get the value counts for each unique value of 'asin.original'
counts = reviews['asin.original'].value_counts()

# Keep only the top 5 values
top = counts.head(10)

# Filter the reviews DataFrame to keep only rows with asin.original in the top 10
reviews_filtered = reviews[reviews['asin.original'].isin(top.index)]

# Get the datetime object for 12 months ago
date_12_months_ago = datetime.today() - timedelta(days=365)

# Convert the 'date.date' column to datetime format
reviews_filtered['date.date'] = pd.to_datetime(reviews_filtered['date.date'])

# Filter the reviews dataframe to only include reviews from the last 12 months
reviews_last_12_months = reviews_filtered[reviews_filtered['date.date'] >= date_12_months_ago]

# keep only latest  x reviews
reviews_count_filtered = reviews_last_12_months.groupby('asin.original').tail(150)

# reset index
reviews_count_filtered = reviews_count_filtered.reset_index(drop=True)
reviews_count_filtered["id"] = reviews_count_filtered.index


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_filtered['date.date'] = pd.to_datetime(reviews_filtered['date.date'])


In [56]:
reviews_df = initial_review_clean_data(reviews_count_filtered)

#### WRITING DOWN TASKS FOR AI TO PROCESS IN PARALLEL

#### Process review data with GPT
    - describe prompts


In [57]:
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb

User_Prompt_1 = """
Format your response as a JSON object with: \
{\
"Review Summary",\
"Observed Technical Facts"\
"Improvements expected / what users request",\
"Complaints and issues users have experienced",
"How the product is used",\
"Where the product is used",\
"Sentiment", \
"Anger",\
"Anger Reason",\
"Delight",\
"Delight Reason",\
"Disappointment",\
"Disappointment Reason",\
"Time",\
"Season",\
"Weather",\
"User Known",\
"User Description"\
}

Product REVIEW from an ecommerce site \
delimited with triple backticks. ``` \
If information isn't present, use "unknown" value. \
Review Summary : summary to give feedback to the \
product development department, responsible for improving the \
product. 30 words max. \
Anger, Delight, Disappointment, User Known values as "true" of "false".\
Issues identified: Complaints and issues users have experienced.\
Improvements Expected: What users request to be improved and should be known by the \
product development department.\

REVIEW: ```\
I purchased these headphones and have been using them in the morning for listening to music, \
watching movies, and attending online meetings. \
The Bluetooth connectivity is efficient and the headphones are comfortable to wear. \
However, the battery life is shorter than advertised and the noise-canceling feature is slightly loud. \
The packaging was good and secure, and the headphones are suitable for use at home, \
in the office, and on public transportation. Overall, a good product, \
but improvements can be made in battery life and noise levels.
```"""


AI_Prompt_1 = """\
{\
"Review Summary": "Good product overall, but improvements can be made in battery life and noise levels.",\
"Observed Technical Facts": "Bluetooth connectivity, 40-hour battery life, noise-canceling feature",\
"Improvements expected / what users request": "Extended battery life, quieter operation",\
"Complaints and issues users have experienced": "Shorter battery life than advertised, slightly loud noise-canceling feature",\
"How the product is used": "Listening to music, watching movies, attending online meetings",\
"Where the product is used": "Home, office, public transportation",\
"Sentiment": "Mixed",\
"Anger": "No",\
"Anger Reason": "N/A",\
"Delight": "Yes",\
"Delight Reason": "Efficient Bluetooth connectivity, comfortable fit",\
"Disappointment": "Yes",\
"Disappointment Reason": "Shorter battery life, noisy noise-canceling feature",\
"Time": "morning",\
"Season": "Winter",\
"Weather": "Cold and snowy",\
"User Known": "No",\
"User Description": "Regular user who enjoys music and works from home"\
}"""

#### Process review data with GPT
    - write tasks

In [58]:
import json

filename = "data/requests_to_parallel_process.jsonl"

# Clear the contents of the files before starting the loop
with open(filename, "w") as f:
    pass


# Iterate over the keys in the reviews_dataframes_dict dictionary
with open(filename, "a") as f:
    for id in reviews_df['id']:
        review = reviews_df[reviews_df['id'] == id]['review'].values[0]
        # Create a list of jobs with the required format
        jobs = [
            {
                "model": "gpt-3.5-turbo",
                "temperature": 0,
                "messages": [
                    {"role":"system","content":"ID: " + str(id)},
                    {"role": "user", "content": User_Prompt_1},
                    {"role": "assistant", "content": AI_Prompt_1},
                    {"role": "user", "content": f"REVIEW: ```{review}```"},
                ],
            }
        ]

        # Write the jobs to the file
        for job in jobs:
            json_string = json.dumps(job)
            f.write(json_string + "\n")


## Process review data with GPT
    - process tasks


##### Run the model in async / paralel. Take extra care to provide correct filepaths. Output file will append if not changed.

https://github.com/openai/openai-cookbook/blob/297c53430cad2d05ba763ab9dca64309cb5091e9/examples/api_request_parallel_processor.py


python api_request_parallel_processor.py --requests_filepath '/Users/vladbordei/Documents/Development/oaie2/data/requests_to_parallel_process.jsonl' --save_filepath '/Users/vladbordei/Documents/Development/oaie2/data/paralel/results.jsonl' --request_url "https://api.openai.com/v1/chat/completions" --max_requests_per_minute 100

In [59]:
!python api_request_parallel_processor.py --requests_filepath '/Users/vladbordei/Documents/Development/oaie2/data/requests_to_parallel_process.jsonl' --save_filepath '/Users/vladbordei/Documents/Development/oaie2/data/paralel/results.jsonl' --request_url "https://api.openai.com/v1/chat/completions" --max_requests_per_minute 100 --api_key=s...oq06c

DEBUG: request_json = {'model': 'gpt-3.5-turbo', 'temperature': 0, 'messages': [{'role': 'system', 'content': 'ID: 0'}, {'role': 'user', 'content': '\nFormat your response as a JSON object with: {"Review Summary [30 words]","Observed Technical Facts""Improvements expected / what users request","Complaints and issues users have experienced",\n"How the product is used","Where the product is used","Sentiment", "Anger","Anger Reason","Delight","Delight Reason","Disappointment","Disappointment Reason","Time","Season","Weather","User Known","User Description"}\n\nProduct REVIEW from an ecommerce site delimited with triple backticks. ``` If information isn\'t present, use "unknown" value. Review Summary : summary to give feedback to the product development department, responsible for improving the product. 30 words max. Anger, Delight, Disappointment, User Known values as "true" of "false".Issues identified: Complaints and issues users have experienced.Improvements Expected: What users reques

## Process Batch model results

#### Process resulting dictionary
    - map results

In [89]:
import json

filename = '/Users/vladbordei/Documents/Development/oaie2/data/paralel/results.jsonl'  # replace with the name of your file

responses_dict = {}

with open(filename, 'r') as f:
    for line in f:
        data = json.loads(line)
        response = data[1]['choices'][0]['message']['content']
        id = data[0]['messages'][0]['content']
        id_number = int(id.split(': ')[1])
        responses_dict[id_number] = response

print(responses_dict)



In [90]:
reviews_df['response'] = reviews_df['id'].map(responses_dict)

In [91]:
reviews_df.to_csv('data/reviews_df_interim.csv', index=False)

#### Process resulting dictionary
    - explode results to dataframe format

In [92]:
test_df = reviews_df['response'].explode()

In [94]:
# Initialize an empty list to store the dataframes
extracted_data = []

for idx, row in reviews_df.iterrows():
    response_json_str = row['response']

    if response_json_str and isinstance(response_json_str, str):
        try:
            # Convert the JSON string to a dictionary
            parsed_dict = json.loads(response_json_str)
        except json.JSONDecodeError:
            parsed_dict = {}
    else:
        parsed_dict = {}
    
    # Extract the relevant data, setting default values for missing keys
    review_summary = parsed_dict.get('Review Summary', 'NaN')
    observed_technical_facts = parsed_dict.get('Observed Technical Facts', 'NaN')
    improvements_expected = parsed_dict.get('Improvements expected / what users request', 'NaN')
    issues_identified = parsed_dict.get('Complaints and issues users have experienced', 'NaN')
    how_product_is_used = parsed_dict.get('How the product is used', 'NaN')
    where_product_is_used = parsed_dict.get('Where the product is used', 'NaN')
    sentiment = parsed_dict.get('Sentiment', 'NaN')
    anger = parsed_dict.get('Anger', 'NaN')
    anger_reason = parsed_dict.get('Anger Reason', 'NaN')
    delight = parsed_dict.get('Delight', 'NaN')
    delight_reason = parsed_dict.get('Delight Reason', 'NaN')
    disappointment = parsed_dict.get('Disappointment', 'NaN')
    disappointment_reason = parsed_dict.get('Disappointment Reason', 'NaN')
    time = parsed_dict.get('Time', 'NaN')
    season = parsed_dict.get('Season', 'NaN')
    weather = parsed_dict.get('Weather', 'NaN')
    user_description = parsed_dict.get('User Description', 'NaN')
    join_id = row['id']


    new_keys = [
            review_summary,
            observed_technical_facts,
            improvements_expected,
            issues_identified,
            how_product_is_used,
            where_product_is_used,
            sentiment,
            anger,
            anger_reason,
            delight,
            delight_reason,
            disappointment,
            disappointment_reason,
            time,
            season,
            weather,
            user_description,
            join_id
            ]

    extracted_data.append(new_keys)

    new_columns = [
            'review_summary',
            'observed_technical_facts',
            'improvements_expected',
            'issues_identified',
            'how_product_is_used',
            'where_product_is_used',
            'sentiment',
            'anger',
            'anger_reason',
            'delight',
            'delight_reason',
            'disappointment',
            'disappointment_reason', 
            'time',
            'season',
            'weather',
            'user_description',
            'join_id'
        ]


    # Create a DataFrame with the extracted data
    new_df = pd.DataFrame(
        extracted_data,
        columns=new_columns
    )


In [97]:
merged_df = reviews_df.merge(new_df, how='inner', left_on='id', right_on='join_id')

In [None]:
merged_df.drop(columns=['join_id','asin'], inplace=True)

#### Process resulting dictionary
    - save data

In [None]:
merged_df.to_csv('reviews_df_final.csv', index=False)

In [101]:
reviews_db = merged_df[[
       'rating', 'review_summary','observed_technical_facts',
       'positive_sentiment', 'negative_sentiment',
       'improvements_expected', 'issues_identified', 'how_product_is_used', 'media',
       'where_product_is_used', 'sentiment', 'anger', 'anger_reason',
       'delight', 'delight_reason', 'disappointment', 'disappointment_reason',
       'time', 'season', 'weather',
       'user_description','title',
       'review', 'asin.variant','id', 'asin.original'
       ]].copy()

reviews_db.to_csv('reviews_db.csv', index = False)