In [1]:
import pandas as pd
from groq import Groq
import json
import os
import time
import re
from tqdm import trange
from langchain_core.prompts import PromptTemplate
import logging
from math import ceil
from typing import List, Dict, Tuple, Any

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

## load evv variables
from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
CHAT_MODEL   = os.environ["CHAT_MODEL"]
client       = Groq()

GENERATE_EN_LABELS_PROMPT = '''
You are an linguistics professor tasked with classifying seller feedback for an e-commerce platform. 
Each feedback item should be categorised into one or more appropriate labels from the following list:
['Negative Complaint','Constructive Criticism','Design Feedback','Positive Comment','Neutral']
You are not to write any code, but just use your knowledge to classify the feedback.
Your output should be the feedback IDs and their corresponding label.

Now classify the following feedback:
Feedbacks: {pairs}

Example Output format:
[{{"feedback_id": 123456, "label": ["Negative Complaint"]}}, {{"feedback_id": 423456, "label": ["Constructive Criticism","Design Feedback"]}}, {{"feedback_id": 654321, "label": ["Negative Complaint"]}}]

Double check and ensure that your format output matches the example output format provided.
''' 

In [4]:
def load_region_data(region: str) -> pd.DataFrame:
    # Define the file path based on the region
    region_path = f"../data/official_data/feedback_{region}.xls"

    # Specify columns to read
    columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2"]

    # Load the data into a DataFrame
    df = pd.read_excel(region_path, usecols=columns_to_read)

    # Filter out rows with missing or invalid data
    df_filtered = df[
        (df['Feedback 1'].notna()) &
        (df['Feedback 2'].notna()) &
        (df['Feedback 2'] != '{"description":""}')
    ]

    # Extract the 'description' field from JSON in 'Feedback 2'
    df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(
        lambda x: json.loads(x)['description'] if isinstance(x, str) else None
    )

    # Convert 'Feedback id' to numeric and drop rows with invalid IDs
    df_filtered['Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Feedback id'])

    return df_filtered


def format_llm_input(df: pd.DataFrame) -> Tuple[List[Dict[str, str]], Dict[int, str]]:
    # Extract feedback IDs and feedback text
    feedback_ids = list(df['Feedback id'])
    feedback_texts = list(df['Feedback 2'])

    # Create a dictionary mapping feedback IDs to feedback text
    id_feedback = {int(feedback_id): feedback for feedback_id, feedback in zip(feedback_ids, feedback_texts)}

    # Prepare the LLM input as a list of dictionaries
    llm_input = [{'id': feedback_id, 'feedback': feedback} for feedback_id, feedback in id_feedback.items()]

    return llm_input, id_feedback


def get_id_labels(llm_response: str, pattern: str = r'\[\s*\{(?:.|\n)*\}\s*\]') -> List[Dict[str, str]]:
    if not isinstance(llm_response, str):
        raise TypeError("The LLM response must be a string.")

    try:
        # Find the match
        match = re.search(pattern, llm_response, re.DOTALL)
        if not match:
            print(f"THIS RESPONSE WAS PRODUCED AND WAS UNABLE TO BE PICKED UP:\n{llm_response}")
            raise ValueError("No valid JSON list found in the response.")

        json_string = match.group(0)
        result = json.loads(json_string)

        # Validate the structure of the result
        if not isinstance(result, list) or not all(isinstance(item, dict) for item in result):
            raise ValueError("Extracted JSON is not a list of dictionaries.")
        
        return result

    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to decode JSON: {e}")
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred: {e}")


def generate_batch_labels(id_feedback_pairs, label_prompt: str, client):
    prompt = PromptTemplate(
        template=label_prompt,
        input_variables=["pairs"],
    )

    final_prompt = prompt.format(pairs=id_feedback_pairs)

    # Generate the completion by interacting with the language model API
    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
                    {
                        "role": "user", 
                        "content": final_prompt
                    }
                    ],
        temperature=0,  # Control the randomness of the output (lower means less random)
        max_tokens=1024,  # Limit the response length
        top_p=1,  # Nucleus sampling parameter (1 means only the most likely tokens are considered)
        stream=True,  # Enable streaming of the response chunks
        stop=None,  # Define stopping conditions (None means no stopping condition)
    )

    # Initialize an empty string to accumulate the response content
    response = """"""
    for chunk in completion:
        # Append each chunk of content to the response string
        response += chunk.choices[0].delta.content or ""
        
    pairings = get_id_labels(response)
    
    return pairings


def generate_labels(llm_input, num_per_batch, output_file=f'../data/llm_responses/llm_responses.json'):
    # Determine the number of batches
    num_batches = ceil(len(llm_input) / num_per_batch)

    # Initialise indices for batch processing
    start_index = 0

    just_in_case_stop_index = 0
    
    try:
        for i in trange(num_batches):
            # Calculate the batch indices
            end_index = start_index + num_per_batch
            batch_pairs = llm_input[start_index:end_index]
            try:
                
                # Call the function to generate labels for the current batch
                batch_labels = generate_batch_labels(batch_pairs, GENERATE_EN_LABELS_PROMPT, client)
            except ValueError:
                    intermediate_end = start_index+5
                    batch_pairs = llm_input[start_index:intermediate_end]
                    batch_labels = generate_batch_labels(batch_pairs, GENERATE_EN_LABELS_PROMPT, client)   
                    
                    # Write the current batch to the JSON file
                    with open(output_file, 'a') as json_file:
                        # Convert the batch to a JSON string and write it
                        for label in batch_labels:
                            json_file.write(json.dumps(label) + '\n')
                    
                    intermediate_start = intermediate_end
                    batch_pairs = llm_input[intermediate_start:end_index]
                    batch_labels = generate_batch_labels(batch_pairs, GENERATE_EN_LABELS_PROMPT, client)   
                    
                    # Write the current batch to the JSON file
                    with open(output_file, 'a') as json_file:
                        # Convert the batch to a JSON string and write it
                        for label in batch_labels:
                            json_file.write(json.dumps(label) + '\n')
                    
                    # Sleep for 60 seconds every 10 iterations
                    if (i + 1) % 5 == 0:
                        print(f"Completed {i + 1} iterations. To prevent rate limits, sleeping for 60 seconds...")
                        time.sleep(60)
                        
                    continue
                
            # Update the start index for the next batch
            start_index = end_index

            # Write the current batch to the JSON file
            with open(output_file, 'a') as json_file:
                # Convert the batch to a JSON string and write it
                for label in batch_labels:
                    json_file.write(json.dumps(label) + '\n')

            # Sleep for 60 seconds every 10 iterations
            if (i + 1) % 5 == 0:
                print(f"Completed {i + 1} iterations. To prevent rate limits, sleeping for 60 seconds...")
                time.sleep(60)
                
            just_in_case_stop_index = end_index
            # Include  extra rest (not sure why but just in case lol)
            time.sleep(2)

        print(f"All batches written to {output_file}")

    except Exception as e:
        
        print(f"An error occurred while processing: {e}")
        print(f"Stopped at batch {just_in_case_stop_index}\n")


def read_json_file(file_path: str):
    try:
        data = []
        with open(file_path, 'r') as json_file:
            for line in json_file:
                data.append(json.loads(line.strip()))
        return data
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        
        
def pair_id_feedback(id_feedback: dict, feedback_labels: list):
    for i in range(len(feedback_labels)):
        id = feedback_labels[i]['feedback_id']
        comment = id_feedback[id]
        feedback_labels[i]['Comment'] = comment
        
    return feedback_labels


def write_to_csv(region: str, combined):
    # Convert to a DataFrame
    combined_df = pd.DataFrame(combined)

    # Rename columns to match the required format
    combined_df.rename(columns={'feedback_id': 'Feedback id',
                                'label': 'Label',
                                'Comment': 'Comment'},
                        inplace=True)
    combined_df['Comment'] = combined_df['Comment'].str.replace('""', '"', regex=False).str.strip('"')


    # Save to CSV
    csv_filename = f'../data/{region}_labelled_feedback_data.csv'
    combined_df.to_csv(csv_filename, index=False)
    print(f"{region} Labels wrote to csv")

def main():
    region = "SG"
    df = load_region_data(region)
    llm_input, id_feedback = format_llm_input(df)
    generate_labels(llm_input, num_per_batch=10)
    feedback_labels = read_json_file(file_path=f'../data/llm_responses/llm_responses.json')
    combined = pair_id_feedback(id_feedback, feedback_labels)
    write_to_csv(region, combined)

In [5]:
main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
  0%|          | 0/35 [00:00<?, ?it/s]2025-01-28 01:01:34,550 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
  3%|▎         | 1/35 [00:01<00:40,  1.20s/it]2025-01-28 01:01:35,788 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 

Completed 5 iterations. To prevent rate limits, sleeping for 60 seconds...


 14%|█▍        | 5/35 [01:06<11:25, 22.86s/it]2025-01-28 01:02:40,704 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 17%|█▋        | 6/35 [01:07<07:30, 15.54s/it]2025-01-28 01:02:41,952 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 20%|██        | 7/35 [01:08<05:02, 10.80s/it]2025-01-28 01:02:43,018 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 23%|██▎       | 8/35 [01:09<03:27,  7.69s/it]2025-01-28 01:02:44,088 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 26%|██▌       | 9/35 [01:10<02:26,  5.64s/it]2025-01-28 01:02:45,209 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 10 iterations. To prevent rate limits, sleeping for 60 seconds...


 29%|██▊       | 10/35 [02:11<09:28, 22.75s/it]2025-01-28 01:03:57,201 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 31%|███▏      | 11/35 [02:47<10:39, 26.67s/it]2025-01-28 01:04:21,878 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 34%|███▍      | 12/35 [02:48<07:14, 18.88s/it]2025-01-28 01:04:22,840 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 37%|███▋      | 13/35 [02:49<04:59, 13.59s/it]2025-01-28 01:04:24,332 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 40%|████      | 14/35 [02:50<03:26,  9.83s/it]2025-01-28 01:04:25,461 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 15 iterations. To prevent rate limits, sleeping for 60 seconds...


 43%|████▎     | 15/35 [03:51<08:25, 25.28s/it]2025-01-28 01:05:26,500 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 46%|████▌     | 16/35 [03:53<05:42, 18.02s/it]2025-01-28 01:05:29,337 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 49%|████▊     | 17/35 [04:19<06:09, 20.54s/it]2025-01-28 01:06:15,250 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 51%|█████▏    | 18/35 [04:59<07:27, 26.34s/it]2025-01-28 01:06:33,885 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 54%|█████▍    | 19/35 [05:00<05:00, 18.80s/it]2025-01-28 01:06:35,105 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 20 iterations. To prevent rate limits, sleeping for 60 seconds...


 57%|█████▋    | 20/35 [06:01<07:52, 31.49s/it]2025-01-28 01:07:36,204 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 60%|██████    | 21/35 [06:02<05:13, 22.40s/it]2025-01-28 01:07:37,463 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 63%|██████▎   | 22/35 [06:03<03:27, 15.98s/it]2025-01-28 01:07:38,386 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 66%|██████▌   | 23/35 [06:04<02:17, 11.49s/it]2025-01-28 01:07:39,431 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 69%|██████▊   | 24/35 [06:06<01:33,  8.46s/it]2025-01-28 01:07:40,837 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 25 iterations. To prevent rate limits, sleeping for 60 seconds...


 71%|███████▏  | 25/35 [07:07<04:02, 24.28s/it]2025-01-28 01:08:42,560 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 74%|███████▍  | 26/35 [07:09<02:37, 17.52s/it]2025-01-28 01:08:43,932 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 77%|███████▋  | 27/35 [07:10<01:41, 12.64s/it]2025-01-28 01:08:45,003 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 80%|████████  | 28/35 [07:33<01:51, 15.90s/it]2025-01-28 01:09:08,500 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 83%|████████▎ | 29/35 [07:35<01:08, 11.44s/it]2025-01-28 01:09:09,553 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 30 iterations. To prevent rate limits, sleeping for 60 seconds...


 86%|████████▌ | 30/35 [08:36<02:11, 26.33s/it]2025-01-28 01:10:10,752 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 89%|████████▊ | 31/35 [08:37<01:15, 18.77s/it]2025-01-28 01:10:11,802 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 91%|█████████▏| 32/35 [08:38<00:40, 13.57s/it]2025-01-28 01:10:13,214 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 94%|█████████▍| 33/35 [08:39<00:19,  9.82s/it]2025-01-28 01:10:14,272 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 97%|█████████▋| 34/35 [08:40<00:07,  7.19s/it]2025-01-28 01:10:15,339 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Completed 35 iterations. To prevent rate limits, sleeping for 60 seconds...


100%|██████████| 35/35 [09:41<00:00, 16.63s/it]

All batches written to ../data/llm_responses/llm_responses.json
SG Labels wrote to csv





In [None]:
# calculating rate limit per day (500,000 tokens)
