In [None]:
import pandas as pd
from groq import Groq
import json
import os
import re
from langchain_core.prompts import PromptTemplate
import logging
from math import ceil

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

## load evv variables
from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
CHAT_MODEL = os.environ["CHAT_MODEL"]
client = Groq()

In [96]:
def load_region_data(region):
    region_path = f"../data/official_data/feedback_{region}.xls"

    columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2"]
    df = pd.read_excel(region_path, usecols=columns_to_read)

    df_filtered = df[
        (df['Feedback 1'].notna()) &
        (df['Feedback 2'].notna()) &
        (df['Feedback 2'] != '{"description":""}')
    ]

    df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(lambda x: json.loads(x)['description'])

    # Handle problematic 'Feedback id' values
    df_filtered['Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Feedback id'])
    return df_filtered


def format_llm_input(df):
    feedback_id = list(df['Feedback id'])
    feedback_2 = list(df['Feedback 2'].values)
    id_feedback = {}
    for i in range(len(feedback_id)):
        id_feedback[feedback_id[i]]= feedback_2[i]
    input = []
    for id, feedback in id_feedback.items():
        input.append({id, feedback})
        
    return input, id_feedback


In [97]:
GENERATE_LABELS_PROMPT = '''
You are an linguistics professor tasked with classifying seller feedback for an e-commerce platform. 
Each feedback item should be categorised into one or more appropriate labels from the following list:
['Negative_Complaint','Constructive_Criticism','Design_Feedback','Positive Comment','Neutral']
You are not to write any code, but just use your knowledge to classify the feedback.
Your output should be the feedback IDs and their corresponding label.

Now classify the following feedback:
Feedbacks: {pairs}

Example Output format:
[{{"feedback_id": 123456, "label": "Negative_Complaint"}}, {{"feedback_id": 423456,"label": "Constructive_Criticism"}}, {{"feedback_id": 654321,"label": "Negative_Complaint"}}]

''' 

In [106]:
def get_id_labels(llm_response, pattern=r'\[\{.*?\}\]'):
    if not isinstance(llm_response, str):
        raise TypeError("The LLM response must be a string.")

    try:
        # Find the match
        match = re.search(pattern, llm_response, re.DOTALL)
        if not match:
            raise ValueError("No valid JSON list found in the response.")

        json_string = match.group(0)
        result = json.loads(json_string)

        # Validate the structure of the result
        if not isinstance(result, list) or not all(isinstance(item, dict) for item in result):
            raise ValueError("Extracted JSON is not a list of dictionaries.")

        return result

    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to decode JSON: {e}")
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred: {e}")

def generate_batch_labels(id_feedback_pairs, label_prompt, client):
    prompt = PromptTemplate(
        template=label_prompt,
        input_variables=["pairs"],
    )

    final_prompt = prompt.format(pairs=id_feedback_pairs)

    # Generate the completion by interacting with the language model API
    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
                    {
                        "role": "user", 
                        "content": final_prompt
                    }
                    ],
        temperature=0,  # Control the randomness of the output (lower means less random)
        max_tokens=1024,  # Limit the response length
        top_p=1,  # Nucleus sampling parameter (1 means only the most likely tokens are considered)
        stream=True,  # Enable streaming of the response chunks
        stop=None,  # Define stopping conditions (None means no stopping condition)
    )

    # Initialize an empty string to accumulate the response content
    response = """"""
    for chunk in completion:
        # Append each chunk of content to the response string
        response += chunk.choices[0].delta.content or ""
        
    pairings = get_id_labels(response)

    return pairings

def generate_labels(llm_input, num_per_batch, output_file=f'../data/llm_responses.json'):
    # Determine the number of batches
    num_batches = ceil(len(llm_input) / num_per_batch)

    # Initialise indices for batch processing
    start_index = 0

    # Open the JSON file in append mode
    try:
        for i in range(num_batches):
            # Calculate the batch indices
            end_index = start_index + num_per_batch
            batch_pairs = llm_input[start_index:end_index]

            # Call the function to generate labels for the current batch
            batch_labels = generate_batch_labels(batch_pairs, GENERATE_LABELS_PROMPT, client)
            
            # Update the start index for the next batch
            start_index = end_index

            # Write the current batch to the JSON file
            with open(output_file, 'a') as json_file:
                # Convert the batch to a JSON string and write it
                for label in batch_labels:
                    json_file.write(json.dumps(label) + '\n')
            
            if i >=2:
                break

        print(f"All batches written to {output_file}")

    except Exception as e:
        print(f"An error occurred while processing: {e}")

def read_json_file(file_path):
    """
    Reads a JSON file where each line is a separate JSON object.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        list: A list of all JSON objects read from the file.
    """
    try:
        data = []
        with open(file_path, 'r') as json_file:
            for line in json_file:
                data.append(json.loads(line.strip()))
        return data
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        
def pair_id_feedback(id_feedback, feedback_labels):
    for i in range(len(feedback_labels)):
        id = feedback_labels[i]['feedback_id']
        comment = id_feedback[id]
        feedback_labels[i]['Comment'] = comment
        
    return feedback_labels


def write_to_csv(region, combined):
    # Convert to a DataFrame
    combined_df = pd.DataFrame(combined)

    # Rename columns to match the required format
    combined_df.rename(columns={'feedback_id': 'Feedback id',
                       'label': 'Label',
                       'Comment': 'Comment'},
             inplace=True)

    # Save to CSV
    csv_filename = f'../data/{region}_labelled_feedback_data.csv'
    combined_df.to_csv(csv_filename, index=False)
    print(f"{region} Labels wrote to csv")



In [102]:
region = "SG"
df = load_region_data(region)
llm_input, id_feedback = format_llm_input(df)
generate_labels(llm_input, num_per_batch=5)
feedback_labels = read_json_file(file_path=f'../data/llm_responses.json')
combined = pair_id_feedback(id_feedback, feedback_labels)
write_to_csv(region, combined)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(lambda x: json.loads(x)['description'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
2025-01-27 16:08:14,724 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-27 16:08:15,637 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-01-27 16:08:17,038 - INFO - HTTP Request:

All batches written to ../data/llm_responses.json
