In [1]:
import pandas as pd
from groq import Groq
import json
import os
import re
from langchain_core.prompts import PromptTemplate
import random
import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

## load evv variables
from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
CHAT_MODEL = os.environ["CHAT_MODEL"]
client = Groq()

In [64]:
df = pd.read_csv("../data/new_data/AI_feedback.csv", encoding='utf-8')
df.head()

Unnamed: 0,Comments,Labels
0,The article is poorly researched and full of i...,Negative_Complaint
1,You didn’t cover the key aspects of the topic.,Constructive_Criticism
2,The writing is too convoluted and hard to follow.,Constructive_Criticism
3,There are too many grammatical errors in the c...,Constructive_Criticism
4,Your articles are always biased.,Negative_Complaint


In [66]:
list(df['Labels'].unique())
# Add Content Suggestion
# 

['Negative_Complaint',
 'Constructive_Criticism',
 'Design_Feedback',
 'Positive Comment',
 'Neutral']

In [87]:
comments = list(df['Comments'])

In [83]:
GENERATE_LABELS_PROMPT = '''
You are an intelligent system tasked with classifying seller feedback for an e-commerce platform. Each feedback item should be categorised into one or more appropriate labels from the following list:
['Negative_Complaint','Constructive_Criticism','Design_Feedback','Positive Comment','Neutral']
Now classify the following feedback:
Feedback: {text}

Example Output format:
{{
    "feedback": "This platform is very confusing to use. The design needs improvement.",
    "label": "Negative_Complaint"
}}
''' 

In [71]:
def extract_answer(input_string):
    # Find the start and end indices of the JSON data within the input string
    # Assuming the JSON data starts with '{' and ends with '}'
    json_start = input_string.find("{")
    json_end = input_string.rfind("}") + 1

    # If either the start or end index is not found, raise an error
    if json_start == -1 or json_end == -1:
        raise ValueError("Invalid input: No JSON data found.")

    # Extract the substring that potentially contains the JSON data
    json_data = input_string[json_start:json_end]

    try:
        # Attempt to convert the JSON string to a Python dictionary
        data_dict = json.loads(json_data)
        return data_dict

    except json.JSONDecodeError:
        # If JSON decoding fails, search for a JSON object containing the 'questions' key
        # Using regex to match a pattern that includes the 'questions' key
        pattern = r'\{\s*"feedback":\s*".*?",\s*"label":\s*".*?"\s*\}'
        match = re.search(pattern, input_string, re.DOTALL)

        if match:
            # If a match is found, extract the matched JSON string and convert it to a dictionary
            data_json_str = match.group(0)
            data_dict = json.loads(data_json_str)
            return data_dict

        # If no valid JSON is found, the function will Log an error
        else:
            logging.error(
                "No dictionary with 'questions' as a key found in this input string. Error by LLM"
            )
            return {"error": "No dictionary with questions found"}


def generate_labels(feedback, label_prompt, client):
    prompt = PromptTemplate(
        template=label_prompt,
        input_variables=["text"],
    )

    final_prompt = prompt.format(text=feedback)

    # Generate the completion by interacting with the language model API
    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
                    {
                        "role": "user", 
                        "content": final_prompt
                    }
                    ],
        temperature=0,  # Control the randomness of the output (lower means less random)
        max_tokens=1024,  # Limit the response length
        top_p=1,  # Nucleus sampling parameter (1 means only the most likely tokens are considered)
        stream=True,  # Enable streaming of the response chunks
        stop=None,  # Define stopping conditions (None means no stopping condition)
    )

    # Initialize an empty string to accumulate the response content
    answer = """"""
    for chunk in completion:
        # Append each chunk of content to the answer string
        answer += chunk.choices[0].delta.content or ""
        
    label_dict = extract_answer(answer)

    # Return the dictionary containing the generated questions
    return label_dict

In [92]:
random_number = random.randrange(0,165)
gen_label_dict = generate_labels(comments[random_number], GENERATE_LABELS_PROMPT, client)
print("-"* 80)
print(f"FEEDBACK: {gen_label_dict['feedback']}")
print("-"* 80)
print(f"AI Generated label: {gen_label_dict['label']}")


2025-01-20 12:06:46,364 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


--------------------------------------------------------------------------------
FEEDBACK: Consider adding a loyalty programme to reward regular buyers.
--------------------------------------------------------------------------------
AI Generated label: Constructive_Criticism


## Regions:
- SG (can)
- MY (can)
- TH
- TW
- ID
- PH
- VN
- BR

In [2]:
region = "SG"
region_path = f"../data/official_data/feedback_{region}.xls"
columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2"]
df = pd.read_excel(region_path, usecols=columns_to_read)
df_filtered = df[(df['Feedback 1'].notna()) & (df['Feedback 2'] != '{"description":""}')]
df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(lambda x: json.loads(x)['description'])
df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Feedback 2'] = df_filtered['Feedback 2'].apply(lambda x: json.loads(x)['description'])


Unnamed: 0,Feedback id,Feedback 1,Feedback 2
31,3577258,I didn't find any useful information.|The less...,try to do discount. yet totally different. can...
36,3569525,I didn't find any useful information.,The team should think along the purchase amoun...
64,3538575,I didn't find any useful information.,useless junk
79,3532189,I didn't find any useful information.,System shd send the alert to respective logist...
82,3531102,I didn't find any useful information.,"You did not solve my query, this is just a sma..."


In [None]:
feedback_1 = list(df_filtered['Feedback 1'].values)
# See if can do any god analysis on this? if needed

In [None]:
feedback_id = list(df_filtered['Feedback id'].values)
feedback_2 = list(df_filtered['Feedback 2'].values)

In [7]:
feedback_2

['try to do discount. yet totally different. cancel all discount. ',
 'The team should think along the purchase amount & not by product category for benefit to everyone, be it Shopee, Seller & customer. Thabk you ',
 'useless junk',
 'System shd send the alert to respective logistic team lead instead of let sellers fill up the form as  often fill up few times also no action fr team!',
 'You did not solve my query, this is just a small envelope but I need to pay a postage fee of $0.52, this was not mentioned when I purchased the vouche',
 'article says "Earnings from your sales will be credited to your Seller Balance within 2 to 7 working days  " but now agent says upto 25 days',
 'Did not state need to wait how long before a promotion can be set after price increase.',
 'It says that this only applies for selected sellers, what about others? Thanks.',
 "Hi We don't have a local base in Singapore. Would you please provide the information about how to sell on Shopee? Our brand: vivantwin