In [None]:

import pandas as pd
import json
from tqdm import tqdm
import os
import time
from groq import Groq

# Shared Groq API client; the key is read from the GROQ_API_KEY environment
# variable so that it never has to be written into the notebook.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))



In [None]:
# Load the full review table from disk.
df = pd.read_csv("yelp.csv")

# Draw a fixed-seed sample of 200 rows and keep only the two columns used
# downstream: the review text and its star rating.
df_sample = df.sample(n=200, random_state=42).loc[:, ["text", "stars"]]
df_sample.head()


Unnamed: 0,text,stars
6252,We got here around midnight last Friday... the...,4
4684,Brought a friend from Louisiana here. She say...,5
1731,"Every friday, my dad and I eat here. We order ...",3
4742,"My husband and I were really, really disappoin...",1
4521,Love this place! Was in phoenix 3 weeks for w...,5


In [None]:

def ask_model(prompt):
    """Send `prompt` as a single user message and return the reply text.

    Parameters
    ----------
    prompt : str
        Full prompt text; it is sent as the only message, with role "user".

    Returns
    -------
    str
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no content.
    """
    response = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=200
    )
    # The message content can be None (the SDK types it as optional); calling
    # .strip() on it directly would raise AttributeError. Returning "" lets the
    # caller treat the reply as unparseable output instead of crashing.
    content = response.choices[0].message.content
    return (content or "").strip()


In [4]:
def prompt_zero_shot(review):
    """Ask the model to rate `review` without giving it any worked examples.

    The review is embedded, wrapped in triple double-quotes, into a fixed
    instruction that requests a JSON object with "predicted_stars" and
    "explanation". Returns whatever `ask_model` returns for that prompt.
    """
    # Doubled braces are literal braces once .format() has run.
    template = '''
You are an AI that classifies Yelp reviews into a star rating from 1–5.

Return ONLY valid JSON in this format:
{{
  "predicted_stars": <1-5>,
  "explanation": "<brief reasoning>"
}}

Review:
"""{review}"""
'''
    return ask_model(template.format(review=review))


In [5]:
def prompt_few_shot(review):
    """Ask the model to rate `review` after showing it two labelled examples.

    The prompt contains one 5-star and one 1-star example with their JSON
    outputs, followed by the new review wrapped in triple double-quotes.
    Returns whatever `ask_model` returns for that prompt.
    """
    # Doubled braces are literal braces once .format() has run.
    template = '''
You are an expert review classifier. Here are examples:

Review: "Amazing food, friendly staff."
Output: {{ "predicted_stars": 5, "explanation": "Very positive sentiment." }}

Review: "Terrible service, food was cold."
Output: {{ "predicted_stars": 1, "explanation": "Strong negative sentiment." }}

Now classify the new review. Return ONLY JSON.

Review:
"""{review}"""
'''
    return ask_model(template.format(review=review))


In [6]:
def prompt_cot(review):
    """Ask the model to rate `review` after reasoning through it internally.

    The prompt lists the aspects to consider (sentiment, emotion intensity,
    polarity cues), tells the model not to show that reasoning, and requests a
    JSON object with "predicted_stars" and "explanation". Returns whatever
    `ask_model` returns for that prompt.
    """
    # Doubled braces are literal braces once .format() has run.
    template = '''
Analyze the Yelp review step-by-step internally:
- Sentiment
- Emotion intensity
- Polarity cues (food, service, ambiance)

Do NOT show reasoning.

Finally output ONLY valid JSON:
{{
  "predicted_stars": <1-5>,
  "explanation": "<short summary>"
}}

Review:
"""{review}"""
'''
    return ask_model(template.format(review=review))


In [None]:
def evaluate(prompt_fn, data, delay=3):
    """Run `prompt_fn` on every review in `data` and score its predictions.

    Parameters
    ----------
    prompt_fn : callable
        Called with one review text; returns the model's raw reply.
    data : pandas.DataFrame
        Must provide a "text" column (the review) and a "stars" column (the
        true rating).
    delay : float, optional
        Seconds to pause after each request. Defaults to 3; pass 0 to disable.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns "review", "raw_output",
        "predicted", "actual", "json_valid" and "accuracy" (1 when the
        predicted value equals the true rating, else 0).
    """
    records = []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        review = row["text"]
        actual = row["stars"]

        raw = prompt_fn(review)

        # Only genuine parse failures mark the reply as invalid JSON. A bare
        # `except:` would also swallow KeyboardInterrupt, making a long run
        # impossible to stop cleanly. TypeError covers a non-string reply.
        try:
            parsed = json.loads(raw)
            json_valid = True
        except (json.JSONDecodeError, TypeError):
            parsed = None
            json_valid = False

        # Valid JSON is not necessarily an object (e.g. `4` or `[4]`); only a
        # dict can carry the "predicted_stars" key.
        predicted = parsed.get("predicted_stars") if isinstance(parsed, dict) else None
        accuracy = int(predicted == actual)

        records.append({
            "review": review,
            "raw_output": raw,
            "predicted": predicted,
            "actual": actual,
            "json_valid": json_valid,
            "accuracy": accuracy
        })

        # Pause between requests to space out calls to the API.
        if delay > 0:
            time.sleep(delay)

    return pd.DataFrame(records)


In [None]:
# Score each prompting strategy on the same sample. The runs are kept as
# separate statements so that results already obtained stay assigned if a
# later run fails part-way.
df_zero = evaluate(prompt_fn=prompt_zero_shot, data=df_sample)
df_few = evaluate(prompt_fn=prompt_few_shot, data=df_sample)
df_cot = evaluate(prompt_fn=prompt_cot, data=df_sample)

100%|██████████| 200/200 [11:55<00:00,  3.58s/it]
100%|██████████| 200/200 [11:39<00:00,  3.50s/it]
 29%|██▉       | 58/200 [03:24<08:21,  3.53s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k328mb51e96s1axc4kk9tvqp` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199960, Requested 197. Please try again in 1m7.824s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

100%|██████████| 200/200 [12:03<00:00,  3.62s/it]


In [19]:
# One summary row per prompting strategy: the mean of the 0/1 "accuracy"
# column is the share of exact star matches, and the mean of the boolean
# "json_valid" column is the share of replies that parsed as JSON.
summary = pd.DataFrame(
    [
        {
            "Prompt": label,
            "Accuracy": results["accuracy"].mean(),
            "JSON Validity": results["json_valid"].mean(),
        }
        for label, results in (
            ("Zero-Shot", df_zero),
            ("Few-Shot", df_few),
            ("Chain-of-Thought", df_cot),
        )
    ]
)

summary


Unnamed: 0,Prompt,Accuracy,JSON Validity
0,Zero-Shot,0.64,0.95
1,Few-Shot,0.57,0.895
2,Chain-of-Thought,0.54,0.825


In [None]:
# Persist the comparison table; the positional index is left out of the file.
summary.to_csv(path_or_buf="evaluation_summary.csv", index=False)