In [57]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
import google.generativeai as genai


load_dotenv()
gemini_key = os.getenv("GEMINI_API_KEY")

# Hand the key to the SDK explicitly; otherwise the value loaded above is
# never used and a fresh kernel only works if credentials were configured
# somewhere outside this notebook.
if gemini_key:
    genai.configure(api_key=gemini_key)
else:
    print("Warning: GEMINI_API_KEY is not set; Gemini calls will fail unless the SDK finds credentials elsewhere.")

model = genai.GenerativeModel('gemini-2.5-flash')

df = pd.read_csv('data/yelp.csv')
print(f"Dataset loaded: {len(df)} total reviews")

# Cap the sample size at the dataset size so a smaller CSV does not make
# DataFrame.sample raise; the fixed seed keeps the sample reproducible.
df_sample = df.sample(n=min(200, len(df)), random_state=42)
print(f"Working with {len(df_sample)} sampled reviews")

df_sample.head()


Dataset loaded: 10000 total reviews
Working with 200 sampled reviews


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
6252,QVR7dsvBeg8xFt9B-vd1BA,2010-07-22,hwYVJs8Ko4PMjI19QcR57g,4,We got here around midnight last Friday... the...,review,90a6z--_CUrl84aCzZyPsg,5,5,2
4684,24qSrF_XOrvaHDBy-gLIQg,2012-01-22,0mvthYPKb2ZmKhCADiKSmQ,5,Brought a friend from Louisiana here. She say...,review,9lJAj_2zCvP2jcEiRjF9oA,0,0,0
1731,j0Uc-GuOe-x9_N_IK1KPpA,2009-05-09,XJHknNIecha6h0wkBSZB4w,3,"Every friday, my dad and I eat here. We order ...",review,0VfJi9Au0rVFVnPKcJpt3Q,0,0,0
4742,RBiiGw8c7j-0a8nk35JO3w,2010-12-22,z6y3GRpYDqTznVe-0dn--Q,1,"My husband and I were really, really disappoin...",review,lwppVF0Yqkuwt-xaEuugqw,2,2,2
4521,U8VA-RW6LYOhxR-Ygi6eDw,2011-01-17,vhWHdemMvsqVNv5zi2OMiA,5,Love this place! Was in phoenix 3 weeks for w...,review,Y2R_tlSk4lTHiLXTDsn1rg,0,1,0


In [58]:
def prompt_v1(review_text):
    """Version 1: zero-shot prompt -- a bare instruction plus the required JSON shape.

    Returns the prompt text with ``review_text`` appended after a
    ``Review:`` label.
    """
    instructions = (
        "Rate this restaurant review from 1 to 5 stars.\n"
        "Return valid JSON in this exact format:\n"
        '{"predicted_stars": <number>, "explanation": "<brief reason>"}\n'
    )
    # A blank line separates the instructions from the review itself.
    return f"{instructions}\nReview: {review_text}"

# Use the first sampled review as the running example for all prompt previews.
sample_text = df_sample['text'].iloc[0]
print("PROMPT V1 (Basic):", prompt_v1(sample_text), sep="\n")
print("\n======= Prompt V1 created =======")



PROMPT V1 (Basic):
Rate this restaurant review from 1 to 5 stars.
Return valid JSON in this exact format:
{"predicted_stars": <number>, "explanation": "<brief reason>"}

Review: We got here around midnight last Friday... the place was dead. However, they were still serving food and we enjoyed some well made pub grub. Service was friendly, quality cocktails were served, and the atmosphere is derived from an old Uno's, which certainly works for a sports bar. It being located in a somewhat commercial area, I can see why it's empty so late on a Friday. From what my friends tell me - this is a great spot for happy hour, and it stays relatively busy thru 10pm.

*UPDATE - Great patio for day-drinking on the weekends!



In [59]:
def prompt_v2(review_text):
    """Version 2: few-shot prompt -- three worked examples precede the review.

    The examples cover a 1-star, a 4-star and a 3-star review, each followed
    by the JSON answer expected for it. ``review_text`` is inserted unquoted
    after the "Now rate this review:" line.
    """
    prompt_lines = [
        "You are a review rating classifier. Rate reviews on a 1-5 star scale.",
        "",
        "Examples:",
        'Review: "Absolutely horrible! Rude staff and disgusting food."',
        '{"predicted_stars": 1, "explanation": "Strongly negative experience"}',
        "",
        'Review: "Pretty good experience, would recommend."',
        '{"predicted_stars": 4, "explanation": "Positive feedback with recommendation"}',
        "",
        'Review: "It was okay, nothing special."',
        '{"predicted_stars": 3, "explanation": "Neutral, mixed sentiments"}',
        "",
        "Now rate this review:",
        f"{review_text}",
        "",
        'Return only valid JSON: {"predicted_stars": <1-5>, "explanation": "<reason>"}',
    ]
    return "\n".join(prompt_lines)

# Preview: header line, then only the first 300 characters of the long prompt.
print("PROMPT V2 (Few-Shot):", f"{prompt_v2(sample_text)[:300]}...", sep="\n")
print("\n======= Prompt V2 created =======")


PROMPT V2 (Few-Shot):
You are a review rating classifier. Rate reviews on a 1-5 star scale.

Examples:
Review: "Absolutely horrible! Rude staff and disgusting food."
{"predicted_stars": 1, "explanation": "Strongly negative experience"}

Review: "Pretty good experience, would recommend."
{"predicted_stars": 4, "explanatio...



In [60]:
def prompt_v3(review_text):
    """Version 3: rubric prompt -- one explicit criterion per star level.

    ``review_text`` is inserted wrapped in double quotes, followed by an
    instruction to answer with the JSON object only.
    """
    opening = "Analyze this restaurant review and assign a star rating using these guidelines:"
    rubric = "\n".join((
        "5 stars: Enthusiastic praise, highly recommended, exceptional experience",
        "4 stars: Positive experience with minor flaws",
        "3 stars: Mixed feelings, neutral, or average",
        "2 stars: More complaints than praise, disappointed",
        "1 star: Very negative, terrible experience, not recommended",
    ))
    answer_format = (
        "Respond with ONLY this JSON format (no markdown, no extra text):\n"
        '{"predicted_stars": <1-5>, "explanation": "<one sentence explaining why>"}'
    )
    return (
        f"{opening}\n\n"
        f"Rating Criteria:\n{rubric}\n\n"
        f'Review to analyze:\n"{review_text}"\n\n'
        f"{answer_format}"
    )

# Preview: header line, then only the first 300 characters of the rubric prompt.
print("=== PROMPT V3 (Structured Criteria) ===", f"{prompt_v3(sample_text)[:300]}...", sep="\n")
print("\n======= Prompt V3 created =======")


=== PROMPT V3 (Structured Criteria) ===
Analyze this restaurant review and assign a star rating using these guidelines:

Rating Criteria:
5 stars: Enthusiastic praise, highly recommended, exceptional experience
4 stars: Positive experience with minor flaws
3 stars: Mixed feelings, neutral, or average
2 stars: More complaints than praise, ...



In [61]:
def _coerce_star_rating(value):
    """Return ``value`` as an int in 1..5, or None when it is not a usable rating."""
    if isinstance(value, bool):
        # bool is an int subclass; True must not be read as a 1-star rating.
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if number.is_integer() and 1 <= number <= 5:
        return int(number)
    return None


def get_llm_prediction(prompt):
    """Call Gemini API and parse JSON response.

    Sends ``prompt`` to the module-level ``model`` (temperature 0.3) and
    tries to read a ``{"predicted_stars": ..., "explanation": ...}`` object
    out of the reply text.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        A ``(result, is_valid)`` tuple. ``result`` is always a dict with a
        ``predicted_stars`` key. ``is_valid`` is True only when the reply
        contained a JSON object whose ``predicted_stars`` is a whole number
        from 1 to 5 (numeric strings such as ``"4"`` are accepted and
        converted to int). In every other case ``predicted_stars`` is None
        and ``is_valid`` is False:

        * API or response-access failure: ``explanation`` is ``"Error: <message>"``.
        * Unparseable reply or non-object JSON: ``explanation`` is the raw reply text.
        * JSON object with an unusable rating: the parsed object is returned
          with ``predicted_stars`` replaced by None.

    This function never raises for API or parsing problems; failures are
    reported through the return value so a batch run can continue.
    """
    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.3,
            )
        )
        content = response.text.strip()
    except Exception as e:
        # Best-effort by design: one failed request must not abort a long
        # evaluation loop, so the error is surfaced in the result instead.
        return {"predicted_stars": None, "explanation": f"Error: {str(e)}"}, False

    # Keep only the outermost {...} span. This also discards markdown code
    # fences or prose the model may wrap around the JSON object.
    start = content.find("{")
    end = content.rfind("}")
    payload = content[start:end + 1] if 0 <= start < end else content

    try:
        result = json.loads(payload)
    except (json.JSONDecodeError, RecursionError):
        return {"predicted_stars": None, "explanation": content}, False

    # Valid JSON that is not an object (list, number, string) cannot carry
    # the requested fields; callers rely on getting a dict back.
    if not isinstance(result, dict):
        return {"predicted_stars": None, "explanation": content}, False

    # Normalise the rating so "4" or 4.0 compares equal to the integer label,
    # and reject values outside the 1-5 scale.
    stars = _coerce_star_rating(result.get("predicted_stars"))
    result["predicted_stars"] = stars
    return result, stars is not None

# Smoke test: send one real request for the sample review before the full run.
print("LLM prediction function ready (using Gemini). Testing with one API call...")

test_result, is_valid = get_llm_prediction(prompt_v1(sample_text))
print("Result:", test_result)
print("Valid JSON:", is_valid)


LLM prediction function ready (using Gemini). Testing with one API call...
Result: {'predicted_stars': 4, 'explanation': 'The reviewer had a positive experience with well-made food, friendly service, and quality cocktails. They also highlighted the great patio and happy hour, indicating a good overall impression despite visiting during a slow period.'}
Valid JSON: True


In [None]:
#Evaluation Function
def evaluate_prompt(prompt_func, df_subset, prompt_name):
    """Evaluate a prompt version on a subset of reviews.

    Builds a prompt for every row with ``prompt_func``, sends it through
    ``get_llm_prediction`` and compares the predicted rating with the
    row's ``stars`` value. Progress and a summary are printed.

    Args:
        prompt_func: Callable taking the review text and returning the prompt.
        df_subset: DataFrame with a ``text`` and a ``stars`` column. May be empty.
        prompt_name: Label used in the printed output and the returned dict.

    Returns:
        dict with keys ``prompt_name``, ``accuracy`` (share of exact matches
        among rows that have a numeric prediction; 0.0 when there are none),
        ``json_validity`` (share of all rows flagged valid by
        ``get_llm_prediction``; 0.0 for an empty subset) and ``results_df``
        (one row per review: ``actual_stars``, ``predicted_stars``,
        ``is_valid_json``, ``explanation``).
    """
    total = len(df_subset)
    banner = '=' * 200

    print(f"\n{banner}")
    print(f"Evaluating: {prompt_name}")
    print(banner)

    results = []
    valid_json_count = 0

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for review_text, actual_stars in zip(df_subset['text'], df_subset['stars']):
        prediction, is_valid = get_llm_prediction(prompt_func(review_text))

        # Guard against a non-dict payload so one odd reply cannot crash the run.
        if not isinstance(prediction, dict):
            prediction, is_valid = {}, False

        if is_valid:
            valid_json_count += 1

        results.append({
            'actual_stars': actual_stars,
            'predicted_stars': prediction.get('predicted_stars'),
            'is_valid_json': is_valid,
            'explanation': prediction.get('explanation', '')
        })

        #Progress indicator every 50 reviews
        if len(results) % 50 == 0:
            print(f"Progress: {len(results)}/{total} reviews processed...")

    # Explicit columns keep the frame well-formed even when no rows were processed.
    results_df = pd.DataFrame(
        results,
        columns=['actual_stars', 'predicted_stars', 'is_valid_json', 'explanation']
    )

    # Compare on a numeric view of the predictions so a rating returned as
    # "4" or 4.0 matches the integer label; anything non-numeric becomes NaN
    # and is excluded. results_df itself keeps the raw values.
    predicted_numeric = pd.to_numeric(results_df['predicted_stars'], errors='coerce')
    has_prediction = predicted_numeric.notna()
    valid_predictions = results_df[has_prediction]

    if has_prediction.any():
        matches = results_df.loc[has_prediction, 'actual_stars'] == predicted_numeric[has_prediction]
        accuracy = matches.mean()
    else:
        accuracy = 0.0

    json_validity_rate = valid_json_count / total if total else 0.0

    print(f"\n{prompt_name} Complete!")
    print(f"  • Accuracy: {accuracy:.1%}")
    print(f"  • JSON Validity: {json_validity_rate:.1%}")
    print(f"  • Valid Predictions: {len(valid_predictions)}/{total}")

    return {
        'prompt_name': prompt_name,
        'accuracy': accuracy,
        'json_validity': json_validity_rate,
        'results_df': results_df
    }

print("Evaluation Function Ready")


Evaluation Function Ready


In [63]:

# df_sample was drawn with n=200, so head(200) keeps the whole sample;
# lower the number here for a quicker trial run.
test_subset = df_sample.head(200)

print(f"Testing with {len(test_subset)} reviews...")
print("This will take 3-5 minutes. Please wait...\n")

# Evaluate all three prompts on the same reviews. Each call makes one API
# request per review, and each result dict is kept under its own name so a
# failure in a later run does not lose the earlier results.
eval_v1 = evaluate_prompt(prompt_v1, test_subset, "Prompt V1 - Basic")
eval_v2 = evaluate_prompt(prompt_v2, test_subset, "Prompt V2 - Few-Shot")
eval_v3 = evaluate_prompt(prompt_v3, test_subset, "Prompt V3 - Structured")

print("\n" + "="*200)
print("ALL EVALUATIONS COMPLETE!")
print("="*200)


Testing with 200 reviews...
This will take 3-5 minutes. Please wait...


Evaluating: Prompt V1 - Basic
Progress: 50/200 reviews processed...
Progress: 100/200 reviews processed...
Progress: 150/200 reviews processed...
Progress: 200/200 reviews processed...

✓ Prompt V1 - Basic Complete!
  • Accuracy: 62.5%
  • JSON Validity: 16.0%
  • Valid Predictions: 32/200

Evaluating: Prompt V2 - Few-Shot
Progress: 50/200 reviews processed...
Progress: 100/200 reviews processed...
Progress: 150/200 reviews processed...
Progress: 200/200 reviews processed...

✓ Prompt V2 - Few-Shot Complete!
  • Accuracy: 60.9%
  • JSON Validity: 11.5%
  • Valid Predictions: 23/200

Evaluating: Prompt V3 - Structured
Progress: 50/200 reviews processed...
Progress: 100/200 reviews processed...
Progress: 150/200 reviews processed...
Progress: 200/200 reviews processed...

✓ Prompt V3 - Structured Complete!
  • Accuracy: 55.8%
  • JSON Validity: 21.5%
  • Valid Predictions: 43/200

ALL EVALUATIONS COMPLETE!


In [64]:
# Build the comparison table: one row per prompt version, with both metrics
# pre-formatted as percentages for display.
comparison = pd.DataFrame([
    {
        'Prompt Version': label,
        'Accuracy': f"{summary['accuracy']:.1%}",
        'JSON Validity': f"{summary['json_validity']:.1%}"
    }
    for label, summary in (
        ('V1 - Basic', eval_v1),
        ('V2 - Few-Shot', eval_v2),
        ('V3 - Structured', eval_v3),
    )
])

print(
    "\n" + "="*60,
    "FINAL COMPARISON TABLE",
    "="*60,
    comparison.to_string(index=False),
    sep="\n",
)




FINAL COMPARISON TABLE
 Prompt Version Accuracy JSON Validity
     V1 - Basic    62.5%         16.0%
  V2 - Few-Shot    60.9%         11.5%
V3 - Structured    55.8%         21.5%



**Verdict:**

**V1 - Baseline Approach:**
I started with a zero-shot prompt (no examples, just instructions) to see how well the model could perform with minimal guidance.

**V2 - Few Shot Learning:**
After analyzing V1's results, I applied few-shot prompting by adding three diverse examples: one negative (1-star), one positive (4-star), and one neutral (3-star). This demonstrates in-context learning, where the model learns the task from examples rather than just instructions.

**V3 - Structured Prompt with Explicit Criteria:**
I hypothesized that giving the model a structured decision framework would improve consistency, so I provided explicit criteria for each rating level to reduce ambiguity. (Note that this prompt asks only for a rating and a one-sentence explanation; it does not elicit step-by-step chain-of-thought reasoning.) However, the results suggest this over-constrained the model, reducing its ability to handle nuanced reviews.

**Key Learning:**
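In this run, the basic zero-shot prompt (V1: 62.5% accuracy) scored slightly higher than the few-shot prompt (V2: 60.9%), and both scored higher than the structured-criteria prompt (V3: 55.8%). The few-shot prompt beating the rubric prompt is consistent with research showing that LLMs learn better from demonstrations than from lengthy instructions, but the examples did not improve on the plain instruction. These accuracies should be read with caution: they are computed only over the responses that yielded a usable prediction (32, 23, and 43 of 200 reviews respectively), so the differences between prompts are based on small, non-identical subsets.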

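The trade-off was JSON validity, which was low for every prompt (V1: 16.0%, V2: 11.5%, V3: 21.5%). V3, the only prompt that explicitly forbids markdown and extra text, produced the most parseable output, while the few-shot prompt produced the least. The prompt with the best format adherence was therefore not the one with the best accuracy, and improving validity is a prerequisite for a trustworthy accuracy comparison.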