In [1]:
%%capture
!pip install -qU torch unsloth transformers datasets bitsandbytes
!pip install -qU seaborn pandas vllm wandb huggingface_hub

In [3]:
import re
import json
import wandb
import random
import pandas as pd
from unsloth import FastLanguageModel
from datasets import Dataset
from huggingface_hub import login
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

platform = "kaggle"

if platform=="kaggle":
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
elif platform=="colab":
    from google.colab import userdata
    hf_token = userdata.get("HF_TOKEN")

In [4]:
# 1. SETUP MODEL (Llama 3 is recommended for high quality)
max_seq_length = 4096
dtype = None               # Auto detection
load_in_4bit = True        # 4bit for memory efficiency

print("1. Loading Unsloth Llama 3 model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Set model for inference
FastLanguageModel.for_inference(model)        # Enable native 2x faster inference

tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id

1. Loading Unsloth Llama 3 model...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6. vLLM: 0.15.0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [5]:
# 2. DEFINE STRICT JSON PROMPT
# We force the model to output a dictionary for sentiment, not a single string.
system_prompt = """You are a synthetic data generator for Aspect-Based Sentiment Analysis (ABSA).
Generate a realistic, detailed Yelp-style review (approx 100 tokens). The type of sentiment should based on reviews. Also MAKE SURE THAT EVERY REVIEW SHOULD BE UNIQUE.

Your output must be a SINGLE VALID JSON object.

CRITICAL FORMATTING RULES:
1. Do NOT use real line breaks (newlines) inside the "review" text. Use literal '\\n' for paragraphs.
2. The JSON structure must be valid.
3. Do NOT output any conversational text, introductions, or markdown formatting (like ```json).
Start the output directly with {

Schema:
{
  "review": "Review text here...",
  "sentiment": { ... },
  "summary": "..."
}

Example Format:
{
  "review": "The pasta was cooked perfectly...",
  "sentiment": {
    "Food": "Positive",
    "Service": "Negative",
    "Ambiance": "Positive",
    "Value": "Neutral",
    "Cleanliness": "Positive"
  },
  "summary": "Great food but terrible service. The atmosphere saved the night."
}"""


# Topics to ensure diversity in the dataset
business_types = [
    "Fine Dining Italian", "Ramen Shop", "Late Night Diner", "Vegan Bakery",
    "Steakhouse", "Fusion Taco Truck", "Hotel Breakfast Buffet", "Craft Cocktail Bar",
    "Seafood Boil", "French Bistro", "Dim Sum Place", "Gastropub"
]

In [6]:
def get_batch_prompts(batch_size):
    """Generates a list of formatted prompts for the batch."""
    prompts = []
    for _ in range(batch_size):
        topic = random.choice(business_types)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Write a review for a {topic} place."},
        ]
        # Get the text prompt without tokenizing yet
        text_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(text_prompt)
    return prompts


def generate_samples(batch_prompts, max_new_tokens):
    # Tokenize (Crucial: padding=True makes them all the same length)
    inputs = tokenizer(
        batch_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to("cuda")

    # Generate (One single call generates BATCH_SIZE reviews at once)
    output = model.generate(
        input_ids = inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,   # Enough for 200 words
        temperature=0.8,       # Slightly higher temp for creativity/diversity
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode Batch
    # We slice outputs to remove the input prompt part
    generated_texts = tokenizer.batch_decode(
        output[:, inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    )
    return generated_texts


# ==== IMPROVED JSON EXTRACTOR ====
def extract_and_parse_json(text):
    """
    Robustly extracts and parses JSON, handling markdown fences
    and common LLM mistakes.
    """
    try:
        # 1. Remove Markdown code blocks if present
        text = re.sub(r'```json\s*', '', text)
        text = re.sub(r'```\s*', '', text)

        # 2. Find the outer braces
        start_idx = text.find('{')
        end_idx = text.rfind('}')

        if start_idx == -1 or end_idx == -1:
            return None

        json_str = text[start_idx : end_idx + 1]

        # 3. Clean common issues (newlines in strings)
        # This regex removes newlines that are NOT between objects to prevent parsing errors
        # (Simple approach: just load it)
        return json.loads(json_str, strict=False)

    except json.JSONDecodeError:
        pass

In [7]:
# ==== UPDATED GENERATION LOOP ====

NUM_SAMPLES = 1000
BATCH_SIZE = 10          # Generate in small batches to save progress
MAX_NEW_TOKENS = 1024    # CRITICAL: Increased to 1024 to fit full reviews


data_buffer = []

print(f"Starting generation of {NUM_SAMPLES} samples...")

pbar = tqdm(range(0, NUM_SAMPLES, BATCH_SIZE))

for _ in pbar:
    try:
        current_batch_prompts = get_batch_prompts(batch_size=BATCH_SIZE)
        
        generated_texts = generate_samples(
            batch_prompts=current_batch_prompts,
            max_new_tokens=MAX_NEW_TOKENS
        )

        for raw_text in generated_texts:
            entry = extract_and_parse_json(raw_text)

            if entry and "review" in entry and "sentiment" in entry:
                data_buffer.append(entry)
            else:
                tqdm.write(f"Failed to parse: {raw_text[:100]}...")
                pass

        # Save periodically
        if len(data_buffer) % 20 == 0:
            pd.DataFrame(data_buffer).to_csv("aspect_reviews_batched.csv", index=False)

        pbar.set_postfix(valid=len(data_buffer))

    except Exception as e:
        tqdm.write(f"Batch Error: {e}")
        continue

print(f"Successfully generated {len(data_buffer)} samples.")
pd.DataFrame(data_buffer).to_csv("aspect_reviews_final.csv", index=False)

Starting generation of 1000 samples...


 10%|█         | 10/100 [17:47<2:47:23, 111.59s/it, valid=99]

Failed to parse: {
  "review": "I stumbled upon this quirky late-night diner at 2 am, desperate for a burger fix afte...


 52%|█████▏    | 52/100 [1:25:42<1:13:05, 91.37s/it, valid=518] 

Failed to parse: {
  "review": "I stumbled upon this cozy Ramen shop on a whim, and I'm so glad I did. The moment I w...


 80%|████████  | 80/100 [2:13:29<34:14, 102.72s/it, valid=797]  

Failed to parse: {
  "review": "I stumbled upon this late-night diner around 2 am after a night out with friends. The...


100%|██████████| 100/100 [2:46:58<00:00, 100.18s/it, valid=996]

Failed to parse: {
  "review": "I stumbled upon this fusion taco truck by accident, and oh boy, it was a game-changer...
Successfully generated 996 samples.





In [12]:
len(data_buffer)

996

In [13]:
data_buffer[3]

{'review': "I strolled into Le Coeur de la Vie, a quaint French bistro on the outskirts of town, expecting a cozy evening with friends. The dimly lit interior, adorned with vintage posters and soft jazz, transported us to the City of Love. Our server, Pierre, was charming and attentive, pouring glasses of wine with a flair. The escargot, cooked to perfection, melted in our mouths, and the ratatouille was rich and flavorful. The standout, however, was the duck confit, tender and juicy with a crispy skin. The only hiccup was the loud and boisterous group in the corner, but our server handled the situation with aplomb. As we lingered over coffee and crème brûlée, the bistro's rustic charm won us over. While the prices were steep, the experience was worth it.",
 'sentiment': {'Food': 'Positive',
  'Service': 'Positive',
  'Ambiance': 'Positive',
  'Value': 'Negative',
  'Cleanliness': 'Positive'},
 'summary': 'A charming French bistro with exceptional food and service, despite some noise a

In [14]:
# 1. Login to Hugging Face
try:
    login(token=hf_token)
    print("Successfully logged in to Hugging Face!")
except Exception as e:
    print(f"Login failed: {e}")
    # Stop here if login fails
    exit()

Successfully logged in to Hugging Face!


In [19]:
try:
    # Convert list of dicts to Dataset object
    print(len(data_buffer))
    dataset = Dataset.from_list(data_buffer)
    print(dataset)
    # Optional: Split into train/test if you want (e.g., 90% train, 10% test)
    dataset = dataset.train_test_split(test_size=0.1)
    print(dataset)    
    print(f"Dataset created with {len(dataset)} rows.")
except Exception as e:
    print(f"Error converting data to Dataset: {e}")
    exit()

996
Dataset({
    features: ['review', 'sentiment', 'summary'],
    num_rows: 996
})
DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'summary'],
        num_rows: 896
    })
    test: Dataset({
        features: ['review', 'sentiment', 'summary'],
        num_rows: 100
    })
})
Dataset created with 2 rows.


In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'summary'],
        num_rows: 896
    })
    test: Dataset({
        features: ['review', 'sentiment', 'summary'],
        num_rows: 100
    })
})

In [21]:
REPO_NAME = "navdeep-singh/sentiment-aware-review-summarization"

# 3. Push to Hub
try:
    print(f"Pushing dataset to: {REPO_NAME}...")
    dataset.push_to_hub(
        REPO_NAME,
        private=False # Set to True if you want a private dataset
    )
    print(f"✅ Successfully published! View it here: https://huggingface.co/datasets/{REPO_NAME}")
except Exception as e:
    print(f"Error pushing to Hub: {e}")

Pushing dataset to: navdeep-singh/sentiment-aware-review-summarization...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Successfully published! View it here: https://huggingface.co/datasets/navdeep-singh/sentiment-aware-review-summarization
