In [1]:
!pip install torch transformers pandas



In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import re
import time
import random

# --- Step 1: Load the Model and Tokenizer ---
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using M1 GPU (MPS) for acceleration.")
else:
    device = torch.device("cpu")
    print("MPS not available, falling back to CPU.")
model.to(device)

# --- Step 2: Define a Simple Prompt ---
domain_prompt = """
Name: [First Last]
Rating: [1-5]
Comment: [10-20 word review]
"""

# --- Step 3: Function to Generate Synthetic Data ---
def generate_synthetic_review(attempt=0, max_attempts=3):
    """
    Generate a single synthetic customer review using GPT-Neo.
    
    Args:
        attempt (int): Current retry attempt.
        max_attempts (int): Max retries to avoid infinite loops.
    
    Returns:
        dict: Parsed review with 'Name', 'Rating', and 'Comment'.
    """
    if attempt >= max_attempts:
        print(f"Max attempts reached. Using fallback with variation.")
        return {
            "Name": f"User{random.randint(1, 100)} Smith",
            "Rating": random.randint(1, 5),
            "Comment": "Generic review with some variation here."
        }
    
    print("Starting generation...")
    start_time = time.time()
    
    # Add a random seed to prompt
    seed = random.randint(0, 10000)
    prompt = domain_prompt + f"\nSeed: {seed}\n"
    
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    print("Inputs prepared.")
    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=150,    # Increased to ensure full review
            num_beams=4,      # More beams for structure
            temperature=1.2,  # Higher for creativity
            top_k=50,
            top_p=0.9,       # Add top-p sampling
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    print(f"Generation completed in {time.time() - start_time:.2f} seconds.")
    
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Generated text:\n{generated_text}\n")
    
    # Parse the generated text
    try:
        lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
        print(f"Lines after filtering:\n{lines}\n")
        
        # Look for Name, Rating, Comment anywhere in the text
        name = None
        rating = None
        comment = None
        for line in lines:
            if re.match(r"Name: .+", line):
                name = re.search(r"Name: (.+)", line).group(1).strip()
            elif re.match(r"Rating: \d", line):
                rating = int(re.search(r"Rating: (\d)", line).group(1))
            elif re.match(r"Comment: .+", line):
                comment = re.search(r"Comment: (.+)", line).group(1).strip()
        
        if not all([name, rating, comment]):
            print("Incomplete review, retrying...")
            return generate_synthetic_review(attempt + 1, max_attempts)
        
        if rating < 1 or rating > 5:
            rating = max(1, min(5, rating))  # Clamp rating
        print(f"Parsed review: Name={name}, Rating={rating}, Comment={comment}")
        return {"Name": name, "Rating": rating, "Comment": comment}
    except (AttributeError, IndexError, ValueError) as e:
        print(f"Error parsing review: {e}. Retrying...")
        return generate_synthetic_review(attempt + 1, max_attempts)

# --- Step 4: Generate and Save Dataset ---
def create_synthetic_dataset(num_rows=10):
    """
    Generate a dataset of synthetic reviews and save to CSV.
    
    Args:
        num_rows (int): Number of rows to generate (default 10).
    """
    reviews = []
    for i in range(num_rows):
        print(f"\nGenerating review {i+1}/{num_rows}")
        review = generate_synthetic_review()
        reviews.append(review)
    
    df = pd.DataFrame(reviews, columns=["Name", "Rating", "Comment"])
    df.to_csv("synthetic_customer_reviews.csv", index=False)
    print(f"\nGenerated {len(df)} synthetic reviews. Saved to 'synthetic_customer_reviews.csv'.")
    
    print("\nDataset Statistics:")
    print(f"Mean Rating: {df['Rating'].mean():.2f}")
    print(f"Rating Variance: {df['Rating'].var():.2f}")
    print(f"Sample Review:\n{df.iloc[0]}")

# --- Step 5: Run the Generator ---
if __name__ == "__main__":
    print("Starting Synthetic Data Generation...")
    create_synthetic_dataset(num_rows=10)

Using M1 GPU (MPS) for acceleration.
Starting Synthetic Data Generation...

Generating review 1/10
Starting generation...
Inputs prepared.
Generation completed in 24.76 seconds.
Generated text:

Name: [First Last]
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7009
Rating: [1-5]
Comment: [10-20 word review]

Lines after filtering:
['Name: [First Last]', 'Rating: [1-5]', 'Comment: [10-20 word review]', 'Seed: 7009', 'Rating: [1-5]', 'Comment: [10-20 word review]']

Incomplete review, retrying...
Starting generation...
Inputs prepared.
Generation completed in 9.52 seconds.
Generated text:

Name: [First Last]
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7077
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7077
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7077
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7077
Rating: [1-5]
Comment: [10-20 word review]

Seed: 7077
Rating: [1-5]
Comment: [10

Lines after filtering:
['Name: [First Last]', 'Rating: [1-5]', 'Comment: [10-20 