In [12]:
def build_persona_prompt(target_id: str, archetype: str) -> str:
    """Build the LLM prompt that asks for one insurance customer persona.

    The prompt is a single f-string containing the diversity rules, the four
    persona sections with their allowed values, a JSON example used as a
    format reference, and the output rules.

    Args:
        target_id: Persona ID the model is told to use; interpolated into the
            TASK section and into the OUTPUT RULES section.
        archetype: Free-text customer profile; interpolated into the opening
            "PROFILE TO GENERATE" line and into the TASK section.

    Returns:
        The fully formatted prompt text (it starts and ends with a newline
        because the triple-quoted literal does).
    """

    # Doubled braces ({{ and }}) inside the JSON example are f-string escapes
    # that render as literal { and } in the final prompt.
    persona_generation_prompt =f"""
We want to first create a customer persona for an insurance use case.

The persona should be a structured representation of a SINGLE, UNIQUE customer matching the following specific profile:
PROFILE TO GENERATE: {archetype}

Each generated persona MUST be meaningfully different from previously generated personas.

----------------
IMPORTANT DIVERSITY & VARIATION RULES
----------------
- Do NOT reuse the same values across personas
- Avoid copying values from the example persona
- Ensure variation in:
  - commute distance
  - driving environment
  - parking location
  - annual mileage band
  - customer psychology
  - service preferences
  - product preferences
- Each persona should feel like a DIFFERENT real-world individual

----------------
1. Driving & Usage Profile
----------------
This section captures risk exposure and usage patterns. It must include:
- Commute distance (in kilometers, choose a realistic value between 5 and 60)
- Driving environment (city or highway)
- Daily parking location (street or private_garage)
- Business or rideshare usage (true or false)
- Annual mileage band (choose one: "0-5k", "5k-10k", "10k-15k", "15k-20k", "20k+")
- Primary drivers:
  - At least one driver
  - Driver age must be realistic (18–70)
  - Optionally include a second driver if applicable

----------------
2. Customer Segment Profile
----------------
This section captures psychological and experiential context. It must include:
- price_sensitivity ("low", "medium", or "high")
- buyer_type ("first_time" or "experienced")
- switching_reason:
  - Use a realistic reason OR null if buyer_type is "first_time"
- claims_trauma:
  - true ONLY if buyer_type is "experienced"
  - otherwise false

Ensure psychological attributes are logically consistent.

----------------
3. Preferences: How the Customer Wants to Buy & Be Served
----------------
This section is critical for multi-turn selling. It must include:
- communication_style ("concise" or "detailed")
- decision_style ("simple_recommendation" or "multiple_options")
- information_format ("numeric_breakdown" or "plain_language")
- preferred_channel ("whatsapp", "email", or "call")

Service preferences should align with customer personality.

----------------
4. Product Preferences
----------------
This section captures insurance-specific trade-offs. It must include:
- coverage_goal ("minimum", "balanced", or "maximum")
- deductible_appetite ("low", "medium", or "high")
- valued_addons:
  - Choose 1–3 relevant add-ons
  - Do NOT always use the same add-ons

----------------
EXAMPLE PERSONA (FORMAT REFERENCE ONLY — DO NOT COPY VALUES)
----------------
{{
  "persona_id": "P_017",
  "driving_usage_profile": {{
    "commute_distance_km": 28,
    "driving_environment": "city",
    "daily_parking": "street",
    "business_use": false,
    "annual_mileage_band": "10k-15k",
    "primary_drivers": [
      {{ "role": "self", "age": 34 }}
    ]
  }},
  "customer_segment_profile": {{
    "price_sensitivity": "high",
    "buyer_type": "experienced",
    "switching_reason": "renewal_premium_increase",
    "claims_trauma": true
  }},
  "service_preferences": {{
    "communication_style": "concise",
    "decision_style": "simple_recommendation",
    "information_format": "numeric_breakdown",
    "preferred_channel": "whatsapp"
  }},
  "product_preferences": {{
    "coverage_goal": "balanced",
    "deductible_appetite": "medium",
    "valued_addons": [
      "roadside_assistance",
      "windshield_cover"
    ]
  }}
}}

----------------
TASK
----------------
Based on the information available in the persona concept above, generate ONE realistic and DISTINCT customer persona with the ID: {target_id}
The persona must strictly follow the profile: {archetype}

----------------
OUTPUT RULES
----------------
- Output must be STRICTLY valid JSON
- Follow the same structure and key names as the example
- Do NOT include explanations, markdown, or extra text
- Ensure internal consistency across sections
- persona_id MUST be "{target_id}"

Now generate the persona.
"""


    return persona_generation_prompt

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Hugging Face model identifier used for both the tokenizer and the weights.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# `tokenizer` and `model` are module-level objects; the generation loop later
# in this notebook passes them to llm_generate().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # request half-precision weights
    device_map="auto"  # leave device placement to the library
)



In [3]:
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 512):
    """Generate a sampled completion for a single user prompt.

    The prompt is wrapped as a one-turn chat and rendered with the tokenizer's
    own chat template, so the instruction markers always match the loaded
    model instead of being hand-written here.

    Args:
        tokenizer: Hugging Face tokenizer providing ``apply_chat_template``,
            ``__call__`` and ``decode``.
        model: Causal LM exposing ``device`` and ``generate``.
        prompt: User message text to send to the model.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens
        stripped).
    """
    chat = [{"role": "user", "content": prompt}]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered template already carries the BOS marker. Tokenizing with the
    # default add_special_tokens=True would prepend a second BOS, which is what
    # the previous hand-built "<s>[INST] ... [/INST]" string ended up doing.
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        # Reuse EOS as the padding token id for generation.
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


In [13]:
import json
import re
from pathlib import Path


def get_next_persona_id(output_dir: str = "personas"):
    """Return the next unused persona ID for ``output_dir``.

    The directory is created if it does not exist. Every ``P_*.json`` file in
    it is scanned for a number following ``P_``; the result is one more than
    the largest number found (or 1 when none is found), formatted as ``P_``
    plus the number zero-padded to at least three digits.
    """
    personas_dir = Path(output_dir)
    personas_dir.mkdir(parents=True, exist_ok=True)

    id_pattern = re.compile(r"P_(\d+)")
    hits = (id_pattern.search(entry.name) for entry in personas_dir.glob("P_*.json"))
    # Files matching the glob but lacking a numeric part are ignored.
    highest = max((int(hit.group(1)) for hit in hits if hit), default=0)

    return f"P_{highest + 1:03d}"


def _extract_json_object(text: str):
    """Parse ``text`` as JSON, tolerating chatter or code fences around it."""
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError as strict_error:
        # Models often wrap the payload in ```json fences or add a sentence
        # before/after it. Retry from the first "{" and ignore what follows
        # the decoded object.
        start = text.find("{")
        if start == -1:
            raise
        try:
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            # Report the original, strict-parse failure.
            raise strict_error from None
        return parsed


def save_persona_json(raw_output: str, output_dir: str = "personas"):
    """Parse a generated persona and write it to ``<output_dir>/<persona_id>.json``.

    Args:
        raw_output: Text returned by the model; expected to contain one JSON
            object with a ``persona_id`` key. Surrounding markdown fences or
            extra prose are tolerated.
        output_dir: Directory to write into; created if missing.

    Returns:
        Tuple ``(file_path, persona)`` with the written path and parsed dict.

    Raises:
        json.JSONDecodeError: No JSON could be parsed from ``raw_output``.
        ValueError: The parsed JSON is not an object, or ``persona_id`` is not
            a plain identifier that is safe to use as a file name.
        KeyError: The object has no ``persona_id`` key.
        FileExistsError: A file for this persona ID already exists.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    persona = _extract_json_object(raw_output)
    if not isinstance(persona, dict):
        raise ValueError(
            f"Expected a JSON object for the persona, got {type(persona).__name__}."
        )

    persona_id = persona["persona_id"]
    # persona_id comes from model output and becomes part of a path, so reject
    # anything that could escape output_dir (separators, "..", and so on).
    if not isinstance(persona_id, str) or not re.fullmatch(r"[A-Za-z0-9_-]+", persona_id):
        raise ValueError(f"Unsafe or invalid persona_id: {persona_id!r}")

    file_path = output_dir / f"{persona_id}.json"

    try:
        # "x" creates the file exclusively, so the existence check and the
        # write cannot race with another writer.
        with open(file_path, "x", encoding="utf-8") as f:
            json.dump(persona, f, indent=2)
    except FileExistsError:
        raise FileExistsError(
            f"Persona {persona_id} already exists. Rerun to generate a new one."
        ) from None

    return file_path, persona


In [16]:
import random

# Free-text customer profiles; one is picked at random per generated persona
# and interpolated into the persona prompt.
archetypes = [
    "Young single software engineer, drives a sports car, lives in city apartment, price insensitive, prefers email.",
    "Retired couple, 70s, drives a safe sedan, lives in suburbs, low mileage, values phone support.",
    "Freelance gig worker, drives high mileage for work, tight budget, needs flexible payment options.",
    "Suburban family with 3 teenagers, drives a minivan, highly coverage-sensitive, worried about accidents.",
    "College student, first car, old reliable vehicle, price sensitive, prefers WhatsApp.",
    "Wealthy business owner, luxury SUV, wants concierge service and maximum addons, drives occasionally.",
    "Environmentally conscious driver, electric vehicle, city driving, interested in green discounts.",
    "Rural farmer, heavy duty truck, utilitarian, high deductible to save premium, experienced driver.",
    "Sales representative, company car, extremely high highway mileage, needs rental car coverage.",
    "New immigrant family, unfamiliar with insurance, needs plain language explanation, very cautious."
]


In [18]:

# Number of generation attempts in this batch.
N_PERSONAS = 10

n_saved = 0
for _ in range(N_PERSONAS):
    target_id = get_next_persona_id()
    archetype = random.choice(archetypes)
    print(f"Target Archetype: {archetype}")

    prompt = build_persona_prompt(target_id, archetype)
    print(f"Generating persona: {target_id}")

    raw_output = llm_generate(
        tokenizer=tokenizer,
        model=model,
        prompt=prompt
    )

    # Sampled model output is not guaranteed to be valid JSON or to carry a
    # fresh persona_id; skip that attempt instead of aborting the whole batch.
    # json.JSONDecodeError is a ValueError subclass, so it is covered here.
    try:
        file_path, persona = save_persona_json(raw_output)
    except (ValueError, KeyError, FileExistsError) as exc:
        print(f"Skipping {target_id}: {type(exc).__name__}: {exc}")
        continue

    if persona["persona_id"] != target_id:
        # The file is named after the model-supplied ID, not the requested one.
        print(f"Warning: requested {target_id} but model returned {persona['persona_id']}")

    n_saved += 1
    print(f"Persona saved at: {file_path}")
    print(persona)

print(f"Saved {n_saved} of {N_PERSONAS} personas.")


Target Archetype: Environmentally conscious driver, electric vehicle, city driving, interested in green discounts.
Generating persona: P_001


Persona saved at: personas/P_001.json
{'persona_id': 'P_001', 'driving_usage_profile': {'commute_distance_km': 12, 'driving_environment': 'city', 'daily_parking': 'private_garage', 'business_use': False, 'annual_mileage_band': '0-5k', 'primary_drivers': [{'role': 'self', 'age': 45}]}, 'customer_segment_profile': {'price_sensitivity': 'medium', 'buyer_type': 'first_time', 'switching_reason': None, 'claims_trauma': False}, 'service_preferences': {'communication_style': 'detailed', 'decision_style': 'multiple_options', 'information_format': 'plain_language', 'preferred_channel': 'email'}, 'product_preferences': {'coverage_goal': 'maximum', 'deductible_appetite': 'low', 'valued_addons': ['green_discount', 'electric_vehicle_coverage']}}
Target Archetype: New immigrant family, unfamiliar with insurance, needs plain language explanation, very cautious.
Generating persona: P_002
Persona saved at: personas/P_002.json
{'persona_id': 'P_002', 'driving_usage_profile': {'commute_distance_km': 12, '