## 1. Define strict classes for structured outputs
Includes:
- Prompting techniques: Prompt engineering strategies
- Prompt type: General category of task that the prompt falls into
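- Prompt Example: Structured record for one example (task, complexity, bad prompt, good prompt, expected answer, techniques, prompt type)
- Prompt Example Response: Schema of the JSON response requested from the model for each example
- Maps: Map each prompt type to its preferred prompting techniques, and each technique name to its enum member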

In [1]:
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel, Field

class PromptingTechnique(Enum):
    """Prompt engineering strategies; every member's value is identical to its name."""
    # Asking the model to perform a task without examples or specific guidance
    GENERAL_ZERO_SHOT = "GENERAL_ZERO_SHOT"
    # Providing one or a few examples to guide the model's response
    ONE_SHOT_FEW_SHOT = "ONE_SHOT_FEW_SHOT"
    # Setting overall behavior instructions for the model
    SYSTEM_PROMPTING = "SYSTEM_PROMPTING"
    # Instructing the model to assume a specific role or persona
    ROLE_PROMPTING = "ROLE_PROMPTING"
    # Providing relevant context to inform the model's response
    CONTEXTUAL_PROMPTING = "CONTEXTUAL_PROMPTING"
    # Asking the model to take a broader perspective before answering
    STEP_BACK_PROMPTING = "STEP_BACK_PROMPTING"
    # Guiding the model to show its reasoning process step by step
    CHAIN_OF_THOUGHT = "CHAIN_OF_THOUGHT"
    # Having the model generate multiple solutions and select the most consistent one
    SELF_CONSISTENCY = "SELF_CONSISTENCY"
    # Exploring multiple reasoning paths and selecting the most promising one
    TREE_OF_THOUGHTS = "TREE_OF_THOUGHTS"
    # Alternating between reasoning and taking actions based on that reasoning
    REACT_REASON_ACT = "REACT_REASON_ACT"
    # Specific techniques for generating or explaining code
    CODE_PROMPTING = "CODE_PROMPTING"

class PromptType(Enum):
    """General category of task a prompt falls into; values are human-readable labels."""
    INSTRUCTIONAL = "Instructional Prompts"
    INFORMATIONAL = "Informational or Factual Prompts"
    CONVERSATIONAL = "Conversational Prompts"
    CREATIVE_WRITING = "Creative Writing Prompts"
    COMPLETION = "Completion Prompts"
    QUESTION_ANSWERING = "Question-Answering Prompts"
    ROLE_PLAYING = "Role-Playing Prompts"
    PROGRAMMING_CODE_GENERATION = "Programming and Code Generation Prompts"
    SUMMARIZATION = "Summarization Prompts"
    TRANSLATION = "Translation Prompts"
    ANALYSIS_CRITIQUE = "Analysis or Critique Prompts"
    COMPARISON = "Comparison Prompts"
    DATA_EXTRACTION = "Data Extraction Prompts"
    CODE_EXPLANATION = "Code Explanation Prompts"
    STYLE_TONE_CHANGE = "Style or Tone Change Prompts"
    CLASSIFICATION_TAGGING = "Classification and Tagging Prompts"

class PromptExample(BaseModel):
    """One generated record: a task with a weak prompt, an improved prompt and metadata."""
    task_description: str  # The task the prompts are written for
    complexity: str = Field(..., pattern="^(low|medium|high)$")  # Required; must be exactly "low", "medium" or "high"
    bad_prompt: str  # Prompt expected to produce a weak or vague answer
    good_prompt: str  # Improved prompt that applies one or more prompting techniques
    expected_answer: str  # Description of the ideal model output
    prompting_techniques: List[PromptingTechnique]  # Techniques used by the good prompt
    prompt_type: PromptType  # Task category this example belongs to
    notes: Optional[str] = None  # Extra notes or observations about the example

import uuid

# Names of all prompting techniques, in enum declaration order
techniques = [technique.name for technique in PromptingTechnique]

# Define the best prompting techniques for each prompt type.
# Every prompt type maps to a list of three PromptingTechnique members.
prompt_type_to_techniques = {
    PromptType.INSTRUCTIONAL: [PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.STEP_BACK_PROMPTING, PromptingTechnique.CONTEXTUAL_PROMPTING],
    PromptType.INFORMATIONAL: [PromptingTechnique.GENERAL_ZERO_SHOT, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.SYSTEM_PROMPTING],
    PromptType.CONVERSATIONAL: [PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.CHAIN_OF_THOUGHT],
    PromptType.CREATIVE_WRITING: [PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.ONE_SHOT_FEW_SHOT, PromptingTechnique.TREE_OF_THOUGHTS],
    PromptType.COMPLETION: [PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.ONE_SHOT_FEW_SHOT, PromptingTechnique.GENERAL_ZERO_SHOT],
    PromptType.QUESTION_ANSWERING: [PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.GENERAL_ZERO_SHOT],
    PromptType.ROLE_PLAYING: [PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.SYSTEM_PROMPTING],
    PromptType.PROGRAMMING_CODE_GENERATION: [PromptingTechnique.CODE_PROMPTING, PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.ONE_SHOT_FEW_SHOT],
    PromptType.SUMMARIZATION: [PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.STEP_BACK_PROMPTING],
    PromptType.TRANSLATION: [PromptingTechnique.ONE_SHOT_FEW_SHOT, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.SELF_CONSISTENCY],
    PromptType.ANALYSIS_CRITIQUE: [PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.STEP_BACK_PROMPTING],
    PromptType.COMPARISON: [PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.ONE_SHOT_FEW_SHOT],
    PromptType.DATA_EXTRACTION: [PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.ONE_SHOT_FEW_SHOT, PromptingTechnique.CODE_PROMPTING],
    PromptType.CODE_EXPLANATION: [PromptingTechnique.CODE_PROMPTING, PromptingTechnique.CHAIN_OF_THOUGHT, PromptingTechnique.STEP_BACK_PROMPTING],
    PromptType.STYLE_TONE_CHANGE: [PromptingTechnique.ROLE_PROMPTING, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.ONE_SHOT_FEW_SHOT],
    PromptType.CLASSIFICATION_TAGGING: [PromptingTechnique.ONE_SHOT_FEW_SHOT, PromptingTechnique.CONTEXTUAL_PROMPTING, PromptingTechnique.SYSTEM_PROMPTING]
}

# Lookup table from a technique's name to its PromptingTechnique member
prompt_map = {technique.name: technique for technique in PromptingTechnique}

# The class below refers to the `typing` module by name, but only `List` and
# `Optional` were imported from it earlier, so import the module itself here.
import typing


# Define the schema for our prompt example response
class PromptExampleResponse(typing.TypedDict):
    """Shape of the JSON object the model is asked to return for one example.

    `prompting_techniques` holds technique names as plain strings; they are
    converted to enum members later when a PromptExample is built.
    """
    bad_prompt: str
    good_prompt: str
    expected_answer: str
    prompting_techniques: list[str]



## 2. Set up LLMs and Define a Prompt Outline and a specific format for prompting Gemini 2.0 Flash

In [None]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display
from google.api_core import retry
import os
# Import environment variables from env.json
import json
import time 

# Read the project settings and API keys from the shared env.json file
with open('../../env.json', 'r') as f:
    env_vars = json.load(f)

# Export the Google Cloud / GenAI settings as process environment variables.
# The API key used is the second entry (index 1) of the "google_api_keys" list.
os.environ.update({
    "GOOGLE_CLOUD_PROJECT": env_vars["google_cloud_project"],
    "GOOGLE_CLOUD_LOCATION": env_vars["google_cloud_location"],
    "GOOGLE_GENAI_USE_VERTEXAI": env_vars["google_genai_use_vertexai"],
    "GOOGLE_API_KEY": env_vars["google_api_keys"][1],
})
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

def is_retriable(e):
    """Return True when `e` is a GenAI APIError with status code 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Wrap generate_content so that calls failing with a retriable error are retried
genai.models.Models.generate_content = retry.Retry(
    predicate=is_retriable)(genai.models.Models.generate_content)

client = genai.Client(api_key=GOOGLE_API_KEY)

# Use a specific model
# NOTE: the google-genai SDK has no `GenerativeModel` class (that belongs to the
# legacy `google.generativeai` package), so calling it here raised AttributeError.
# Models are selected per request via `client.models.generate_content(model=...)`,
# so only the model identifier is kept.
model = 'models/gemini-2.5-pro-preview-03-25'  # base untuned model


def build_llm_prompt(task_description, complexity, prompt_type):
    """Compose the instruction text asking the model for a bad/good prompt pair.

    One or two of the techniques mapped to `prompt_type` are picked at random
    and listed in the prompt as the preferred techniques.

    Args:
        task_description: Task the generated prompts should address.
        complexity: Complexity label inserted verbatim into the prompt.
        prompt_type: Key into `prompt_type_to_techniques`.

    Returns:
        The prompt text with leading and trailing whitespace removed.
    """
    import random

    # Upper-cased values of the techniques recommended for this prompt type
    candidate_names = [
        member.value.upper() for member in prompt_type_to_techniques[prompt_type]
    ]

    # Randomize the order, then keep the first one or two as the preferred list
    random.shuffle(candidate_names)
    selected_techniques = candidate_names[:random.randint(1, 2)]

    prompt_text = f"""
    You are a prompt engineering assistant.

    TASK: {task_description}
    COMPLEXITY: {complexity}

    1. Generate a bad prompt that would likely get a weak or vague answer.
    2. Generate an improved prompt that uses one or more prompting techniques, choosing the best techniques for the task, preferring the following list: {selected_techniques}
    3. Describe the expected answer (what an ideal output from an LLM would be).

    Return the result in JSON like this:
    {{
        "bad_prompt": "...",
        "good_prompt": "...",
        "expected_answer": "...",
        "prompting_techniques": ["..."]
    }}
    """
    return prompt_text.strip()

# Preview the prompt text produced for a sample summarization task
print(build_llm_prompt("Summarize the key points of climate change", "medium", PromptType.SUMMARIZATION))


def query_gemini(prompt: str, max_retries=3, model_name: str = 'gemini-2.0-flash') -> Optional[str]:
    """Query the Gemini model for a JSON answer, retrying on errors and empty replies.

    Args:
        prompt: Text sent as the request contents.
        max_retries: Maximum number of attempts before giving up.
        model_name: Model identifier passed to `generate_content`.

    Returns:
        The response text (requested as JSON matching PromptExampleResponse),
        or None when every attempt failed or returned nothing.
    """
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model_name,
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    response_mime_type="application/json",
                    response_schema=PromptExampleResponse,
                ),
                contents=prompt
            )

            text = getattr(response, 'text', None) if response else None
            if text:
                return text

            print(f"Empty response received on attempt {attempt+1}")
            delay = 2

        except Exception as e:
            print(f"API error on attempt {attempt+1}: {type(e).__name__}: {str(e)}")
            delay = 3 * (attempt + 1)  # Linear backoff: 3s, 6s, 9s, ...

        # Waiting only helps when another attempt follows
        if attempt < max_retries - 1:
            time.sleep(delay)

    print("All query attempts failed")
    return None


## 3. Use standardized prompts to get a structured prompt example from Gemini

In [None]:
# Number of tasks for which every attempt failed; read by the generation loop
failed_attempts = 0


def create_prompt_example_with_schema(task_description: str, complexity: str, prompt_type: PromptType, max_attempts=3) -> Optional[PromptExample]:
    """Ask the model for one bad/good prompt pair and wrap it in a PromptExample.

    Args:
        task_description: Task the prompts should address.
        complexity: "low", "medium" or "high" (validated by PromptExample).
        prompt_type: Task category; also selects the preferred techniques.
        max_attempts: Number of build/query/parse rounds before giving up.

    Returns:
        A validated PromptExample, or None when all attempts failed. Each
        total failure increments the module-level `failed_attempts` counter.
    """
    # The counter is a module-level variable; without this declaration the
    # `+= 1` below makes the name local and raises UnboundLocalError.
    global failed_attempts

    required_fields = ["bad_prompt", "good_prompt", "expected_answer", "prompting_techniques"]

    for attempt in range(max_attempts):
        try:
            prompt = build_llm_prompt(task_description, complexity, prompt_type)
            data = query_gemini(prompt)

            # Check if we received a valid response
            if data is None:
                print(f"Attempt {attempt+1}: Received None from API for task: {task_description}")
                continue

            # Parse the JSON response if it's returned as a string
            if isinstance(data, str):
                try:
                    data = json.loads(data)
                except json.JSONDecodeError:
                    print(f"Invalid JSON received: {data[:100]}...")
                    continue

            # Validate required fields exist
            missing = [f for f in required_fields if f not in data]
            if missing:
                print(f"Missing fields in response: {missing}")
                continue

            return PromptExample(
                task_description=task_description,
                complexity=complexity,
                bad_prompt=data["bad_prompt"],
                good_prompt=data["good_prompt"],
                expected_answer=data["expected_answer"],
                # Normalize labels such as "chain of thought" or "Chain-of-Thought"
                # to the enum-style key; unknown labels raise KeyError and trigger a retry.
                prompting_techniques=[
                    prompt_map[t.strip().upper().replace(" ", "_").replace("-", "_")]
                    for t in data["prompting_techniques"]
                ],
                prompt_type=prompt_type
            )
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {str(e)}")
            time.sleep(2)  # Add delay between retries

    # If we get here, all attempts failed
    print(f"Failed to create example after {max_attempts} attempts: {task_description}")
    failed_attempts += 1
    return None

# Smoke test: request a single example for a sample summarization task
create_prompt_example_with_schema(
        task_description="Summarize the key points of climate change",
        complexity="medium",
        prompt_type=PromptType.SUMMARIZATION
)

## 4. Define Task & Complexity distributions for training

### Complexity Distribution
- **Medium complexity**: 50% (most common real-world case)
- **Low complexity**: 25% (simpler tasks)
- **High complexity**: 25% (challenging scenarios)

### Prompt Type Distribution
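Based on common usage patterns in AI interactions. The percentages below sum to 110%, so they act as relative weights that are normalized when sampling: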

#### Higher frequency (8-12% each):
- INFORMATIONAL (12%)
- QUESTION_ANSWERING (12%)
- INSTRUCTIONAL (10%)
- SUMMARIZATION (10%)
- ANALYSIS_CRITIQUE (8%)

#### Medium frequency (5-8% each):
- PROGRAMMING_CODE_GENERATION (8%)
- CREATIVE_WRITING (7%)
- CONVERSATIONAL (7%)
- COMPARISON (6%)
- CODE_EXPLANATION (6%)

#### Lower frequency (2-5% each):
- ROLE_PLAYING (5%)
- DATA_EXTRACTION (5%)
- STYLE_TONE_CHANGE (4%)
- TRANSLATION (4%)
- COMPLETION (3%)
- CLASSIFICATION_TAGGING (3%)

In [None]:
import pandas as pd
import random
from enum import Enum
import csv
from typing import List, Optional
from tqdm import tqdm

# Define distributions based on the provided percentages
# Sampling weight for each complexity level (these sum to 1.0)
complexity_distribution = {
    "low": 0.25,
    "medium": 0.50,
    "high": 0.25
}

# Sampling weight for each prompt type.
# NOTE: these values sum to 1.10 rather than 1.0. They are passed as `weights`
# to random.choices, which treats weights as relative, so the effective
# probability of each type is its weight divided by 1.10.
prompt_type_distribution = {
    PromptType.INFORMATIONAL: 0.12,
    PromptType.QUESTION_ANSWERING: 0.12,
    PromptType.INSTRUCTIONAL: 0.10,
    PromptType.SUMMARIZATION: 0.10,
    PromptType.ANALYSIS_CRITIQUE: 0.08,
    PromptType.PROGRAMMING_CODE_GENERATION: 0.08,
    PromptType.CREATIVE_WRITING: 0.07,
    PromptType.CONVERSATIONAL: 0.07,
    PromptType.COMPARISON: 0.06,
    PromptType.CODE_EXPLANATION: 0.06,
    PromptType.ROLE_PLAYING: 0.05,
    PromptType.DATA_EXTRACTION: 0.05,
    PromptType.STYLE_TONE_CHANGE: 0.04,
    PromptType.TRANSLATION: 0.04,
    PromptType.COMPLETION: 0.03,
    PromptType.CLASSIFICATION_TAGGING: 0.03
}

# Helper that flattens PromptExample objects into plain rows for a DataFrame
def _examples_to_records(examples):
    """Return one dict per example, with enum fields stored by member name."""
    return [
        {
            'task_description': item.task_description,
            'complexity': item.complexity,
            'bad_prompt': item.bad_prompt,
            'good_prompt': item.good_prompt,
            'expected_answer': item.expected_answer,
            'prompting_techniques': [t.name for t in item.prompting_techniques],
            'prompt_type': item.prompt_type.name,
            'notes': item.notes
        }
        for item in examples
    ]


# Function to generate examples based on distributions
def generate_examples(num_examples=1000, checkpoint_every=100):
    """Generate prompt examples sampled from the complexity and prompt-type distributions.

    The loop stops when `num_examples` valid examples exist, when ten times
    that many attempts have been made, or when the module-level
    `failed_attempts` counter reaches 30.

    Args:
        num_examples: Number of valid examples to collect.
        checkpoint_every: Write a checkpoint CSV of everything collected so far
            each time the example count reaches a multiple of this value;
            0 or None disables checkpointing.

    Returns:
        The list of PromptExample objects collected (possibly fewer than requested).
    """
    examples = []
    total_attempts = 0
    max_attempts = num_examples * 10  # Set a maximum to prevent infinite loops

    # Sample from the declared distributions instead of repeating their values here
    complexity_levels = list(complexity_distribution)
    complexity_weights = list(complexity_distribution.values())
    prompt_types = list(prompt_type_distribution)
    prompt_type_weights = list(prompt_type_distribution.values())

    # Create a tqdm progress bar
    with tqdm(total=num_examples) as pbar:
        while len(examples) < num_examples and total_attempts < max_attempts and failed_attempts < 30:
            total_attempts += 1

            # Select complexity based on distribution
            complexity = random.choices(complexity_levels, weights=complexity_weights, k=1)[0]

            # Select prompt type based on distribution
            prompt_type = random.choices(prompt_types, weights=prompt_type_weights)[0]

            # Create example
            example = create_prompt_example_with_schema(
                task_description=generate_task_description(prompt_type, complexity),
                complexity=complexity,
                prompt_type=prompt_type,
            )

            # Only add if we got a valid example
            if example is None:
                continue
            examples.append(example)
            pbar.update(1)

            # Save all examples collected so far every `checkpoint_every` examples
            if checkpoint_every and len(examples) % checkpoint_every == 0:
                df = pd.DataFrame(_examples_to_records(examples))
                df.to_csv(f"examples_checkpoint_{len(examples)}.csv", index=False)

    return examples

# Helper function to generate task descriptions based on prompt type and complexity
def generate_task_description(prompt_type, complexity):
    """Build a random task description for a prompt type and complexity level.

    One template is drawn for `prompt_type` (falling back to "Explain {topic}"
    when the type has no entry) and one topic is drawn from the list for
    `complexity`; the template's placeholders are then filled in.

    Args:
        prompt_type: Key into the template table (a PromptType member).
        complexity: "low", "medium" or "high"; any other value raises KeyError.

    Returns:
        The formatted task description string.
    """
    # Dictionary of task templates by prompt type
    task_templates = {
        PromptType.INFORMATIONAL: [
            "Explain {topic} in {detail} detail",
            "Provide information about {topic}",
            "Describe how {topic} works",
            "Give a comprehensive overview of {topic}",
            "Share essential knowledge about {topic}"
        ],
        PromptType.QUESTION_ANSWERING: [
            "Answer questions about {topic}",
            "Respond to FAQs about {topic}",
            "Address common questions on {topic}",
            "Provide solutions to questions regarding {topic}",
            "Clarify inquiries related to {topic}"
        ],
        PromptType.INSTRUCTIONAL: [
            "Provide step-by-step instructions for {topic}",
            "Create a guide for {topic}",
            "Explain how to {topic}",
            "Develop a tutorial on {topic}",
            "Outline the process for {topic}"
        ],
        PromptType.SUMMARIZATION: [
            "Summarize the key points of {topic}",
            "Create a concise summary of {topic}",
            "Provide a brief overview of {topic}",
            "Condense the main ideas about {topic}",
            "Distill the essential information about {topic}"
        ],
        PromptType.ANALYSIS_CRITIQUE: [
            "Analyze the implications of {topic}",
            "Critique the approach to {topic}",
            "Evaluate the strengths and weaknesses of {topic}",
            "Examine the underlying factors of {topic}",
            "Assess the effectiveness of {topic}"
        ],
        PromptType.PROGRAMMING_CODE_GENERATION: [
            "Write code to implement {topic}",
            "Generate a function that {topic}",
            "Create a program for {topic}",
            "Develop an algorithm to solve {topic}",
            "Code a solution for {topic}"
        ],
        PromptType.CREATIVE_WRITING: [
            "Write a short story about {topic}",
            "Create a poem inspired by {topic}",
            "Develop a creative narrative about {topic}",
            "Compose a fictional scenario involving {topic}",
            "Craft an imaginative piece centered on {topic}"
        ],
        PromptType.CONVERSATIONAL: [
            "Have a conversation about {topic}",
            "Discuss the nuances of {topic}",
            "Chat about recent developments in {topic}",
            "Engage in a dialogue regarding {topic}",
            "Talk through the important aspects of {topic}"
        ],
        PromptType.COMPARISON: [
            "Compare and contrast {topic} with alternatives",
            "Highlight the differences between approaches to {topic}",
            "Analyze the pros and cons of different {topic} methods",
            "Distinguish between various {topic} techniques",
            "Evaluate competing perspectives on {topic}"
        ],
        PromptType.CODE_EXPLANATION: [
            "Explain how this code for {topic} works",
            "Break down the functionality of {topic} code",
            "Clarify the purpose of each section in this {topic} implementation",
            "Decode the logic behind this {topic} algorithm",
            "Interpret the structure of this {topic} codebase"
        ],
        PromptType.ROLE_PLAYING: [
            "Act as an expert in {topic} and provide insights",
            "Pretend you're a {topic} specialist and explain concepts",
            "Take on the role of a {topic} consultant",
            "Assume the identity of a {topic} researcher and share findings",
            "Embody a {topic} professional and offer advice"
        ],
        PromptType.DATA_EXTRACTION: [
            "Extract key information from this {topic} text",
            "Identify and list important points about {topic}",
            "Pull out relevant data about {topic} from this content",
            "Isolate the critical facts about {topic} in this document",
            "Retrieve the essential details about {topic} from this material"
        ],
        PromptType.STYLE_TONE_CHANGE: [
            "Rewrite this {topic} content in a professional tone",
            "Convert this technical {topic} explanation into simple language",
            "Transform this casual description of {topic} into academic writing",
            "Adapt this {topic} text for a different audience",
            "Modify the tone of this {topic} content to be more engaging"
        ],
        PromptType.TRANSLATION: [
            "Translate this {topic} content between languages",
            "Convert this {topic} explanation into another language",
            "Provide a translation of this {topic} text",
            "Render this {topic} information in a different language",
            "Transform this {topic} document into its equivalent in another language"
        ],
        PromptType.COMPLETION: [
            "Complete this paragraph about {topic}",
            "Finish this sentence about {topic}",
            "Continue this description of {topic}",
            "Extend this partial explanation of {topic}",
            "Fill in the missing details about {topic}"
        ],
        PromptType.CLASSIFICATION_TAGGING: [
            "Categorize these {topic} items",
            "Tag this content related to {topic}",
            "Classify these {topic} examples",
            "Organize these {topic} elements into groups",
            "Label these {topic} instances by type"
        ]
    }
    
    # Topics by complexity
    topics = {
        "low": ["basic arithmetic", "primary colors", "simple recipes", "daily routines", 
                "weather patterns", "basic geography", "common animals", "popular sports",
                "household chores", "traffic signs", "telling time", "family relationships",
                "basic hygiene", "playground games", "farm animals", "fruits and vegetables",
                "modes of transportation", "body parts", "seasons of the year", "classroom objects",
                "community helpers", "basic shapes", "counting numbers", "days of the week",
                "months of the year", "basic emotions", "five senses", "nursery rhymes",
                "bedtime routines", "breakfast foods", "clothing items", "musical instruments",
                "zoo animals", "ocean creatures", "garden tools", "kitchen utensils",
                "playground equipment", "grocery shopping", "basic manners", "neighborhood places",
                "holiday traditions", "pet care", "basic safety rules", "healthy habits",
                "simple crafts", "fairy tales", "basic tools", "school supplies",
                "outdoor activities", "indoor games", "basic first aid", "simple machines",
                "recycling basics", "dental care", "basic nutrition", "water cycle",
                "plant growth", "insect life cycles", "basic astronomy", "dinosaur types",
                "simple maps", "basic money concepts", "telling jokes", "making friends",
                "sharing toys", "following directions", "basic cooking", "simple science experiments",
                "letter recognition", "number patterns", "rhyming words", "opposites",
                "animal sounds", "basic art techniques", "simple songs", "finger games",
                "basic dance moves", "playground rules", "basic computer skills", "simple board games",
                "card games", "basic knots", "paper folding", "bubble blowing",
                "shadow puppets", "rock collecting", "cloud watching", "bird feeding",
                "seed planting", "leaf collecting", "puddle jumping", "sandcastle building",
                "snow activities", "bicycle riding", "jump rope games", "ball games",
                "coloring techniques", "basic drawing", "finger painting", "nature walks"],
        "medium": ["climate change", "renewable energy", "digital marketing", "healthy eating", 
                  "financial planning", "machine learning basics", "world history", "psychology concepts",
                  "interior design", "automotive maintenance", "social media strategy", "photography techniques",
                  "public speaking", "creative writing", "home gardening", "personal fitness",
                  "career development", "relationship advice", "parenting strategies", "home renovation",
                  "travel planning", "cooking techniques", "wine appreciation", "fashion trends",
                  "personal finance", "meditation practices", "yoga styles", "musical theory",
                  "film analysis", "literary criticism", "art history", "web development",
                  "graphic design", "content marketing", "project management", "team leadership",
                  "negotiation skills", "conflict resolution", "time management", "stress management",
                  "environmental sustainability", "urban planning", "political systems", "economic principles",
                  "cultural anthropology", "sociology concepts", "educational theory", "cognitive biases",
                  "nutritional science", "exercise physiology", "alternative medicine", "mental health awareness",
                  "sustainable agriculture", "biodiversity conservation", "renewable resources", "waste management",
                  "water conservation", "air quality management", "ecosystem services", "habitat restoration",
                  "wildlife conservation", "marine biology", "forest management", "soil science",
                  "weather forecasting", "climate adaptation", "disaster preparedness", "emergency response",
                  "first aid techniques", "infectious diseases", "immune system function", "genetic inheritance",
                  "evolutionary biology", "cellular processes", "molecular biology", "biochemistry basics",
                  "organic chemistry", "inorganic chemistry", "physical chemistry", "analytical methods",
                  "statistical analysis", "data visualization", "market research", "consumer behavior",
                  "brand management", "advertising strategies", "sales techniques", "customer service",
                  "supply chain management", "operations research", "quality control", "product development",
                  "user experience design", "interface design", "software development", "network security",
                  "cloud computing", "database management", "artificial intelligence", "virtual reality",
                  "augmented reality", "blockchain technology", "cryptocurrency basics", "internet of things"],
        "high": ["quantum computing", "geopolitical conflicts", "advanced economics", 
                "philosophical theories", "complex ethical dilemmas", "cutting-edge research", 
                "specialized medical procedures", "advanced mathematical concepts",
                "theoretical physics", "neuroscience research", "cryptographic protocols", "systems biology",
                "computational linguistics", "advanced artificial intelligence", "nanotechnology applications", "space colonization",
                "consciousness theories", "quantum field theory", "string theory", "dark matter research",
                "genetic engineering ethics", "climate modeling", "advanced robotics", "synthetic biology",
                "cognitive neuroscience", "advanced cryptography", "quantum cryptography", "multiverse theories",
                "advanced game theory", "complex systems theory", "chaos theory applications", "advanced topology",
                "abstract algebra", "non-Euclidean geometry", "differential geometry", "algebraic topology",
                "functional analysis", "measure theory", "complex analysis", "number theory",
                "combinatorial optimization", "computational complexity theory", "algorithm design", "machine learning theory",
                "deep learning architectures", "reinforcement learning", "natural language processing", "computer vision algorithms",
                "quantum machine learning", "quantum algorithms", "quantum error correction", "topological quantum computing",
                "molecular dynamics simulation", "computational chemistry", "protein folding prediction", "drug discovery methods",
                "CRISPR gene editing", "stem cell therapy", "regenerative medicine", "personalized genomics",
                "brain-computer interfaces", "neural prosthetics", "optogenetics", "connectomics",
                "advanced cosmology", "gravitational wave astronomy", "black hole thermodynamics", "quantum gravity theories",
                "supersymmetry", "M-theory", "loop quantum gravity", "causal set theory",
                "metamaterials design", "superconductivity research", "quantum materials", "topological insulators",
                "fusion energy research", "advanced nuclear reactor design", "smart grid technology", "carbon capture methods",
                "geoengineering proposals", "advanced climate modeling", "complex systems ecology", "theoretical ecology",
                "evolutionary game theory", "population genetics", "phylogenetic analysis", "metagenomics",
                "microbiome research", "epigenetic mechanisms", "RNA biology", "structural biochemistry",
                "immunotherapy development", "precision medicine", "advanced surgical robotics", "neural circuit manipulation",
                "consciousness research", "cognitive architectures", "embodied cognition", "extended mind theory",
                "moral philosophy", "metaphysics", "epistemology", "philosophy of science",
                "philosophy of mind", "political philosophy", "bioethics", "neuroethics"]
    }
    # Word substituted for the {detail} placeholder (only some templates use it)
    detail_level = {
        "low": "basic",
        "medium": "moderate",
        "high": "extensive"
    }
    
    # Select random template and topic based on complexity
    template = random.choice(task_templates.get(prompt_type, ["Explain {topic}"]))
    topic = random.choice(topics[complexity])
    
    # Generate task description
    return template.format(topic=topic, detail=detail_level[complexity])

## 5. Generate Prompt Engineering examples 

In [None]:
# Generate 1450 examples
dataset_examples = generate_examples(1450)

# Convert to DataFrame: one plain-dict row per example, enums stored by member name
data = [
    {
        'task_description': example.task_description,
        'complexity': example.complexity,
        'bad_prompt': example.bad_prompt,
        'good_prompt': example.good_prompt,
        'expected_answer': example.expected_answer,
        'prompting_techniques': [t.name for t in example.prompting_techniques],
        'prompt_type': example.prompt_type.name,
        'notes': example.notes
    }
    for example in dataset_examples
]

df = pd.DataFrame(data)

# Display sample of the dataset
print(f"Generated {len(df)} examples")
print("\nSample of the dataset:")
display(df.head())

# Save to CSV
import os

dataset_path = '../data/prompt_examples_dataset.csv'

# Create the output directory first so a long generation run is not lost
# to a missing-directory error at save time
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

# Save the DataFrame to CSV and report the path that was actually written
df.to_csv(dataset_path, index=False)
print(f"\nDataset saved to '{dataset_path}'")

## 6. Convert to desired JSONL format

In [None]:
# Convert the dataset to JSONL format for Vertex AI fine-tuning
import json

# Create the list of records with the required structure
fine_tuning_data = []

for _, row in df.iterrows():
    # The weak prompt is the user turn; the improved prompt is the reply the
    # tuned model should learn to produce
    weak_prompt = row['bad_prompt']
    improved_prompt = row['good_prompt']

    # Create the JSONL entry in the required format
    entry = {
        "systemInstruction": {
            "role": "system",
            "parts": [{"text": "You are a prompt engineering expert that transforms simple prompts into more effective versions."}]
        },
        "contents": [
            {
                "role": "user",
                "parts": [{"text": f"{weak_prompt}"}]
            },
            {
                "role": "model",
                "parts": [{"text": f"{improved_prompt}"}]
            }
        ]
    }
    fine_tuning_data.append(entry)

# Save to JSONL file, one JSON object per line
jsonl_path = '../data/vertex_ai_fine_tuning.jsonl'
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for entry in fine_tuning_data:
        f.write(json.dumps(entry) + '\n')

print(f"Created JSONL file for Vertex AI fine-tuning at {jsonl_path}")
print(f"Total examples: {len(fine_tuning_data)}")

# Display a sample entry (skipped when the dataset is empty to avoid IndexError)
if fine_tuning_data:
    print("\nSample entry:")
    print(json.dumps(fine_tuning_data[0], indent=2))
else:
    print("\nNo examples available to display.")
