First, we talk about separating prompts with the widgets,
for context and such.


In [4]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display
from google.api_core import retry
import os
# Import environment variables from env.json
import json

# Load environment variables from env.json
# (the relative path is resolved against the notebook's current working directory)
with open('../../env.json', 'r') as f:
    env_vars = json.load(f)
# Set environment variables from the loaded file
os.environ["GOOGLE_CLOUD_PROJECT"] = env_vars["google_cloud_project"]
os.environ["GOOGLE_CLOUD_LOCATION"] = env_vars["google_cloud_location"]
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = env_vars["google_genai_use_vertexai"]
# Export the API key: the element at index 1 of env_vars["google_api_keys"]
os.environ["GOOGLE_API_KEY"] = env_vars["google_api_keys"][1]
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

# NOTE(review): this import rebinds the name `genai`, which the earlier
# `from google import genai` had already bound. From this line on, `genai`
# refers to google.generativeai -- confirm that the shadowing is intended.
import google.generativeai as genai

genai.configure(api_key=GOOGLE_API_KEY)

# Use a specific model
model = genai.GenerativeModel('models/gemini-2.5-pro-preview-03-25')  # base untuned model


In [None]:
def generate_response(instruction, context):
    """Ask the module-level ``model`` to rewrite ``instruction`` as an improved prompt.

    Args:
        instruction: The original instruction text to be improved.
        context: Optional background text. ``None``, float NaN (what pandas
            yields for an empty CSV cell) and empty or whitespace-only strings
            are all replaced by the placeholder "No context provided".

    Returns:
        The ``text`` attribute of the object returned by
        ``model.generate_content``.
    """
    # NaN is the only float that compares unequal to itself, so `context != context`
    # detects it without needing an extra import.
    context_missing = (
        context is None
        or (isinstance(context, float) and context != context)
        or (isinstance(context, str) and context.strip() == "")
    )
    if context_missing:
        context = "No context provided"
    prompt = f'''
    You are a prompt engineering expert.

    Your task is to rewrite the instruction below using advanced prompt engineering techniques. If context is provided, use it as *background knowledge* to better understand the task — but do not include it in the final output.

    Guidelines:
    - Enhance the instruction to be clearer, more specific, and more effective
    - Use any prompting technique that best fits
    - Ground your rewrite in the provided context, if applicable
    - Do NOT copy or reference the context in your rewritten instruction

    Context:
    {context}

    Original Instruction:
    {instruction}

    Output ONLY the improved instruction without any additional text, titling, explanations, or acknowledgment.
    '''

    response = model.generate_content(prompt)
    return response.text


In [6]:
import pandas as pd
import os
from tqdm.notebook import tqdm

# Input CSV and output CSV for the validation split
validation_data_path = "../../data/Prompt_Training_2.0/seperated_validation_data.csv"
output_path = "../../data/Prompt_Training_2.0/labeled_validation_data.csv"

# Input CSV and output CSV for the test split
testing_data_path = "../../data/Prompt_Training_2.0/seperated_test_data.csv"
output_path_2 = "../../data/Prompt_Training_2.0/labeled_test_data.csv"


def add_improved_prompts(frame, desc="Generating improved prompts"):
    """Fill an 'improved_prompt' column by calling generate_response on every row.

    Args:
        frame: DataFrame with 'instruction' and 'context' columns. It is
            modified in place.
        desc: Label shown on the tqdm progress bar.

    Returns:
        The same DataFrame, for convenience.

    A row whose call raises an Exception is reported with print() and gets the
    text "Error: <message>" in 'improved_prompt', so one failing row does not
    stop the rest of the run.
    """
    frame['improved_prompt'] = None
    for idx, row in tqdm(frame.iterrows(), total=len(frame), desc=desc):
        try:
            frame.at[idx, 'improved_prompt'] = generate_response(row['instruction'], row['context'])
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            frame.at[idx, 'improved_prompt'] = f"Error: {str(e)}"
    return frame


# Check both inputs up front so a missing test file is reported the same way
# as a missing validation file, before any model calls are made.
missing_paths = [
    path for path in (validation_data_path, testing_data_path)
    if not os.path.exists(path)
]
if missing_paths:
    for path in missing_paths:
        print(f"Error: File {path} not found.")
else:
    # Load the data
    df = pd.read_csv(validation_data_path)
    test_df = pd.read_csv(testing_data_path)
    print(f"Loaded {len(df)} rows from validation data.")
    print(f"Loaded {len(test_df)} rows from test data.")

    # Label and save the validation split first: if the run is interrupted
    # while the test split is being processed, the finished work is kept.
    add_improved_prompts(df)
    df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")

    add_improved_prompts(test_df)
    test_df.to_csv(output_path_2, index=False)
    print(f"Saved results to {output_path_2}")

    


KeyboardInterrupt: 

TODO: Add post-processing steps.


In [None]:
import pandas as pd
import json
import os

# Load the final labeled datasets (validation and train splits).
# NOTE(review): these paths start with '../', while the labeling cell earlier in
# this notebook reads from '../../data/...' -- confirm the intended working directory.
validation_df = pd.read_csv('../data/Prompt_Training_2.0/labeled_validation_final.csv')
train_df = pd.read_csv('../data/Prompt_Training_2.0/labeled_train_final.csv')

# Build the prompt template.
# It has two str.format placeholders, {context} and {instruction}.
prompt_template = """You are a prompt engineering expert.

Your task is to rewrite the instruction below using advanced prompt engineering techniques. If context is provided, use it as *background knowledge* to better understand the task — but do not include it in the final output.

Guidelines:
- Enhance the instruction to be clearer, more specific, and more effective
- Use any prompting technique that best fits
- Ground your rewrite in the provided context, if applicable
- Do NOT copy or reference the context in your rewritten instruction

Context:
{context}

Original Instruction:
{instruction}

Output ONLY the improved instruction without any additional text, titling, explanations, or acknowledgment."""

# Function to create JSONL entries
def create_jsonl_entries(df, include_system_prompt=False):
    """Turn each row of ``df`` into one user/model record for a JSONL file.

    Args:
        df: DataFrame with 'context', 'instruction' and
            'improved_instruction' columns. Missing values in any of the
            three are replaced by an empty string.
        include_system_prompt: When true, every record also carries a
            'systemInstruction' key placed before 'contents'.

    Returns:
        A list of dicts, one per row, in row order. The user turn is the
        module-level ``prompt_template`` filled with the row's context and
        instruction; the model turn is the row's improved instruction.
    """
    def _blank_if_missing(value):
        # Missing cells become "" so they can be formatted and serialized.
        return value if pd.notna(value) else ""

    records = []
    for _, record in df.iterrows():
        context_text = _blank_if_missing(record['context'])
        instruction_text = _blank_if_missing(record['instruction'])
        improved_text = _blank_if_missing(record['improved_instruction'])

        user_text = prompt_template.format(context=context_text, instruction=instruction_text)

        # The conversation is the same whether or not a system prompt is attached.
        turns = [
            {"role": "user", "parts": [{"text": user_text}]},
            {"role": "model", "parts": [{"text": improved_text}]},
        ]

        if not include_system_prompt:
            records.append({"contents": turns})
            continue

        # 'systemInstruction' is inserted first so it is serialized ahead of 'contents'.
        records.append({
            "systemInstruction": {
                "role": "system",
                "parts": [{"text": "You are a prompt engineering expert that transforms prompts into more effective versions."}],
            },
            "contents": turns,
        })

    return records

# Build the records for both splits (no system prompt attached)
train_entries = create_jsonl_entries(train_df, include_system_prompt=False)
validation_entries = create_jsonl_entries(validation_df, include_system_prompt=False)

# Destination files
train_jsonl_path = '../data/Prompt_Training_2.0/vertex_ai_fine_tuning_train.jsonl'
validation_jsonl_path = '../data/Prompt_Training_2.0/vertex_ai_fine_tuning_validation.jsonl'

# Make sure the output folder is there before writing
os.makedirs(os.path.dirname(train_jsonl_path), exist_ok=True)

# Serialize each split, one JSON object per line (training first, then validation)
for jsonl_path, split_entries in (
    (train_jsonl_path, train_entries),
    (validation_jsonl_path, validation_entries),
):
    with open(jsonl_path, 'w') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in split_entries)

# Report what was written
print(
    f"Created training JSONL file at {train_jsonl_path}",
    f"Total training examples: {len(train_entries)}",
    sep="\n",
)
print(
    f"Created validation JSONL file at {validation_jsonl_path}",
    f"Total validation examples: {len(validation_entries)}",
    sep="\n",
)

# Show the first training record, pretty-printed
print("\nSample entry:")
print(json.dumps(train_entries[0], indent=2))

In [None]:
# Read the settings file
with open('../env.json', 'r') as f:
    env_vars = json.load(f)

# Copy each setting into the process environment
# (environment variable name, key in env.json); the last pair is the fine-tuned model ID
for env_name, json_key in (
    ("GOOGLE_CLOUD_PROJECT", "google_cloud_project"),
    ("GOOGLE_CLOUD_LOCATION", "google_cloud_location"),
    ("GOOGLE_GENAI_USE_VERTEXAI", "google_genai_use_vertexai"),
    ("FINE_TUNED_MODEL_ID", "fine_tuned_v2_model_id"),
):
    os.environ[env_name] = env_vars[json_key]


import os
from google import genai
from google.genai.types import HttpOptions

# Create the GenAI client, pinned to API version v1
client = genai.Client(http_options=HttpOptions(api_version="v1"))

# Example inputs for a quick check of the fine-tuned model
instruction = "Write a function to calculate the Fibonacci sequence in Python"
context = "The Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, usually starting with 0 and 1."

# Fill the shared template with the example inputs
test_prompt = prompt_template.format(instruction=instruction, context=context)

# Send the prompt to the model named by the FINE_TUNED_MODEL_ID environment variable
fine_tuned_model_id = os.environ["FINE_TUNED_MODEL_ID"]
response = client.models.generate_content(model=fine_tuned_model_id, contents=test_prompt)

# Show the input and the model's rewrite
print("Original prompt:", test_prompt, sep="\n")
print("\nImproved prompt from fine-tuned model:")
print(response.text)