In [38]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display
from google.api_core import retry
import os
# Import environment variables from env.json
import json

# Load environment variables from env.json
with open('../env.json', 'r') as f:
    env_vars = json.load(f)
# Set environment variables from the loaded file
os.environ["GOOGLE_CLOUD_PROJECT"] = env_vars["google_cloud_project"]
os.environ["GOOGLE_CLOUD_LOCATION"] = env_vars["google_cloud_location"]
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = env_vars["google_genai_use_vertexai"]
# Set the fine-tuned model ID as an environment variable
os.environ["FINE_TUNED_MODEL_ID"] = env_vars["fine_tuned_model_id"]
os.environ["GOOGLE_API_KEY"] = env_vars["google_api_keys"][3]
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]


is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

genai.models.Models.generate_content = retry.Retry(
    predicate=is_retriable)(genai.models.Models.generate_content)

client = genai.Client(api_key=GOOGLE_API_KEY)

In [39]:
import google.generativeai as genai

genai.configure(api_key=GOOGLE_API_KEY)

# List available models
print("Available models:")
for m in genai.list_models():
    print(m.name)

# Use a specific model
model = genai.GenerativeModel('gemini-2.0-flash-lite-001')  # base untuned model

# Generate content
response = model.generate_content("How can I be a better team leader?")
print(response.text)


Available models:
models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-fla

In [28]:
# The genai module doesn't have a Client attribute, so we'll use the GenerativeModel directly
# No need to create a client instance

def generate_response(prompt):
    inputs = f'''You are a prompt engineering expert that transforms simple 
    prompts into more effective versions. Analyze the input prompt and create
      an improved version that includes specific details, context, desired output format, 
      and any relevant constraints. Make the prompt clear, specific, and designed to 
      generate high-quality responses.
      
      Input Prompt: {prompt}

      Respond ONLY with the text of the improved prompt, without any explanations, 
      introductions, or additional commentary.
      '''

    response = model.generate_content(contents=inputs)
    return response.text

print(generate_response('Explain AI to me like im a little kid'))

Imagine you're talking to a 5-year-old. Explain Artificial Intelligence (AI) in simple terms. Use analogies a child would understand, like toys, games, or animals. Focus on what AI can *do*, not how it works internally. Explain how AI is different from a regular toy. Keep the explanation concise (under 150 words) and make it fun. The output should be a short paragraph suitable for a child.



In [29]:
import pandas as pd

# Load the ShareGPT dataset from CSV
try:
    # Define the path to the CSV file
    csv_path = "../data/ShareGPT/testing_prompts_cleaned.csv"
    
    # Load the CSV file into a pandas DataFrame
    df_sharegpt = pd.read_csv(csv_path)
    
    # Display basic information about the dataset
    print(f"Dataset loaded successfully with {len(df_sharegpt)} rows")
    print("\nDataset columns:")
    for col in df_sharegpt.columns:
        print(f"- {col}")
    
    # Display the first few rows of the dataset
    print("\nFirst 5 rows of the dataset:")
    display(df_sharegpt.head())

    df_sharegpt.drop(columns=['context', 'prompt'], inplace=True)
    
except FileNotFoundError:
    print(f"Error: The file at '../data/ShareGPT/separated_prompts_clean.csv' was not found.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {str(e)}")
    


Dataset loaded successfully with 1000 rows

Dataset columns:
- original_id
- original_prompt
- context
- prompt

First 5 rows of the dataset:


Unnamed: 0,original_id,original_prompt,context,prompt
0,140731,One-pot vegetarian pasta recipes for busy nights,,One-pot vegetarian pasta recipes for busy nights
1,121053,We have the following blog content... what is ...,,We have the following blog content... what is ...
2,37805,how o sort element using merge sort technique ...,,how o sort element using merge sort technique ...
3,116016,"make a javascript class ""GraphicLayer"" which i...",,"make a javascript class ""GraphicLayer"" which i..."
4,132819,!Please outline the steps to build an automate...,,Please outline the steps to build an automated...


In [30]:
# Apply the generate_response function to each original_prompt in the dataframe
# This will create a new column 'base_response' with the improved prompts
print("Generating improved prompts for each original prompt...")

# Import tqdm for progress bar
from tqdm import tqdm

# Define a function to safely apply generate_response
def safe_generate_response(prompt):
    try:
        return generate_response(prompt)
    except Exception as e:
        print(f"Error processing prompt: {str(e)[:100]}...")
        return "Error generating response"

# Create directory if it doesn't exist
import os
os.makedirs("../data/BaseModelResponses", exist_ok=True)

# Initialize counter for saving
counter = [0]
total_rows = len(df_sharegpt)

# Define a function to process each row and save every 100 rows
def process_and_save(prompt):
    response = safe_generate_response(prompt)
    return response

# Apply the function to each row in the dataframe with tqdm progress bar
tqdm.pandas(desc="Processing prompts")
df_sharegpt['base_response'] = df_sharegpt['original_prompt'].progress_apply(process_and_save)

# Save the final complete dataset
df_sharegpt.to_csv("../data/BaseModelResponses/improved_prompts.csv", index=False)
print(f"Completed generating {total_rows} improved prompts")
print(f"Final results saved to ../data/BaseModelResponses/improved_prompts.csv")

print("\nSample of results:")
display(df_sharegpt[['original_prompt', 'base_response']].head(3))


Generating improved prompts for each original prompt...


Processing prompts: 100%|██████████| 1000/1000 [35:50<00:00,  2.15s/it] 

Completed generating 1000 improved prompts
Final results saved to ../data/BaseModelResponses/improved_prompts.csv

Sample of results:





Unnamed: 0,original_prompt,base_response
0,One-pot vegetarian pasta recipes for busy nights,Develop five unique and easy one-pot vegetaria...
1,We have the following blog content... what is ...,Analyze the provided blog content about choosi...
2,how o sort element using merge sort technique ...,Create a Java program that implements the Merg...


In [None]:
import os
from google import genai
from google.genai.types import HttpOptions



# Access the model ID from environment variables
fine_tuned_model_id = os.environ["FINE_TUNED_MODEL_ID"]


# Initialize the GenAI client for Vertex AI
client = genai.Client(http_options=HttpOptions(api_version="v1"))

# Define your prompt
prompt = "How can I be a better team leader?"

# Call your fine-tuned model
response = client.models.generate_content(
    model=fine_tuned_model_id,
    contents=prompt,
)

# Print the response
print(response.text)


In [3]:
#works with no context, but you can do with context too
def generate_response(prompt):
    inputs = f'''You are a prompt engineering expert that transforms simple 
    prompts into more effective versions. Analyze the input prompt and create
      an improved version that includes specific details, context, desired output format, 
      and any relevant constraints. Make the prompt clear, specific, and designed to 
      generate high-quality responses.
      
      Input Prompt: {prompt}

      Respond ONLY with the text of the improved prompt, without any explanations, 
      introductions, or additional commentary.
      '''

    return client.models.generate_content(
        model=fine_tuned_model_id,
        contents=inputs).text

In [None]:
import pandas as pd

# Load the ShareGPT dataset from CSV
try:
    # Define the path to the CSV file
    csv_path = "../data/ShareGPT/separated_prompts_clean.csv"
    
    # Load the CSV file into a pandas DataFrame
    df_sharegpt = pd.read_csv(csv_path)
    
    # Display basic information about the dataset
    print(f"Dataset loaded successfully with {len(df_sharegpt)} rows")
    print("\nDataset columns:")
    for col in df_sharegpt.columns:
        print(f"- {col}")
    
    # Display the first few rows of the dataset
    print("\nFirst 5 rows of the dataset:")
    display(df_sharegpt.head())

    df_sharegpt.drop(columns=['context', 'prompt'], inplace=True)
    
except FileNotFoundError:
    print(f"Error: The file at '../data/ShareGPT/separated_prompts_clean.csv' was not found.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {str(e)}")
    

In [None]:
# Apply the generate_response function to each original_prompt in the dataframe
# This will create a new column 'base_response' with the improved prompts
print("Generating improved prompts for each original prompt...")

# Import tqdm for progress bar
from tqdm import tqdm

# Define a function to safely apply generate_response
def safe_generate_response(prompt):
    try:
        return generate_response(prompt)
    except Exception as e:
        print(f"Error processing prompt: {str(e)[:100]}...")
        return "Error generating response"

# Create directory if it doesn't exist
import os
os.makedirs("../data/FineTunedResponses", exist_ok=True)

# Initialize counter for saving
counter = [0]
total_rows = len(df_sharegpt)

# Define a function to process each row and save every 100 rows
def process_and_save(prompt):
    response = safe_generate_response(prompt)
    counter[0] += 1
    
    # Save every 100 rows
    if counter[0] % 100 == 0:
        batch_num = counter[0] // 100
        print(f"Processed {counter[0]} rows. Saving batch {batch_num}...")
        df_sharegpt.to_csv(f"../data/FineTunedResponses/responses_batch_{batch_num}.csv", index=False)
    
    return response

# Apply the function to each row in the dataframe with tqdm progress bar
tqdm.pandas(desc="Processing prompts")
df_sharegpt['fine_tuned_response'] = df_sharegpt['original_prompt'].progress_apply(process_and_save)

# Save the final complete dataset
df_sharegpt.to_csv("../data/FineTunedResponses/all_responses.csv", index=False)
print(f"Completed generating {total_rows} improved prompts")
print(f"Final results saved to ../data/FineTunedResponses/all_responses.csv")

print("\nSample of results:")
display(df_sharegpt[['original_prompt', 'fine_tuned_response']].head(3))


In [None]:
print(df_sharegpt['original_prompt'].iloc[0])

print(generate_response(df_sharegpt['original_prompt'].iloc[0]))

In [None]:
# Display random examples from the dataset
import random

# Set a random seed for reproducibility
random.seed(42)

# Select 5 random indices from the dataframe
random_indices = random.sample(range(len(df_sharegpt)), 5)

print("\nRandom examples of prompts and their fine-tuned responses:")
for idx in random_indices:
    print(f"\n--- Example {idx} ---")
    print(f"Original Prompt:\n{df_sharegpt['original_prompt'].iloc[idx]}")
    print(f"\nFine-tuned Response:\n{df_sharegpt['fine_tuned_response'].iloc[idx]}")
    print("-" * 80)

# Alternative display using pandas
print("\nRandom examples as a dataframe:")
display(df_sharegpt.loc[random_indices, ['original_prompt', 'fine_tuned_response']])


In [None]:
# Load the fine-tuned responses
print("Loading fine-tuned responses...")
df_fine_tuned = pd.read_csv("../data/FineTunedResponses/all_responses.csv")

# Load the Flash 2.0 responses
print("Loading Flash 2.0 responses...")
df_flash = pd.read_csv("../data/Flash2.0Responses/all_responses.csv")

# Join the datasets on original_id
print("Joining datasets on original_id...")
df_combined = pd.merge(
    df_fine_tuned, 
    df_flash,
    on="original_id",
    suffixes=("_fine_tuned", "_flash")
)

# Display information about the combined dataset
print(f"Combined dataset shape: {df_combined.shape}")
print("\nSample of combined results:")
display(df_combined[['original_id', 'original_prompt_fine_tuned', 'fine_tuned_response', 'base_response']].head(5))

# Save the combined dataset
df_combined.to_csv("../data/combined_responses.csv", index=False)
print("Combined dataset saved to ../data/combined_responses.csv")


In [None]:
# Display some readable examples from the combined dataset
print("\nReadable examples from the combined dataset:")
print("=" * 100)

# Select different examples to display in a more readable format
# Choose examples with interesting contrasts between fine-tuned and base responses
sample_indices = [116016, 132819, 654]  # Different examples from the dataset

for i, idx in enumerate(sample_indices):
    row_idx = df_combined.index[df_combined['original_id'] == idx][0] if idx in df_combined['original_id'].values else i
    print(f"\n\n--- Example {i+1} ---")
    print(f"Original Prompt:\n{df_combined['original_prompt_fine_tuned'].iloc[row_idx]}")
    print("\n" + "-" * 50)
    print(f"Fine-tuned Model Response:\n{df_combined['fine_tuned_response'].iloc[row_idx]}")
    print("\n" + "-" * 50)
    print(f"Base Model Response:\n{df_combined['base_response'].iloc[row_idx]}")
    print("\n" + "=" * 100)

# Get 5 random examples for additional variety
random_indices = random.sample(range(len(df_combined)), 5)
print("\n\nAdditional random examples:")
print("=" * 100)

for i, idx in enumerate(random_indices):
    print(f"\n\n--- Random Example {i+1} ---")
    print(f"Original Prompt:\n{df_combined['original_prompt_fine_tuned'].iloc[idx]}")
    print("\n" + "-" * 50)
    print(f"Fine-tuned Model Response:\n{df_combined['fine_tuned_response'].iloc[idx]}")
    print("\n" + "-" * 50)
    print(f"Base Model Response:\n{df_combined['base_response'].iloc[idx]}")
    print("\n" + "=" * 100)


In [None]:
model = genai.GenerativeModel('gemini-2.0-flash')
def get_response(prompt):
    response = model.generate_content(prompt)
    return response.text


# Rename fine_tuned_response to fine_tuned_prompt in the all_prompts.csv file
import pandas as pd
import time
from tqdm import tqdm

# Load the prompts from the CSV file
prompts_df = pd.read_csv("../data/FineTunedResponses/all_prompts.csv")

# Rename the column
prompts_df = prompts_df.rename(columns={'fine_tuned_response': 'fine_tuned_prompt'})

# Create a new column to store the responses
prompts_df['response_to_fine_tuned'] = None

# Process each prompt and get a response
print("\nProcessing fine-tuned prompts:")
for i in tqdm(range(len(prompts_df))):
    try:
        # Get the response for the fine-tuned prompt
        response = get_response(prompts_df.loc[i, 'fine_tuned_prompt'])
        prompts_df.loc[i, 'response_to_fine_tuned'] = response
        
        # Add a small delay to avoid rate limiting
        time.sleep(0.5)
    except Exception as e:
        print(f"Error processing prompt {i}: {e}")
        prompts_df.loc[i, 'response_to_fine_tuned'] = f"Error: {str(e)}"
        
        # Wait a bit longer if there's an error
        time.sleep(2)

# Save the results to a new CSV file
prompts_df.to_csv("../data/FineTunedResponses/responses_to_fine_tuned.csv", index=False)
print("\nResponses saved to ../data/FineTunedResponses/responses_to_fine_tuned.csv")

# Display a few examples of the responses
print("\nSample responses to fine-tuned prompts:")
print(prompts_df[['fine_tuned_prompt', 'response_to_fine_tuned']].head(3))





In [41]:

# Create the model
model = genai.GenerativeModel('gemini-2.0-flash')  # base untuned model

def get_response(prompt):
    response = model.generate_content(prompt)
    return response.text


# Process the prompts from Flash2.0Responses
import pandas as pd
import time
from tqdm import tqdm

# Load the prompts from the CSV file
prompts_df = pd.read_csv("../data/BaseModelResponses/improved_prompts.csv")

# Rename the columns to match our expected format
prompts_df = prompts_df.rename(columns={'base_response': 'base_prompt'})

# Create a new column to store the responses
prompts_df['response'] = None

# Process each prompt and get a response
print("\nProcessing BaseModelResponses prompts:")
for i in tqdm(range(len(prompts_df))):
    try:
        # Get the response for the base prompt
        response = get_response(prompts_df.loc[i, 'original_prompt'])
        prompts_df.loc[i, 'response'] = response
        
        # Add a small delay to avoid rate limiting
        time.sleep(0.1)
    except Exception as e:
        print(f"Error processing prompt {i}: {e}")
        prompts_df.loc[i, 'response'] = f"Error: {str(e)}"
        
        # Wait a bit longer if there's an error
        time.sleep(2)

# Save the results to a new CSV file
prompts_df.to_csv("../data/BaseModelResponses/responses.csv", index=False)
print("\nResponses saved to ../data/BaseModelResponses/responses.csv")

# Display a few examples of the responses
print("\nSample responses to Flash2.0 prompts:")
print(prompts_df[['original_prompt', 'response']].head(3))






Processing BaseModelResponses prompts:


 82%|████████▏ | 823/1000 [1:32:55<06:39,  2.26s/it]  

Error processing prompt 823: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 58
}
]


 82%|████████▏ | 824/1000 [1:32:57<06:30,  2.22s/it]

Error processing prompt 824: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 56
}
]


100%|██████████| 1000/1000 [1:53:15<00:00,  6.80s/it]



Responses saved to ../data/BaseModelResponses/responses.csv

Sample responses to Flash2.0 prompts:
                                     original_prompt  \
0   One-pot vegetarian pasta recipes for busy nights   
1  We have the following blog content... what is ...   
2  how o sort element using merge sort technique ...   

                                            response  
0  Okay, here are some delicious and easy one-pot...  
1  Okay, based on the content provided, here's th...  
2  ```java\npublic class MergeSort {\n\n    // Ma...  


In [None]:
model = genai.GenerativeModel('gemini-2.0-flash')
def get_response(prompt):
    response = model.generate_content(prompt)
    return response.text


# Process the prompts from ShareGPT
import pandas as pd
import time
from tqdm import tqdm

# Load the prompts from the CSV file
prompts_df = pd.read_csv("../data/ShareGPT/separated_prompts_clean.csv")

# Create a new column to store the responses
prompts_df['og_response'] = None

# Process each prompt and get a response
print("\nProcessing ShareGPT prompts:")
for i in tqdm(range(len(prompts_df))):
    try:
        # Get the response for the prompt
        response = get_response(prompts_df.loc[i, 'original_prompt'])
        prompts_df.loc[i, 'og_response'] = response
        
        # Add a small delay to avoid rate limiting
        time.sleep(0.5)
    except Exception as e:
        print(f"Error processing prompt {i}: {e}")
        prompts_df.loc[i, 'og_response'] = f"Error: {str(e)}"
        
        # Wait a bit longer if there's an error
        time.sleep(2)

# Save the results to a new CSV file
prompts_df.to_csv("../data/ShareGPT/og_responses.csv", index=False)
print("\nResponses saved to ../data/ShareGPT/og_responses.csv")

# Display a few examples of the responses
print("\nSample responses to ShareGPT prompts:")
print(prompts_df[['original_prompt', 'og_response']].head(3))