# How to improve a prompt by evaluating metrics iteratively
### Lucía Cordero Sánchez

## Step 1: Setup

In [None]:
!pip install neptune==1.10.4
!pip install torch==2.3.1
!pip install textstat
!pip install nltk==3.8.1
!pip install openai==1.41.0

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [6]:
import os

import neptune
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import textstat
import nltk
from nltk.tokenize import word_tokenize
import openai

# Check your dependencies' versions.
# A notebook cell only displays the value of its final expression, so each
# version is printed explicitly; otherwise only the last one would be shown.
for package in (neptune, torch, textstat, nltk, openai):
    print(f"{package.__name__}: {package.__version__}")

'1.41.0'

In [None]:
# Initialize Neptune with your credentials.
# Credentials are taken from the environment when available so that real
# tokens never have to be saved inside the notebook. If NEPTUNE_PROJECT,
# NEPTUNE_API_TOKEN or OPENAI_API_KEY is not set, replace the matching
# placeholder string below with your own value.
run = neptune.init_run(
    project=os.environ.get("NEPTUNE_PROJECT", "your_name"),
    api_token=os.environ.get("NEPTUNE_API_TOKEN", "your_token"),
)

# The notebook imports the `openai` module, not the `OpenAI` class, so the
# client class is referenced through the module to avoid a NameError.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "your_token"),
)

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/lucia.corsan/Prompt-Engineering-Tutorial1/e/PROMPT-16


## Step 2: Define metrics (qualitative)

In [None]:
# Load pre-trained model and tokenizer for perplexity calculation
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# word_tokenize (used by calculate_diversity) relies on the Punkt tokenizer
# data, which is not shipped with the nltk package itself. Download it once
# here so the metrics work on a fresh environment.
# NOTE: 'punkt' is the resource name for the pinned nltk==3.8.1.
nltk.download('punkt', quiet=True)

def calculate_diversity(text):
    """Return the ratio of unique tokens to total tokens in ``text``.

    The text is lower-cased and split with ``word_tokenize`` before counting.
    Returns 0 when the text yields no tokens.
    """
    words = word_tokenize(text.lower())
    if not words:
        return 0
    return len(set(words)) / len(words)

def calculate_fluency(text):
    """Return the Flesch reading-ease score of ``text`` as computed by textstat."""
    return textstat.flesch_reading_ease(text)

def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the module-level language model.

    The text is encoded with the module-level ``tokenizer`` and scored by
    ``model`` using the tokens themselves as labels; the perplexity is the
    exponential of the resulting loss.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a Python float, or ``nan`` when the text encodes to
        fewer than two tokens (there is then no next-token prediction to
        score).
    """
    # truncation=True caps the input at the tokenizer's maximum length, the
    # same safeguard get_embedding applies, so long texts do not overflow
    # the model.
    tokens_tensor = tokenizer.encode(text, return_tensors='pt', truncation=True)

    # Guard against empty / single-token input instead of failing inside the
    # model call.
    if tokens_tensor.shape[-1] < 2:
        return float('nan')

    with torch.no_grad():
        outputs = model(tokens_tensor, labels=tokens_tensor)
    return torch.exp(outputs.loss).item()

def generate_text(prompt, max_tokens):
    """Ask the chat model to answer ``prompt`` and return the reply text.

    Args:
        prompt: The user message sent to the model.
        max_tokens: Upper bound on the number of tokens in the completion.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        max_tokens=max_tokens,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def evaluate_prompt(prompt, max_tokens):
    """Generate text for ``prompt`` and score it.

    The generated text is printed, then scored for diversity, fluency and
    perplexity.

    Args:
        prompt: The prompt passed to ``generate_text``.
        max_tokens: The completion token limit passed to ``generate_text``.

    Returns:
        A dict with the keys ``'diversity'``, ``'fluency'`` and
        ``'perplexity'``.
    """
    completion_text = generate_text(prompt, max_tokens)
    print(f"Generated Text: {completion_text}")

    return {
        'diversity': calculate_diversity(completion_text),
        'fluency': calculate_fluency(completion_text),
        'perplexity': calculate_perplexity(completion_text),
    }

## Step 3: Initial testing and logging

In [None]:
# Define the range of max_tokens to test
token_ranges = range(15, 200, 15)

# Local record of every measurement (one dict per token budget) so the scores
# can be inspected in the notebook as well as in Neptune.
results = []

for max_tokens in token_ranges:
    metrics = evaluate_prompt("Write a story about hero's journey. ", max_tokens)
    results.append({"max_tokens": max_tokens, **metrics})

    # Log metrics to neptune.ai
    run["metrics/diversity"].append(metrics['diversity'])
    run["metrics/fluency"].append(metrics['fluency'])
    run["metrics/perplexity"].append(metrics['perplexity'])
    run["max_tokens"].append(max_tokens)

# Finalize the experiment
run.stop()


Generated Text: Once upon a time, in a small village nestled among the mountains, there
Generated Text: Once upon a time in the peaceful village of Oakwood, there lived a young farmer named Thomas. Thomas was known for his kindness, hard work,
Generated Text: Once upon a time, in a small village nestled between majestic mountains and lush forests, there lived a young man named Leo. Leo had always dreamt of embarking on a great adventure, much like the heroes he had read
Generated Text: Once upon a time in the quaint village of Willowbrook, there lived a young orphan named Marcus. Marcus had always dreamed of going on grand adventures and becoming a hero like the legendary warriors of old. One day, a mysterious visitor arrived in the village and revealed to Marcus that he was the chosen one
Generated Text: Once upon a time, in a small village nestled at the foot of a mighty mountain, there lived a young orphan named Finn. Despite his humble upbringing, Finn possessed a kind heart and an

## Step 4: Refine prompts based on results

In [None]:
# Refined prompt 1: asks for two specific scenes (the call to the mission and
# the hero's internal conflict).
new_prompt_1 = (
    "Write a story about hero's journey. "
    "Write a short scene where the hero receives an urgent call to a life-changing mission, "
    "write a short scene that reveals the hero's internal conflict "
    "and how he overcomes it during his journey."
)

# Refined prompt 2: asks only for the hero's world before the adventure starts.
new_prompt_2 = (
    "Write a story about hero's journey. "
    "Describe in a few lines the hero's world or environment "
    "before he begins his adventure."
)

## Step 5: Re-evaluate refined prompts and log new metrics


In [None]:
# The previous experiment cell ends with run.stop(), and a stopped run cannot
# receive new values. Start a fresh run here so this cell works under
# "Run All" without manually re-executing the credentials cell.
# Set NEPTUNE_PROJECT / NEPTUNE_API_TOKEN in your environment, or replace the
# placeholder strings with your own values.
run = neptune.init_run(
    project=os.environ.get("NEPTUNE_PROJECT", "your_name"),
    api_token=os.environ.get("NEPTUNE_API_TOKEN", "your_token"),
)

# Define the range of max_tokens to test
token_ranges = range(15, 200, 15)

# Local record of every measurement (one dict per token budget)
results = []

for max_tokens in token_ranges:
    metrics = evaluate_prompt(new_prompt_1, max_tokens)
    results.append({"max_tokens": max_tokens, **metrics})

    # Log metrics to neptune.ai
    run["metrics/diversity"].append(metrics['diversity'])
    run["metrics/fluency"].append(metrics['fluency'])
    run["metrics/perplexity"].append(metrics['perplexity'])
    run["max_tokens"].append(max_tokens)

# Finalize the experiment
run.stop()

Generated Text: Once upon a time, in the quaint village of Thornwood, there lived
Generated Text: Once upon a time in the mystical land of Eldoria, there lived a humble blacksmith named Finn. Though Finn spent his days crafting swords and shields
Generated Text: In the land of Ashenvale, there lived a young blacksmith named Aiden. His village had been plagued by a terrible curse that left the crops barren and the people ailing. One day, as Aiden hammered
Generated Text: Once upon a time, in the village of Elloria, there lived a humble blacksmith named Aiden. Aiden was known for his kindness and skill in crafting weapons and armor for the villagers. Little did he know, his simple life was about to change forever.

One evening, as Aiden
Generated Text: Once upon a time, in the quaint village of Oakwood, there lived a young farmer named Thomas. Thomas harbored dreams of adventure and heroism, but the mundane responsibilities of tending to his family’s farm kept him bound to the monotony o

In [None]:
# The previous experiment cell ends with run.stop(), and a stopped run cannot
# receive new values. Start a fresh run here so this cell works under
# "Run All" without manually re-executing the credentials cell.
# Set NEPTUNE_PROJECT / NEPTUNE_API_TOKEN in your environment, or replace the
# placeholder strings with your own values.
run = neptune.init_run(
    project=os.environ.get("NEPTUNE_PROJECT", "your_name"),
    api_token=os.environ.get("NEPTUNE_API_TOKEN", "your_token"),
)

# Define the range of max_tokens to test
token_ranges = range(15, 200, 15)

# Local record of every measurement (one dict per token budget)
results = []

for max_tokens in token_ranges:
    metrics = evaluate_prompt(new_prompt_2, max_tokens)
    results.append({"max_tokens": max_tokens, **metrics})

    # Log metrics to neptune.ai
    run["metrics/diversity"].append(metrics['diversity'])
    run["metrics/fluency"].append(metrics['fluency'])
    run["metrics/perplexity"].append(metrics['perplexity'])
    run["max_tokens"].append(max_tokens)

# Finalize the experiment
run.stop()

Generated Text: Once upon a time in the peaceful kingdom of Serenia, lived a young
Generated Text: In the peaceful kingdom of Veridian, there lived a humble blacksmith named Aiden. Surrounded by rolling meadows and towering mountains, Veridian
Generated Text: Once upon a time in the peaceful kingdom of Eldoria, there lived a young farmer named Marcus. He spent his days tending to his crop fields and helping his village prosper under the guidance of the wise King Elric.
Generated Text: Once upon a time, in the peaceful village of Evergreen Valley, lived a young farmer named Ethan. His world was one of routine and simplicity, waking up at dawn to tend to the crops, surrounded by the lush green fields and the gentle sounds of nature. Ethan was content with his life,
Generated Text: In the peaceful kingdom of Elysia, nestled between majestic mountains and shimmering lakes, lived a young blacksmith named Alex. He spent his days crafting swords and armor, surrounded by the laughter of childr

### **Hands-on Practice**: Implement a relevance metric in the evaluation function and run some experiments to test it.

In [None]:
def get_embedding(text, tokenizer, model):
    """
    Encode ``text`` and return one embedding vector per input sequence.

    The text is tokenized (with truncation and padding enabled), passed
    through ``model`` without gradient tracking, and the hidden state at
    sequence position 0 is returned as a NumPy array. For BERT-style encoders
    position 0 is the [CLS] token; ``model`` must return an object exposing
    ``last_hidden_state``.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)
    # Keep every batch row and every hidden feature, but only the first token.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.numpy()

def calculate_relevance(text, instructions, tokenizer, model):
    """
    Evaluate the relevance of the generated text with respect to the instructions.

    Both inputs are embedded with ``get_embedding`` and compared by cosine
    similarity.

    Args:
        text: The generated text to evaluate.
        instructions: The prompt / instructions the text should follow.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Model forwarded to ``get_embedding``.

    Returns:
        The mean pairwise cosine similarity between the text embeddings and
        the instruction embeddings, as a Python float.
    """
    # Get embeddings for both text and instructions (2-D: one row per sequence)
    text_embedding = torch.as_tensor(get_embedding(text, tokenizer, model))
    instructions_embedding = torch.as_tensor(get_embedding(instructions, tokenizer, model))

    # No cosine_similarity helper is imported in this notebook, so compute it
    # with torch: after L2-normalising each row, the matrix product of the two
    # sets of embeddings is the matrix of pairwise cosine similarities.
    text_unit = torch.nn.functional.normalize(text_embedding, dim=1)
    instructions_unit = torch.nn.functional.normalize(instructions_embedding, dim=1)
    similarity = text_unit @ instructions_unit.T

    # Return the average similarity as a measure of relevance
    return similarity.mean().item()