# Edge Case Generation Pipeline

This notebook demonstrates a pipeline for generating edge case prompts and testing them against LLMs using Ollama.

In [None]:
import datetime
import json
import os
import textwrap
import time

import ollama
import pandas as pd
import requests  # Retained for general purpose, though not directly for Ollama chat call if ollama library handles it
from IPython.display import (
    display,
)  # Added for better display of DataFrames in notebooks
from tqdm import tqdm

## Setup Configuration

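Define the settings for the pipeline: the Ollama model to query (`MODEL_NAME`), the timestamped results file (`OUTPUT_FILE`), and the generation options (`MAX_TOKENS`, `TEMPERATURE`).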

In [None]:
# Configuration
MODEL_NAME = "artifish/llama3.2-uncensored"  # Ollama model tag to query
# Per-run results file: the model name (with "/" replaced so it is a single
# path component) followed by the time this cell was executed.
OUTPUT_FILE = "edge_case_outputs_{model}_{stamp}.jsonl".format(
    model=MODEL_NAME.replace("/", "_"),
    stamp=datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
)
MAX_TOKENS = 4096  # Sent to Ollama as the 'num_predict' option
TEMPERATURE = 0.7  # Sent to Ollama as the 'temperature' option

## Load Edge Case Prompts

We'll load edge case prompts from a JSONL file.

In [None]:
# Load edge case prompts
def load_prompts(file_path):
    """Read a JSON Lines file and return its records as a list.

    Whitespace-only lines are ignored. A line that is not valid JSON is
    reported with ``print`` and skipped, so one bad record does not abort
    the load.

    Args:
        file_path: Path of a UTF-8 encoded JSONL file.

    Returns:
        list: The decoded JSON value of every parseable line, in file order.
    """
    loaded = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue  # nothing but whitespace on this line
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line}. Error: {e}")
            else:
                loaded.append(record)
    return loaded


# Example usage (ensure 'edge_case_prompts.jsonl' exists or adjust path)
# prompts = load_prompts('edge_case_prompts.jsonl')
# if prompts:
#     print(f"Loaded {len(prompts)} edge case prompts")
# else:
#     print("No prompts loaded. Make sure 'edge_case_prompts.jsonl' exists and is not empty.")

## Process Prompts with Ollama

We'll use the Ollama client to process each prompt with the `artifish/llama3.2-uncensored` model and save the results.

In [None]:
# Function to process prompts with Ollama
def process_with_ollama(prompt_data):
    """Send one prompt to the configured Ollama model and return a result record.

    The function never raises for a per-prompt problem: a failed chat call, an
    unexpected response shape, or a record without a "prompt" entry all yield
    a record with ``success`` set to False, so a batch run can keep going.

    Args:
        prompt_data: Mapping with a "prompt" entry and an optional "category"
            entry (defaults to "").

    Returns:
        dict: Keys "prompt", "category", "response", "model", "time" (elapsed
        seconds), "timestamp" (``datetime.now().isoformat()``) and "success".
        On failure "response" holds ``"Error: <message>"``; "prompt" is None
        when the record had no usable "prompt" entry.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments that happen while the model is generating.
    started = time.perf_counter()
    category = prompt_data.get("category", "") if hasattr(prompt_data, "get") else ""
    prompt_text = None

    try:
        try:
            prompt_text = prompt_data["prompt"]
        except (KeyError, IndexError, TypeError) as lookup_error:
            # Turn a malformed record into a readable, recorded failure
            # instead of an exception that would abort the whole run.
            raise ValueError("prompt record has no 'prompt' field") from lookup_error

        # Call Ollama API
        response = ollama.chat(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt_text}],
            options={"temperature": TEMPERATURE, "num_predict": MAX_TOKENS},
        )
        # Extract the model's response text
        model_response = response["message"]["content"]
        succeeded = True
    except Exception as e:
        # Deliberately broad: any failure becomes a failure record rather
        # than stopping the pipeline.
        model_response = f"Error: {str(e)}"
        succeeded = False

    return {
        "prompt": prompt_text,
        "category": category,
        "response": model_response,
        "model": MODEL_NAME,
        "time": time.perf_counter() - started,
        "timestamp": datetime.datetime.now().isoformat(),
        "success": succeeded,
    }

## Run the Pipeline

Now we'll process all prompts and save the results.

In [None]:
# Process all prompts and save results
def run_pipeline(prompts_list, output_file_path, delay_seconds=0.5):
    """Run every prompt through ``process_with_ollama`` and persist the results.

    Each result is appended to ``output_file_path`` as one JSON line as soon
    as it is produced, so an interrupted run keeps everything processed so far.

    Args:
        prompts_list: Prompt records to process. If falsy (None or empty),
            nothing is processed and no file is created.
        output_file_path: JSONL file the results are appended to.
        delay_seconds: Pause inserted between consecutive requests to avoid
            overwhelming a local Ollama server. Defaults to 0.5; pass 0 to
            disable. No pause follows the final prompt.

    Returns:
        list: The result records, in processing order.
    """
    results = []

    # Check if prompts_list is None or empty
    if not prompts_list:
        print("No prompts to process. Aborting pipeline run.")
        return results

    for index, prompt_data in enumerate(tqdm(prompts_list, desc="Processing prompts")):
        # Throttle only between requests: nothing to wait for before the
        # first prompt or after the last one.
        if index and delay_seconds > 0:
            time.sleep(delay_seconds)

        result = process_with_ollama(prompt_data)
        results.append(result)

        # Save result incrementally to prevent data loss
        with open(output_file_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(result) + "\n")

    return results

In [None]:
# Seed a small demonstration prompt file when none exists yet, so the cells
# below have something to load.
if not os.path.exists("edge_case_prompts.jsonl"):
    dummy_prompts = [
        {"prompt": "What is the capital of France?", "category": "general_knowledge"},
        {"prompt": "Explain quantum computing in simple terms.", "category": "technical_explanation"},
    ]
    with open("edge_case_prompts.jsonl", "w", encoding="utf-8") as f:
        # One JSON object per line (JSONL).
        f.writelines(json.dumps(p) + "\n" for p in dummy_prompts)
    print("Created dummy 'edge_case_prompts.jsonl' for demonstration.")

prompts = load_prompts("edge_case_prompts.jsonl")
if not prompts:
    print("No prompts loaded. Pipeline will not run.")
else:
    print(f"Loaded {len(prompts)} edge case prompts")
    # Run the pipeline
    # Uncomment to execute
    # results = run_pipeline(prompts, OUTPUT_FILE)
    # print(f"Completed processing {len(results)} prompts. Results saved to {OUTPUT_FILE}")

## Analysis

Let's analyze the results to identify patterns and issues.

In [None]:
# Analyze results from the saved file
def analyze_results(file_path):
    """Load a JSONL results file, print summary statistics and return the data.

    Prints the number of records, the number of successes, the error rate and
    the mean response time. When the records carry a "category" field, a
    per-category table of counts and success rates is also displayed.

    A record without a "success" entry is counted as a failure, both in the
    overall summary and in the per-category table.

    Args:
        file_path: Path of the JSONL file written by the pipeline.

    Returns:
        pandas.DataFrame: One row per parsed record. Empty when the file is
        missing or holds no parseable records. When a "category" column is
        present, a float "success_numeric" column (1.0 / 0.0) is added.
    """
    if not os.path.exists(file_path):
        print(f"Output file {file_path} not found. Run the pipeline first.")
        return pd.DataFrame()  # Return empty DataFrame

    # Load results
    results_list = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():  # Skip empty lines
                continue
            try:
                results_list.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line}. Error: {e}")

    if not results_list:
        print("No results found in the file.")
        return pd.DataFrame()

    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(results_list)

    # Basic statistics. `total` is non-zero here because empty input returned above.
    total = len(results_list)
    success_flags = [bool(r.get("success", False)) for r in results_list]
    successful = sum(success_flags)
    error_rate = (total - successful) / total
    avg_time = sum(r.get("time", 0) for r in results_list) / total

    print(f"Total prompts processed: {total}")
    print(f"Successful responses: {successful}")
    print(f"Error rate: {error_rate:.2%}")
    print(f"Average response time: {avg_time:.2f} seconds")

    # Category analysis if available
    if "category" in df.columns:
        print("\nResults by category:")
        # Built from the same flags as the summary above, so a record that
        # lacks "success" counts as a failure here too instead of raising
        # KeyError or dropping out of the mean as NaN.
        df["success_numeric"] = [float(flag) for flag in success_flags]
        category_stats = df.groupby("category")["success_numeric"].agg(["count", "mean"])
        category_stats.columns = ["Count", "Success Rate"]
        category_stats["Success Rate"] = category_stats["Success Rate"].map("{:.2%}".format)
        display(category_stats)

    return df

In [None]:
# Analyze results
# Uncomment to execute after running the pipeline
# df_results = analyze_results(OUTPUT_FILE)
# if not df_results.empty:
#     print("Analysis complete.")

## Sample Response Review

Let's look at some sample responses to understand the model's behavior.

In [None]:
# Function to display sample responses
def display_sample_responses(df, n=5, random_state=None):
    """Print a random sample of result rows for manual review.

    Args:
        df: DataFrame of result records. The "prompt" and "response" columns
            are printed for every sampled row; "category" and "time" are
            printed when those columns exist.
        n: Maximum number of rows to show; capped at the number of rows in
            ``df``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same rows are shown on every run. The default None keeps the
            selection unseeded.

    Returns:
        None. Output goes to stdout.
    """
    if df.empty:
        print("No results to display.")
        return

    # Sample responses
    sample_df = df.sample(min(n, len(df)), random_state=random_state)

    for i, row in enumerate(sample_df.itertuples(), 1):
        print(f"\n--- Sample {i} ---")
        if hasattr(row, "category"):
            print(f"Category: {row.category}")
        print(f"\nPrompt:\n{row.prompt}")
        print(f"\nResponse:\n{row.response}")
        if hasattr(row, "time"):
            print(f"\nTime: {row.time:.2f} seconds")
        print("-" * 80)

In [None]:
# Display sample responses
# Uncomment to execute after running analysis and having df_results
# if 'df_results' in locals() and not df_results.empty:
#     display_sample_responses(df_results)
# else:
#     print("DataFrame 'df_results' not available or empty. Run analysis first.")

## Generate New Edge Cases

Based on our findings, we can generate new edge cases that target specific weaknesses.

In [None]:
# Function to generate new edge cases
def generate_new_edge_cases(output_path="new_edge_cases.jsonl"):
    """Build the follow-up edge case prompts and write them to a JSONL file.

    Two prompts are produced, one per category: an "instruction_overload"
    prompt listing 20 numbered tasks (A through T) and a "complex_reasoning"
    prompt posing a resource-allocation problem.

    Args:
        output_path: File the prompts are written to, one JSON object per
            line. An existing file is overwritten. Defaults to
            "new_edge_cases.jsonl".

    Returns:
        list[dict]: The generated records, each with "prompt" and "category".
    """
    steps = "\n".join(f"{i + 1}. Perform task {chr(65 + i)}" for i in range(20))
    problem = "A company needs to allocate resources across 5 projects with different ROIs, time constraints, and team preferences. Project A has a 15% ROI but requires senior staff. Project B has a 12% ROI with no special requirements. Project C has a 20% ROI but a 6-month timeline. Project D has an 18% ROI with high risk. Project E has a 10% ROI but strengthens client relationships. How should they prioritize?"

    # Each entry: (category, prompt template, values for the template's placeholders).
    # Templates that still lack fill-in values and are therefore not generated yet:
    #   instruction_overload: "I need you to perform these actions in sequence: {actions}"
    #   complex_reasoning:    "Analyze this scenario with multiple stakeholders: {scenario}"
    case_specs = [
        (
            "instruction_overload",
            "Follow these 20 steps precisely: {steps}",
            {"steps": steps},
        ),
        (
            "complex_reasoning",
            "Solve this multi-step problem: {problem}",
            {"problem": problem},
        ),
    ]

    # Generate specific instances
    new_prompts_list = [
        {"prompt": template.format(**values), "category": category}
        for category, template, values in case_specs
    ]

    # Save new prompts
    with open(output_path, "w", encoding="utf-8") as f:
        for prompt_item in new_prompts_list:
            f.write(json.dumps(prompt_item) + "\n")

    print(
        f"Generated {len(new_prompts_list)} new edge cases and saved to {output_path}"
    )
    return new_prompts_list

In [None]:
# Generate new edge cases
# Uncomment to execute
# new_edge_cases_list = generate_new_edge_cases()

## Test New Edge Cases

We can now test our newly generated edge cases with Ollama.

In [None]:
# Test new edge cases
def test_new_edge_cases(new_prompts_to_test):
    if not new_prompts_to_test:
        print("No new edge cases to test.")
        return []
    output_file_new_results = f"new_edge_case_results_{MODEL_NAME.replace('/', '_')}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
    new_run_results = run_pipeline(new_prompts_to_test, output_file_new_results)
    print(
        f"Completed testing {len(new_run_results)} new edge cases. Results saved to {output_file_new_results}"
    )
    return new_run_results

In [None]:
# Test new edge cases
# Uncomment to execute after generating new_edge_cases_list
# if 'new_edge_cases_list' in locals() and new_edge_cases_list:
#     new_results_data = test_new_edge_cases(new_edge_cases_list)
# else:
#     print("Variable 'new_edge_cases_list' not available or empty. Generate them first.")

## Conclusions

Summarize findings and recommendations for model improvements based on edge case testing with Ollama and the `artifish/llama3.2-uncensored` model.

In [None]:
# Function to summarize findings
def summarize_findings(output_path=None):
    """Print a Markdown findings template for the current model and save it.

    The template text is dedented before use: lines indented by four spaces
    are treated as a code block by Markdown renderers, which would turn every
    heading and bullet of the saved report into preformatted text.

    Args:
        output_path: File to write the report to. When None (the default), a
            name of the form ``findings_<model>_<YYYYmmdd_HHMMSS>.md`` is
            generated in the current directory.

    Returns:
        str: The findings text that was printed and written.
    """
    findings_text = textwrap.dedent(
        f"""
        # Edge Case Testing Findings for {MODEL_NAME}

        ## Strengths
        - [Note strengths identified in testing with {MODEL_NAME}]
        - [Add more as discovered]

        ## Weaknesses
        - [Note weaknesses identified in testing with {MODEL_NAME}]
        - [Add more as discovered]

        ## Recommendations
        - [List recommendations for model improvements or prompt engineering]
        - [Add more as discovered]

        ## Next Steps
        - Develop more targeted edge cases for specific weaknesses identified.
        - Test with different Ollama parameters (e.g., different temperature, top_k, top_p).
        - Compare performance with other models available via Ollama or other platforms.
        """
    ).strip() + "\n"

    print(findings_text)

    # Save findings to a file
    findings_filename = output_path or (
        f"findings_{MODEL_NAME.replace('/', '_')}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    )
    with open(findings_filename, "w", encoding="utf-8") as f:
        f.write(findings_text)
    print(f"Findings saved to {findings_filename}")
    return findings_text

In [None]:
# Summarize findings
# Uncomment to execute
# summary_text = summarize_findings()