In [None]:
# Imports
import os
from groq import Groq
import time
from groq import InternalServerError
import json
import jinja2
import datetime
from openai import OpenAI

Here we have:
- paths to the prompt template and the annotation files folder


- the list of types of prompts (zero-shot, one-shot, few-shot)


- the list of models tested


- the list of entities we are interested in

**MODIFY THE LIST OF PROMPTS AND MODELS TO YOUR LIKING**

In [2]:
# Directory holding the ground-truth annotation files (JSON)
ANNOTATIONS_FOLDER = "../annotations/"

# Directory holding the prompt templates
PROMPT_PATH = "../prompt_templates/"

# Prompt templates to evaluate
LIST_PROMPTS = ["zero_shot", "one_shot", "few_shot"]

# Models to evaluate through the Groq API
LIST_MODELS_GROQ = [
    "gemma2-9b-it",
    "mistral-saba-24b",
    "llama-3.3-70b-versatile",
    "qwen-qwq-32b",
    "meta-llama/llama-4-maverick-17b-128e-instruct",
    "deepseek-r1-distill-llama-70b",
    # "deepseek-r1-distill-qwen-32b"  # has been decommissioned
]

# Models to evaluate through the OpenAI API
LIST_MODELS_OPENAI = [
    "gpt-4.1-2025-04-14",
    # "gpt-4o-2024-11-20",
    # "o3-2025-04-16"
]

# Entity labels the LLMs must tag, and that are extracted afterwards
TAGS = ["MOL", "SOFTNAME", "SOFTVERS", "STIME", "TEMP", "FFM"]

# Which API to use: "groq" or "openai"
API_TYPE = "groq"

# Upper bound on the number of texts sent for annotation
NUMBER_OF_TEXTS_TO_ANNOTATE = 100

# **Prompt engineering with LLMs**
---

Currently, only models served through the Groq API have been tested in this notebook.
Models served through the OpenAI API will be added soon.

In [3]:
# Choose which API to use
def get_api(api_type: str) -> object:
    """
    Build the API client for the requested provider.

    Args:
        api_type (str): Either "openai" or "groq".

    Returns:
        object: An `OpenAI` or `Groq` client. The API key handed to it is read
            from `OPENAI_API_KEY` or `GROQ_API_KEY` respectively (None when
            the variable is unset).

    Raises:
        ValueError: If `api_type` is neither "openai" nor "groq".
    """
    wants_openai = api_type == "openai"

    # Reject unknown providers up front, before touching any client class
    if not (wants_openai or api_type == "groq"):
        raise ValueError("Invalid API type. Choose 'openai' or 'groq'.")

    if wants_openai:
        return OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return Groq(api_key=os.environ.get("GROQ_API_KEY"))


In [4]:
# Instantiate the API client selected by API_TYPE
# (get_api reads the matching API key from the environment)
client = get_api(API_TYPE)

Firstly, we need a helper function to extract certain information from the ground-truth data:
- The input text that needs to be annotated (`input_text`)


- The manually-found entities (`ground_truth_entities`)

In [5]:
# Process one JSON file to extract the ground truth entities and the input text
def process_json_file(json_file: str) -> tuple:
    """
    Read one ground-truth annotation file and return its text and entities.

    The file must contain a JSON object with an "annotations" list. Only the
    first entry of that list is used, and it is indexed as
    `[input_text, {"entities": ...}]`.

    Args:
        json_file (str): Path to the JSON annotation file.

    Returns:
        tuple: `(input_text, ground_truth_entities)`.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can fail or mangle non-ASCII text.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the first annotation entry is used
    annotation_entry = data["annotations"][0]
    input_text = annotation_entry[0]
    ground_truth_entities = annotation_entry[1]["entities"]

    return input_text, ground_truth_entities

Next, we will define several helper functions to assist with the annotation workflow:

1. **Render the prompt template**  

   We use a Jinja2 template to dynamically inject the text that needs to be annotated. This allows for flexible and reusable prompt formatting.

2. **Interact with the LLM using the template**  

   This function handles communication with the language model using the rendered prompt. It is currently tailored for the Groq API, though the structure may vary if you use a different API.

3. **Save the LLM response to a JSON file**  

   To maintain a record of the process, we save the model's response along with metadata, including the model used, the prompt sent, and the annotated output (in XML format with entity annotations).


In [None]:
def load_and_render_prompt(template_path: str, text_to_annotate: str) -> str:
    """
    Load a Jinja2 template from a file and render it with the provided text to annotate.

    Args:
        template_path (str): Path to the template file.
        text_to_annotate (str): Text to be annotated; exposed to the template
            as the `text_to_annotate` variable.

    Returns:
        str: Rendered prompt string.
    """
    # Read the template as UTF-8 explicitly: without `encoding`, open() uses the
    # platform's locale encoding, so non-ASCII characters in a template could be
    # mis-decoded on some machines.
    with open(template_path, "r", encoding="utf-8") as f:
        template_content = f.read()
    template = jinja2.Template(template_content)
    return template.render(text_to_annotate=text_to_annotate)


def chat_with_template(
    prompt: str,
    template_path: str,
    model: str,
    text_to_annotate: str,
    max_retries: int = 5,
    initial_delay: float = 1,
) -> str:
    """
    Render a prompt template and send it to the chat-completion API.

    The request is sent through the module-level `client` as a single user
    message. An `InternalServerError` with status code 503 is retried with
    exponential back-off; any other error, or a 503 once the retries are
    exhausted, is re-raised.

    Args:
        prompt (str): Identifier of the prompt being tested. Kept for backward
            compatibility only: the text sent to the model is always rendered
            from `template_path`, so this value is not used.
        template_path (str): Path to the template file.
        model (str): Model to use for the chat.
        text_to_annotate (str): Text to be annotated.
        max_retries (int): Number of retries after the first attempt.
        initial_delay (float): Seconds to wait before the first retry; the
            wait doubles after every retry.

    Returns:
        str: Content of the first choice of the chat completion.

    Raises:
        ValueError: If `max_retries` is negative.
        InternalServerError: On a non-503 server error, or on a 503 once all
            retries have been used.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    # Use a separate name so the `prompt` argument is not silently overwritten
    rendered_prompt = load_and_render_prompt(template_path, text_to_annotate)
    delay = initial_delay  # seconds

    for attempt in range(max_retries + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": rendered_prompt}],
                model=model,
            )
            return chat_completion.choices[0].message.content
        except InternalServerError as err:
            if getattr(err, "status_code", None) == 503 and attempt < max_retries:
                time.sleep(delay)
                delay *= 2          # exponential back-off
                continue
            raise  # non-503 or out of retries


def save_response_as_json(response_text: object, output_path: str) -> None:
    """
    Serialize a response to a JSON file.

    Args:
        response_text (object): The payload to save. Despite its name it does
            not have to be a string: any value `json.dump` can serialize is
            accepted (for example a dict bundling the response with metadata).
        output_path (str): Path to the output JSON file. It is overwritten if
            it already exists.
    """
    # UTF-8 + ensure_ascii=False keeps non-ASCII characters readable in the file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response_text, f, ensure_ascii=False, indent=2)

Before we start running annotations, we need to set up a directory structure to organize the outputs based on prompt types and models used.

The following code will:
- Create a root folder to store all LLM annotations.


- For each prompt template in `LIST_PROMPTS`, create a subfolder.


- Within each prompt folder, create additional subfolders for each model in `LIST_MODELS`.

In [7]:
# Timestamp of this run; it gives each run its own output folders
date_and_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Pick the model list that matches the selected API
if API_TYPE == "groq":
    LIST_MODELS = LIST_MODELS_GROQ
elif API_TYPE == "openai":
    LIST_MODELS = LIST_MODELS_OPENAI
else:
    raise ValueError("Invalid API type. Choose 'openai' or 'groq'.")

# Root folder for the LLM annotations of this run.
# NOTE: the `ouput_` spelling is kept on purpose, the annotation loop below uses this name.
# `exist_ok=True` replaces the check-then-create pattern: it is shorter and
# does not fail if the folder appears between the check and the creation.
ouput_llm_annotation_folder = f"../llm_outputs/annotations_{date_and_time}/"
os.makedirs(ouput_llm_annotation_folder, exist_ok=True)

# Root folder for the annotation stats of this run
ouput_llm_stats_folder = f"../llm_outputs/stats_{date_and_time}/"
os.makedirs(ouput_llm_stats_folder, exist_ok=True)

# One folder per prompt type, and inside it one folder per model
for prompt in LIST_PROMPTS:
    prompt_name = os.path.basename(prompt)
    output_prompt_folder = os.path.join(ouput_llm_annotation_folder, prompt_name)
    os.makedirs(output_prompt_folder, exist_ok=True)

    for model in LIST_MODELS:
        # A model name containing "/" yields nested folders; makedirs creates them all
        output_model_folder = os.path.join(output_prompt_folder, model)
        os.makedirs(output_model_folder, exist_ok=True)

## **Run LLM annotations**
---

We will now test our LLM annotation pipeline on a subset of input texts. Specifically, we will:

- Select the first `NUMBER_OF_TEXTS_TO_ANNOTATE` input files from the annotations folder.

    - For each file, we will apply:

        - Each prompt template in `LIST_PROMPTS`

        - Each language model in `LIST_MODELS`

    - Save the LLM's annotated response as a JSON file in its designated directory: `../llm_outputs/annotations_{date_and_time}/{prompt_name}/{model}/{filename}`



You can change the number of files that are annotated through `NUMBER_OF_TEXTS_TO_ANNOTATE`. As a rough guide, two models, three types of prompts, and 10 texts take **about 7 minutes**.

In [8]:
# Annotate at most NUMBER_OF_TEXTS_TO_ANNOTATE texts from the annotation folder
number_texts = 0

# os.listdir() returns entries in arbitrary order; sorting makes the subset of
# files that fits under the limit the same on every run and every machine.
for filename in sorted(os.listdir(ANNOTATIONS_FOLDER)):
    if number_texts >= NUMBER_OF_TEXTS_TO_ANNOTATE:
        break

    # Only JSON files whose name contains exactly one underscore are processed
    if filename.endswith(".json") and filename.count("_") == 1:
        number_texts += 1

        print(f"\nProcessing file {number_texts}: {filename} ==============")
        input_text, _ = process_json_file(os.path.join(ANNOTATIONS_FOLDER, filename))

        for prompt in LIST_PROMPTS: # Testing each type of prompt
            print(f"\nFile {number_texts} - Testing prompt: {prompt} -------\n")

            prompt_name = os.path.basename(prompt)
            output_prompt_folder = os.path.join(ouput_llm_annotation_folder, prompt_name)

            for model in LIST_MODELS: # Testing each model
                print(f"File {number_texts} - Testing model: {model}")

                output_model_folder = os.path.join(output_prompt_folder, model)

                response = chat_with_template(
                    prompt=prompt,
                    template_path=os.path.join(PROMPT_PATH, f"{prompt}.txt"),
                    model=model,
                    text_to_annotate=input_text
                )

                # Save the response with its metadata, under the same file name as the input
                output_path_for_json = os.path.join(output_model_folder, filename)
                data = {
                    "model": model,
                    "text_to_annotate": input_text,
                    "response": response
                }
                save_response_as_json(data, output_path_for_json)



File 1 - Testing prompt: zero_shot -------

File 1 - Testing model: gemma2-9b-it
File 1 - Testing model: mistral-saba-24b
File 1 - Testing model: llama-3.3-70b-versatile
File 1 - Testing model: qwen-qwq-32b
File 1 - Testing model: meta-llama/llama-4-maverick-17b-128e-instruct
File 1 - Testing model: deepseek-r1-distill-llama-70b

File 1 - Testing prompt: one_shot -------

File 1 - Testing model: gemma2-9b-it
File 1 - Testing model: mistral-saba-24b
File 1 - Testing model: llama-3.3-70b-versatile
File 1 - Testing model: qwen-qwq-32b
File 1 - Testing model: meta-llama/llama-4-maverick-17b-128e-instruct
File 1 - Testing model: deepseek-r1-distill-llama-70b

File 1 - Testing prompt: few_shot -------

File 1 - Testing model: gemma2-9b-it
File 1 - Testing model: mistral-saba-24b
File 1 - Testing model: llama-3.3-70b-versatile
File 1 - Testing model: qwen-qwq-32b
File 1 - Testing model: meta-llama/llama-4-maverick-17b-128e-instruct
File 1 - Testing model: deepseek-r1-distill-llama-70b


Fil

InternalServerError: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}