## Description
This notebook takes the labelled data from 01_Prototype_LM and has each record classified by the large language model (LLM) Google Gemini by passing it through a prompt. Four different prompts are tested, each with batch sizes of 1, 5, and 10. The resulting twelve combinations are then compared to evaluate their performance.

The code was created with the assistance of ChatGPT-4.

In [None]:
import pandas as pd
from google import genai
from google.genai import types
import json
from tqdm import tqdm
from itertools import cycle
import time

# Input and output locations, relative to this notebook
inputdata_file = '../01_Prototype_LM/data/03_labelled_data.csv' # the same file as in 01_Prototype_LM
outputdata_file ='data/01_predicted_data.csv'

# The API keys are read from a local JSON file instead of being written into the notebook.
# Structure of data/apikeys.json:
# {
#   "GOOGLE_API_KEYS": [
#     "key_1",
#     "key_2",
#     ...
#     "key_n"
#   ]
# }
with open("data/apikeys.json", encoding="utf-8") as f:
    config = json.load(f)
API_KEYS = config["GOOGLE_API_KEYS"]

# Fail early with a clear message: an empty or malformed key list would otherwise only
# surface later as an IndexError when the first key is looked up.
if not isinstance(API_KEYS, list) or not API_KEYS:
    raise ValueError("data/apikeys.json must contain a non-empty list under 'GOOGLE_API_KEYS'")

API_KEYS_CYCLE = cycle(API_KEYS)

In [63]:
# Read the label column as text so that empty cells stay NaN instead of being coerced
df = pd.read_csv(inputdata_file, dtype={'mobilitydata_labelled': 'string'}, low_memory=False)

# Drop rows where 'mobilitydata_labelled' is empty (NaN)
df = df.dropna(subset=['mobilitydata_labelled'])

# Convert the textual labels to booleans (surrounding whitespace is tolerated)
labels = df['mobilitydata_labelled'].str.strip().map({'True': True, 'False': False})

# Any other label text maps to NaN. Such rows cannot be evaluated, so they are
# reported and removed instead of silently remaining in the dataframe.
unparsed = labels.isna()
if unparsed.any():
    print(f"Dropping {int(unparsed.sum())} rows with unrecognised labels.")
df = df.loc[~unparsed].copy()
df['mobilitydata_labelled'] = labels[~unparsed].astype(bool)

# Print the number of rows remaining after filtering
print(f"Number of labelled rows after filtering: {len(df)}")

Number of labelled rows after filtering: 150


In [None]:
# Module-level state for rotating through the API keys (read and updated by process_chunks)
cycle_start_time = time.time()  # when the current pass over all keys began (used for rate limiting)
key_request_counter = 0         # requests made so far with the key currently in use
current_key_index = 0           # position of the key currently in use within API_KEYS
key_count = len(API_KEYS)       # how many keys are available
requests_per_key = 15           # maximum number of requests per key before moving to the next one

# All prompts
def build_prompt(prompt_id, chunk_lines):
    """Build the (German) classification prompt for one batch of dataset records.

    Args:
        prompt_id: Which prompt variant to use (1-4).
            1 = basic question with T/F answers,
            2 = basic question with T/F/U answers,
            3 = compact format asking for concatenated T/F/U answers,
            4 = detailed expert-style instruction with T/F/U answers.
        chunk_lines: List of strings, one per record to classify.

    Returns:
        The complete prompt text, ending with the cue "Antwort:".

    Raises:
        ValueError: If ``prompt_id`` is not one of 1, 2, 3 or 4.
    """
    # Records are separated by blank lines. The closing "Antwort:" cue is always set off
    # by a blank line so it is never glued onto the last record or the example answer.
    records = "\n\n".join(chunk_lines)
    answer_cue = "\n\nAntwort:"

    if prompt_id == 1:
        # Basic prompt with T/F format
        return (
            "Handelt es sich bei folgendem Inhalt um Verkehrs- oder Mobilitätsdaten? "
            "Antworte nur mit T (True) oder F (False).\n\n" + records +
            answer_cue
        )
    elif prompt_id == 2:
        # Prompt with T/F/U format (uncertainty included)
        return (
            "Handelt es sich bei folgendem Inhalt um Verkehrs- oder Mobilitätsdaten? "
            "Antworte nur mit T (True), F (False) oder U (Uncertain).\n\n" + records +
            answer_cue
        )
    elif prompt_id == 3:
        # Compact format, answers concatenated without space or punctuation
        return (
            "Handelt es sich bei folgendem Inhalt um Verkehrs- oder Mobilitätsdaten?\n\n" +
            records + "\n\n" +
            "Antworte nur mit T für True, F für False oder U für Uncertain und reihe alle Antworten direkt aneinander ohne Leerzeichen, Umbrüche, Texte oder Sonderzeichen zu verwenden. Beispielhaftes Antwortschema:\n\n" +
            "TFTFFTFUTT" +
            answer_cue
        )
    elif prompt_id == 4:
        # Expert-level instruction with clear format and rules
        return (
            "Als Experte für Datenannotation im Bereich Mobilitäts- und Verkehrsdaten ist es Ihre Aufgabe, öffentliche Datensätze daraufhin zu prüfen, ob sie Informationen enthalten, die eindeutig Mobilitäts- oder Verkehrsdaten betreffen. Ihre Einschätzung hilft bei der sachgerechten Klassifizierung dieser Inhalte auf einem nationalen Datenportal.\n"
            "Aufgabe:\n"
            "Beurteilen Sie, ob es sich bei dem folgenden Datensatz um Mobilitäts- oder Verkehrsdaten handelt.\n\n"
            "Antwortformat:\n"
            "Antworten Sie **nur mit einer der folgenden Optionen**, ohne zusätzliche Zeichen oder Erläuterungen (T für True, F für False, U für Uncertain):\n\n"
            "- T\n"
            "- F\n"
            "- U\n\n"
            "Hinweise:\n"
            # Trailing newline added so this hint stays its own bullet point
            "- Sollten mehrer Anfragen auf einmal verarbeitet werden, so antworten sie im folgenden Format (Beispiel für 10 Anfragen): TFTFFTFUTT\n"
            "- Verwenden Sie **U: Uncertain**, wenn die Informationen im Titel oder in der Beschreibung unklar oder nicht ausreichend sind.\n"
            "- Berücksichtigen Sie Aspekte wie Verkehrsmittel, Infrastruktur, Mobilitätsverhalten oder Verkehrsfluss.\n"
            "- Geben Sie keine zusätzlichen Erläuterungen – nur die ausgewählte Option.\n\n"
            "Datensatzbeschreibungen:\n" +
            records +
            answer_cue
        )
    else:
        raise ValueError("Ungültige Prompt-ID")

# Process a chunk of rows with the selected prompt
def process_chunks(df, indices, chunk_size, prompt_id):
    """Classify the given rows in batches by sending prompts to the Gemini model.

    For every batch of ``chunk_size`` rows, the title and description columns are
    formatted into text, wrapped in the prompt selected by ``prompt_id`` and sent to
    the model. The model's answer is split character by character and matched to the
    rows of the batch in order.

    Relies on the module-level names ``API_KEYS``, ``key_count`` and
    ``requests_per_key`` and updates the module-level rotation state
    (``current_key_index``, ``key_request_counter``, ``cycle_start_time``).

    Args:
        df: DataFrame containing the columns 'dataset_title_DE' and
            'dataset_description_DE'.
        indices: List of index labels of ``df`` to process.
        chunk_size: Number of rows sent per request.
        prompt_id: Prompt variant passed on to ``build_prompt``.

    Returns:
        A list with one dict per processed row with the keys "index",
        "mobilitydata_generated" ("T", "F", "U" or "ERROR"), "prompt_id" and
        "chunk_size".
    """
    global current_key_index, key_request_counter, cycle_start_time

    results = []
    max_retries = 3

    def make_row(idx, value):
        # One result record in the format expected by the caller
        return {
            "index": idx,
            "mobilitydata_generated": value,
            "prompt_id": prompt_id,
            "chunk_size": chunk_size
        }

    for i in tqdm(range(0, len(indices), chunk_size)):
        # Rotate API key if limit is reached
        if key_request_counter >= requests_per_key:
            current_key_index += 1
            key_request_counter = 0

            # If all keys exhausted, wait before restarting
            if current_key_index >= key_count:
                elapsed = time.time() - cycle_start_time
                if elapsed < 60:
                    wait_time = int(60 - elapsed)
                    print(f"Max requests per minute reached. Waiting {wait_time} seconds...")
                    time.sleep(wait_time + 1)
                current_key_index = 0
                cycle_start_time = time.time()

        CURRENT_API_KEY = API_KEYS[current_key_index]
        client = genai.Client(api_key=CURRENT_API_KEY)

        batch_indices = indices[i:i + chunk_size]
        chunk_df = df.loc[batch_indices, ['dataset_title_DE', 'dataset_description_DE']]

        # Combine title and description into formatted text
        chunk_lines = chunk_df.apply(
            lambda row: f"Titel: {row['dataset_title_DE']}\nBeschreibung: {row['dataset_description_DE']}",
            axis=1
        ).tolist()

        prompt = build_prompt(prompt_id, chunk_lines)

        # Reset per batch: None means "no answer obtained". Without this, a failed
        # request would fall through and reuse the previous batch's answer.
        result_text = None
        for attempt in range(max_retries):
            try:
                response = client.models.generate_content_stream(
                    model="gemini-2.0-flash-lite-001",
                    contents=[prompt],
                    config=types.GenerateContentConfig(
                        # Presumably one token per single-letter answer - TODO confirm
                        max_output_tokens=chunk_size,
                        temperature=0,
                        top_p=0.95,  # Default value (per original author)
                        top_k=64,    # Default value (per original author)
                        candidate_count=1  # Default value (per original author)
                    )
                )
                # A streamed chunk may carry no text; treat that as an empty string
                result_text = "".join(chunk.text or "" for chunk in response)
                break
            except Exception as e:
                error_str = str(e)
                if "RESOURCE_EXHAUSTED" in error_str or "429" in error_str:
                    print(f"Rate limit reached. Waiting 60 seconds... (Attempt {attempt+1} of {max_retries})")
                    time.sleep(60)
                else:
                    # Non-retryable error: give up on this batch
                    print(f"Error: {error_str}")
                    break

        if result_text is None:
            # Request failed (retries exhausted or other error) - store error result for each row
            results.extend(make_row(idx, "ERROR") for idx in batch_indices)
            continue

        # Split model result into individual predictions (one character per row)
        predictions = list(result_text.strip())
        for position, idx in enumerate(batch_indices):
            # Rows for which the model returned no character are marked as errors
            prediction = predictions[position] if position < len(predictions) else "ERROR"
            results.append(make_row(idx, prediction if prediction in ["T", "F", "U"] else "ERROR"))

        key_request_counter += 1
        time.sleep(0.8)  # Small delay to avoid hitting rate limits

    return results

# Generic result column (not filled by the code in this cell) and the row labels to process
df['mobilitydata_generated'] = None
all_indices = df.index.tolist()

all_results = []

# Every prompt (1-4) is combined with every chunk size (10, 5, 1); the dict maps each
# combination to the name of the column that will hold its predictions.
result_cols = {
    (p, c): f"mobilitydata_generated_p{p}_c{c}"
    for p in range(1, 5)
    for c in [10, 5, 1]
}

# Create the (initially empty) result columns
for column in result_cols.values():
    df[column] = None

# Query the model for each combination and gather all row-level results
for p, c in result_cols:
    print(f"Running Prompt {p} with chunk_size {c}")
    all_results.extend(process_chunks(df.copy(), all_indices, c, p))

# Copy each row-level result into the column of its prompt/chunk combination
for record in all_results:
    target = result_cols[(record['prompt_id'], record['chunk_size'])]
    df.at[record['index'], target] = record['mobilitydata_generated']

# Anything that is still empty or not an expected value is marked as "ERROR"
allowed_values = ["T", "F", "U", "ERROR"]
for column in result_cols.values():
    df[column] = df[column].where(df[column].isin(allowed_values), "ERROR")


In [None]:
analysis = []


def _safe_ratio(numerator, denominator):
    # Division that yields 0 instead of failing when the denominator is not positive
    return numerator / denominator if denominator > 0 else 0


# All prediction columns share this name prefix
result_columns = [name for name in df.columns if name.startswith("mobilitydata_generated_p")]

for col in sorted(result_columns):
    try:
        # Column names have the form 'mobilitydata_generated_p<prompt>_c<chunk>'
        tokens = col.split("_")
        prompt_id = int(tokens[2][1:])   # 'p1' -> 1
        chunk_size = int(tokens[3][1:])  # 'c10' -> 10

        # Only clear T/F answers enter the metrics
        valid = df.loc[df[col].isin(['T', 'F'])].copy()
        valid['prediction'] = valid[col].map({'T': True, 'F': False})

        # Uncertain and failed answers are only counted
        count_u = (df[col] == 'U').sum()
        count_error = (df[col] == 'ERROR').sum()

        # Confusion-matrix cells from the four label/prediction masks
        label_pos = valid['mobilitydata_labelled'] == True
        label_neg = valid['mobilitydata_labelled'] == False
        pred_pos = valid['prediction'] == True
        pred_neg = valid['prediction'] == False

        tp = (label_pos & pred_pos).sum()
        tn = (label_neg & pred_neg).sum()
        fp = (label_neg & pred_pos).sum()
        fn = (label_pos & pred_neg).sum()
        total = len(valid)

        # Performance metrics (each falls back to 0 when undefined)
        accuracy = _safe_ratio(tp + tn, total)
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

        analysis.append({
            'prompt_id': prompt_id,
            'chunk_size': chunk_size,
            'TP': tp,
            'TN': tn,
            'FP': fp,
            'FN': fn,
            'Total': total,
            'U': count_u,
            'ERROR': count_error,
            'Accuracy': round(accuracy, 4),
            'Precision': round(precision, 4),
            'Recall': round(recall, 4),
            'F1-Score': round(f1_score, 4)
        })
    except Exception as e:
        # A column that cannot be processed is reported and skipped
        print(f"Überspringe Spalte {col} wegen Fehler: {e}")

# Collect the evaluation rows in a DataFrame ordered by prompt and chunk size
analysis_df = pd.DataFrame(analysis).sort_values(['prompt_id', 'chunk_size'])

# Show the summary table
print("\nAuswertung der Prompt-/Chunk-Kombinationen:")
print(analysis_df.to_string(index=False))

In [None]:
# Persist the dataframe to a new CSV file (the index is not written)
df.to_csv(outputdata_file, index=False)

print('The file has been successfully saved as {}.'.format(outputdata_file))