# Batching Examples

## Setup

In [None]:
# Install (or upgrade, -U) the google-genai package quietly (-q); version 1.0.0 or newer is required.
%pip install -U -q "google-genai>=1.0.0"  # Install the Python SDK

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see the [Authentication](../quickstarts/Authentication.ipynb) quickstart for an example.

In [None]:
from google.colab import userdata
from google import genai

# Model used by every request in this notebook.
MODEL_ID = "gemini-2.5-flash"

# Read the API key from the Colab Secret of the same name, then build the
# client that the later cells share.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

In [None]:
from google.genai import types

import requests
import json
import math

# Location of the demo files: one question per line, plus the long text that the
# questions are answered from.
DEMO_FILES_URL = "https://raw.githubusercontent.com/phil-daniel/gemini-batcher/refs/heads/main/examples/demo_files"


def _fetch_text(url, timeout=30):
    """Download `url` and return the response body as text.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status, so that
            an error page is never mistaken for demo data.
        requests.Timeout: If the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Skip blank lines (such as the one produced by a trailing newline) so that
# every entry in `questions` is an actual question.
questions = [
    line.strip()
    for line in _fetch_text(f"{DEMO_FILES_URL}/questions.txt").splitlines()
    if line.strip()
]
content = _fetch_text(f"{DEMO_FILES_URL}/content.txt")

## Batching example 1 - no batching (baseline)

In this example, the baseline number of tokens required to answer the first five questions is calculated. Each question is sent to the model sequentially, along with the entire transcript.

The response is returned in JSON format for easier comparison to the batched example.

In [None]:
system_prompt = "Answer the questions using *only* the content provided, with each answer being a different string in the JSON response."

# The request configuration is the same for every question, so it is built once
# outside the loop. Thinking is disabled (thinking_budget=0) to match the
# batched request, which keeps the token comparison like-for-like.
no_batching_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=list[str],
    system_instruction=system_prompt,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
)

# Running totals across the individual requests.
total_input_tokens_no_batching = 0
total_output_tokens_no_batching = 0

# Each of the first five questions is sent on its own, together with the whole content.
for question in questions[:5]:
    response = client.models.generate_content(
        model=MODEL_ID,
        config=no_batching_config,
        contents=[f'Content:\n{content}', f'\nQuestion:\n{question}'],
    )
    # The usage metadata and its token counts are optional in the SDK response;
    # treat a missing value as 0 instead of failing on `int += None`.
    usage = response.usage_metadata
    total_input_tokens_no_batching += (usage.prompt_token_count or 0) if usage else 0
    total_output_tokens_no_batching += (usage.candidates_token_count or 0) if usage else 0

print(f'Total input tokens used with no batching: {total_input_tokens_no_batching}')
print(f'Total output tokens used with no batching: {total_output_tokens_no_batching}')

## Batching example 2 - with batching
In this example, the model is asked the same five questions, but rather than being asked individually, they are answered all at once. This results in a significant reduction in the number of input tokens used as the model is only provided with the large content once rather than five times.

The response is returned in JSON format to allow for easier separation of each question's answer.

In [None]:
system_prompt = "Answer the questions using *only* the content provided, with each answer being a different string in the JSON response."

# Put the first five questions on separate lines of a single prompt part.
batched_questions = "\n".join(questions[:5])

# A single request carries the content once, followed by all of the questions.
batched_response = client.models.generate_content(
    model=MODEL_ID,
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[str],
        system_instruction=system_prompt,
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    ),
    contents=[f'Content:\n{content}', f'\nQuestions:\n{batched_questions}'],
)

# `.text` is optional in the SDK response; fail with a clear message instead of
# an AttributeError from calling `.strip()` on None.
answers = batched_response.text
if answers is None:
    raise ValueError("The batched request returned no text, so there are no answers to parse.")

# The response is requested as a JSON list of strings.
batched_answers = json.loads(answers.strip())

# The usage metadata and its token counts are optional; report a missing value as 0.
batched_usage = batched_response.usage_metadata
total_input_tokens_with_batching = (batched_usage.prompt_token_count or 0) if batched_usage else 0
total_output_tokens_with_batching = (batched_usage.candidates_token_count or 0) if batched_usage else 0

print(f'Total input tokens used with batching: {total_input_tokens_with_batching}')
print(f'Total output tokens used with batching: {total_output_tokens_with_batching}')