In [None]:
# In[1]: Read Documents and Fixed-Size Batching
import csv
import json
import os
import sys

# Raise the CSV field size limit so rows with large text fields can be parsed
csv.field_size_limit(sys.maxsize)

# Read every row of the source CSV, keeping only the document id and its text
csv_path = "/home/nena-meijer/PyCharmMiscProject/dataset/VWS_subset/6-VWS_documents_NER_nl_labels.csv"
print(f"Reading CSV from {csv_path}")
docs = []
with open(csv_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for idx, row in enumerate(reader, start=1):
        docs.append({'id': row['document_id'], 'text': row['document_text']})
        # Progress message every 500 rows
        if idx % 500 == 0:
            print(f"Loaded {idx} documents")
print(f"Total documents loaded: {len(docs)}")

# Split into fixed-size batches of BATCH_SIZE documents each
# (the last batch holds the remainder and may be smaller)
BATCH_SIZE = 5
batches = [docs[i:i + BATCH_SIZE] for i in range(0, len(docs), BATCH_SIZE)]
print(f"Created {len(batches)} batches of up to {BATCH_SIZE} docs each.")

# In[2]: Save Batches to Files
# Each batch is written to <output_dir>/batch_<n>.json (n is 1-based) as
# {"documents": [...]}.
output_dir = "batches"
# exist_ok=True creates the directory if needed without a separate existence
# check, so there is no window between "check" and "create".
os.makedirs(output_dir, exist_ok=True)
for idx, batch in enumerate(batches, start=1):
    batch_path = os.path.join(output_dir, f"batch_{idx}.json")
    with open(batch_path, 'w', encoding='utf-8') as bf:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump({'documents': batch}, bf, ensure_ascii=False, indent=2)
    print(f"Saved batch {idx}/{len(batches)} with {len(batch)} docs to {batch_path}")

In [None]:
import os

# Gemini API key. Read from the GEMINI_API_KEY environment variable so the
# secret does not have to be stored in the notebook; falls back to an empty
# string (the previous hard-coded value) when the variable is not set.
API_key = os.environ.get("GEMINI_API_KEY", "")

In [None]:
import os
import json
import re
from google import genai
from google.genai import types

# Model identifier passed to every generation request below
MODEL_NAME = "models/gemini-2.0-flash"

# Gemini client, authenticated with the key defined in the previous cell
client = genai.Client(api_key=API_key)
print(f"Using model: {MODEL_NAME} for API generation")

# Prompt prefix (Dutch) sent ahead of every batch. It tells the model which
# fields to extract per document and to answer with a single JSON object that
# has a "results" array. Written as one triple-quoted literal; the leading
# backslash suppresses the initial newline and the text ends with exactly one.
INSTRUCTIONS = """\
Je krijgt een JSON-array met documenten met betrekking tot de coronacrisis in het Nederlands of Engels.
Geef de resultaten terug in dezelfde taal als het document. Voor elk document extraheer je:
1. "document_date": één datum voor het hele document in het format: DD-MM-YYYY, dit kan de eerste datum in de tekst zijn of de meest representatieve datum.
2. "summary": korte samenvatting.
3. "events": array met gebeurtenissen, elk met date in het format: DD-MM-YYYY, en een korte omschrijving van de gebeurtenis.
4. "names": array met alle namen/ministers die in het document voorkomen.
5. "organizations": array met alle genoemde ministeries/organisaties.
6. "groups": array met alle genoemde groepen mensen.
Stuur als output één JSON-object met een "results"-array.
Neem per document het document_id mee in de response.
Lever uitsluitend geldige JSON zonder extra markdown‑fences, zonder trailing commas, met alle strings correct geescaped.
"""

# Batch directory and range
BATCH_DIR = "/home/nena-meijer/PyCharmMiscProject/event_extraction/batches"
BATCH_START = 1
BATCH_END = 4575  # inclusive

# Collects, across all batches, either the per-document result objects returned
# by the model or one {'batch', 'error', 'raw_response'} entry per failed batch.
aggregated_results = []

for i in range(BATCH_START, BATCH_END + 1):
    batch_path = os.path.join(BATCH_DIR, f"batch_{i}.json")
    if not os.path.isfile(batch_path):
        print(f"Batch {i}: bestand niet gevonden, overslaan...")
        continue

    with open(batch_path, 'r', encoding='utf-8') as f:
        try:
            batch = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Batch {i}: JSON decode error bij het inlezen van bestand: {e}")
            aggregated_results.append({'batch': i, 'error': f'File load error: {e}', 'raw_response': None})
            continue

    # Batch files are written as {"documents": [...]}; unwrap that list so the
    # prompt is not double-wrapped and the document count below is correct.
    # A file that is already a bare list is used as-is.
    if isinstance(batch, dict) and 'documents' in batch:
        documents = batch['documents']
    else:
        documents = batch

    prompt = INSTRUCTIONS + json.dumps({'documents': documents}, ensure_ascii=False, indent=2)
    print(len(prompt))
    print(f"Processing batch {i} with {len(documents)} documents...")

    contents = [types.Content(role="user", parts=[types.Part.from_text(text=prompt)])]
    config = types.GenerateContentConfig(response_mime_type="text/plain")

    # Reset per batch so the error handlers never report a response that
    # belongs to an earlier iteration.
    cleaned = None
    try:
        # A streamed chunk may carry no text; treat that as an empty string
        # instead of letting str.join fail on None.
        response_text = ''.join(
            (chunk.text or '') for chunk in client.models.generate_content_stream(
                model=MODEL_NAME, contents=contents, config=config
            )
        )

        # Clean up potential Markdown code fences
        cleaned = response_text.strip()
        cleaned = re.sub(r"^```json", "", cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r"```$", "", cleaned, flags=re.MULTILINE)
        cleaned = cleaned.strip()

        data = json.loads(cleaned)
        # Expected shape is {"results": [...]}; a bare top-level array is
        # accepted as the results list itself.
        if isinstance(data, dict):
            results = data.get('results', [])
        else:
            results = data
        if not isinstance(results, list):
            raise TypeError(f"expected a list of results, got {type(results).__name__}")
        print(f"Batch {i}: parsed {len(results)} results")
        aggregated_results.extend(results)

    except json.JSONDecodeError as e:
        print(f"Batch {i} JSON parse error: {e}\nIncluding raw response in aggregated results")
        aggregated_results.append({'batch': i, 'error': str(e), 'raw_response': cleaned})

    except Exception as e:
        # Covers API/transport failures (cleaned is still None) as well as
        # responses that parsed but had an unexpected shape (cleaned is kept).
        print(f"Batch {i}: onverwachte fout: {e}")
        aggregated_results.append({'batch': i, 'error': str(e), 'raw_response': cleaned})

# Write everything collected by the batch loop to a single JSON file,
# wrapped in a top-level "results" key.
output_path = 'results_all.json'
final_payload = {'results': aggregated_results}
with open(output_path, 'w', encoding='utf-8') as out_fh:
    json.dump(final_payload, out_fh, indent=2, ensure_ascii=False)

entry_count = len(aggregated_results)
print(f"Saved aggregated results: {entry_count} entries to '{output_path}'")
