In [1]:
# Check for CUDA and GPU, and if True, GPU will be used.

import torch
torch.cuda.is_available()

True

In [2]:
# Bring in the sample dataset, the smaller sub-corpus.

import os
path = "YOUR_DATA_test"

def read_txt_files(directory):
    # Reads all .txt files in a directory and returns a combined string of their contents.

    file_contents = ''
    
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf8") as f:
                file_contents = file_contents + (f.read())
    return file_contents

texts = read_txt_files(path)

In [3]:
"""
Developing function to identify and resolve toponyms, and detect emotions in context 
on either side of each toponym.  Context length is based on trying different lengths,
with the final context length chosen based on which gives the most likely detected emotion
with the highest confidence score.

"""
# Access libraries
import os
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json
import numpy as np

# Set a global variable for my OpenAI API key so that the model can be accessed.
os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
client = OpenAI()

# Alternative data for testing:
#texts = "I traveled from Paris to Berlin and saw New York on the way.  It was fantastic.  I was so happy."

# ====== General Utility: Robust Extraction ======
def extract_json_from_arguments(response):
    """
    Robust extraction for OpenAI responses.
    Handles both function call and text output scenarios.
    Returns dict or list or [].
    """
    # Case 1: Function call pattern
    if hasattr(response, "output") and response.output:
        first = response.output[0]
        if hasattr(first, "arguments"):  # should be a string
            arguments_string = first.arguments
            if isinstance(arguments_string, (str, bytes)):
                try:
                    return json.loads(arguments_string)
                except Exception as e:
                    print(f"JSON parsing error: {e}\nARGUMENTS STRING: {arguments_string}")
                    return []
            else:
                # If already parsed (rare)
                return arguments_string
        # If it's classic text response
        if hasattr(first, "content") and first.content:
            text_fragment = getattr(first.content[0], "text", None)
            if text_fragment:
                try:
                    return json.loads(text_fragment)
                except Exception as e:
                    print(f"JSON parsing error (text): {e}\nTEXT: {text_fragment}")
                    return []
    # Case 2: Tool-style .outputs (not present in your current responses)
    if hasattr(response, "outputs") and response.outputs and hasattr(response.outputs[0], "arguments"):
        arguments = response.outputs[0].arguments
        if arguments is not None:
            return arguments
    print("No recognizable output format found in OpenAI response.")
    return []

# ====== Utility: Chunk Text for Context Window ======
def chunk_text(text, chunk_size=17000, overlap=1000):
    chunks = []
    idx = 0
    while idx < len(text):
        end = min(len(text), idx + chunk_size)
        chunk = text[idx:end]
        chunks.append((chunk, idx))
        idx += chunk_size - overlap
    return chunks

def batch(lst, n):
    """Yield successive n-sized batches from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# ====== 1. Load Prompts ======
with open("openai_ToponymExtraction_prompt_complicated.txt", encoding="utf-8") as f:
    extraction_instructions = f.read()
with open("openai_ToponymEmotionAnalysis_prompt_complicated.txt", encoding="utf-8") as f:
    analysis_instructions = f.read()

# ====== 2. Stage One: Extract Toponyms ======
chunks = chunk_text(texts)
extracted_toponyms = []
for chunk, offset in chunks:
    response = client.responses.create(
        model="gpt-4.1-2025-04-14",
        instructions=extraction_instructions,
        input=chunk,
        text={"format": {"type": "text"}},
        reasoning={},
        tools=[
            {
                "type": "function",
                "name": "recognize_toponyms",
                "description": "Given the user input text, identify all the toponyms in the text.",
                "parameters": {
                    "type": "object",
                    "required": ["input_text", "toponyms"],
                    "properties": {
                        "input_text": {
                            "type": "string",
                            "description": "The text string from which to recognize and identify toponyms."
                        },
                        "toponyms": {
                            "type": "array",
                            "description": "Array of recognized and identified toponyms.",
                            "items": {
                              "type": "object",
                              "properties": {
                                "toponym": {"type": "string"},
                                "start_idx": {"type": "integer"},
                                "end_idx": {"type": "integer"}
                              },
                              "required": ["toponym", "start_idx", "end_idx"],
                              "additionalProperties": False
                            }
                        }
                    },
                    "additionalProperties": False
                },
                "strict": True
            }
        ],
        temperature=1.0,
        max_output_tokens=32768,
        top_p=1,
        store=True
    )
    toponyms_chunk = extract_json_from_arguments(response)
    if isinstance(toponyms_chunk, list):
        # Toponyms immediately
        for t in toponyms_chunk:
            if 'start_idx' in t: t["start_idx"] += offset
            if 'end_idx' in t: t["end_idx"] += offset
        extracted_toponyms += toponyms_chunk
        print(f"Extracted {len(toponyms_chunk)} toponyms from this chunk.")
    elif isinstance(toponyms_chunk, dict) and "toponyms" in toponyms_chunk:
        # Sometimes a dict with .toponyms key
        for t in toponyms_chunk["toponyms"]:
            if 'start_idx' in t: t["start_idx"] += offset
            if 'end_idx' in t: t["end_idx"] += offset
        extracted_toponyms += toponyms_chunk["toponyms"]
        print(f"Extracted {len(toponyms_chunk['toponyms'])} toponyms from this chunk.")
    else:
        print(f"Extracted 0 toponyms from this chunk (result was {toponyms_chunk!r})")

print(f"\nStage 1 complete: Extracted {len(extracted_toponyms)} total toponym instances.")
with open("extracted_toponyms.json", "w", encoding="utf-8") as f:
    json.dump(extracted_toponyms, f, ensure_ascii=False, indent=2)

# ====== 3. Stage Two: Toponym Analysis ======
analysis_results = []
batch_size = 5  # Adjust as appropriate for your response size

for batch_num, toponym_batch in enumerate(batch(extracted_toponyms, batch_size)):
    user_input = {
        "original_text": texts,
        "toponym_instances": toponym_batch
    }
    response = client.responses.create(
        model="gpt-4.1-2025-04-14",
        instructions=analysis_instructions,
        input=json.dumps(user_input),
        text={"format": {"type": "text"}},
        reasoning={},
        tools=[
          {
              "type": "function",
              "name": "resolve_toponyms_and_detect_emotions",
              "description": "Given the user input of the original text and the extracted toponyms from the previous step, determine latitude and longitude of each toponym and perform emotion detection on surrounding context for each toponym.",
              "parameters": {
                "type": "object",
                "required": [
                  "original_text",
                  "toponym_instances"
                ],
                "properties": {
                  "original_text": {
                    "type": "string",
                    "description": "The text string from which to disambiguate toponyms and utilize their surrounding context."
                  },
                  "toponym_instances": {
                    "type": "array",
                    "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
                    "items": {
                      "type": "object",
                      "properties": {
                        "toponym": {
                          "type": "string",
                          "description": "The name of the toponym as found in the previous step."
                        },
                        "resolved_name": {
                          "type": "string",
                          "description": "The name of the resolved toponym as identified and disambiguated."
                        },
                        "latitude": {
                          "type": "number",
                          "description": "The latitude coordinate of the toponym."
                        },
                        "longitude": {
                          "type": "number",
                          "description": "The longitude coordinate of the toponym."
                        },
                        "emotion": {
                          "type": "string",
                          "description": "The most likely detected emotion around the toponym.",
                          "enum": [
                            "anger",
                            "disgust",
                            "fear",
                            "joy",
                            "sadness",
                            "surprise",
                            "neutral"
                          ]
                        },
                        "confidence_score": {
                          "type": "number",
                          "description": "The confidence score for the detected emotion, on a scale of 0 to 1."
                        },
                        "context": {
                          "type": "string",
                          "description": "The text block surrounding the toponym used for emotion detection, whose length is determined based on trying different lengths and seeing which one gives the highest confidence score for the most likely detected emotion."
                        },
                        "context_length": {
                          "type": "number",
                          "description": "The length, in characters including spaces, of the final text block surrounding the toponym used for emotion detection."
                        },
                        "original_range": {
                          "type": "number",
                          "description": "The original start and end position in the text of the toponym."
                        }
                      },
                      "required": [
                        "toponym",
                        "resolved_name",
                        "latitude",
                        "longitude",
                        "emotion",
                        "confidence_score",
                        "context",
                        "context_length",
                        "original_range"                          
                      ],
                      "additionalProperties": False
                    }
                  }
                },
                "additionalProperties": False
              },
              "strict": True
          }
        ],
        temperature=1,
        max_output_tokens=32768,
        top_p=1,
        store=True
    )

    batch_result = extract_json_from_arguments(response)
    # Either a list (preferred), or a dict with a .toponym_instances key
    if isinstance(batch_result, list):
        analysis_results += batch_result
        print(f"Processed batch {batch_num+1}: {len(batch_result)} results.")
    elif isinstance(batch_result, dict) and "toponym_instances" in batch_result:
        batch_list = batch_result["toponym_instances"]
        analysis_results += batch_list
        print(f"Processed batch {batch_num+1}: {len(batch_list)} results (from dict).")
    else:
        print(f"Processed batch {batch_num+1}: 0 results (batch_result was {batch_result!r})")

print(f"\nStage 2 complete: Produced {len(analysis_results)} detailed toponym analyses.")
with open("analysis_results.json", "w", encoding="utf-8") as f:
    json.dump(analysis_results, f, ensure_ascii=False, indent=2)


Extracted 57 toponyms from this chunk.
Extracted 0 toponyms from this chunk.
Extracted 70 toponyms from this chunk.

Stage 1 complete: Extracted 127 total toponym instances.
JSON parsing error (text): Expecting value: line 1 column 1 (char 0)
TEXT: Let's analyze each toponym instance, one by one, reflecting both on toponym resolution and emotion detection with optimal context window size, to maximize confidence and accuracy.

---

### 1. Haute-Loire (119–130)
**Assessment:**  
The text says:  
"During rest hour Mrs. Cavailhon called me. She received a letter asking whether she wanted to send some children to a very nice school in Haute-Loire at an elevation of one thousand meters."  
Haute-Loire is a department in south-central France, historically well known for hosting Le Chambon-sur-Lignon and other safe havens for refugees. In WWII context, “Haute-Loire” unambiguously refers to this French administrative region.  
**Coordinates:** 45.0477, 3.8886  

**Emotion context optimization:*

APITimeoutError: Request timed out.

In [31]:
print("RAW RESPONSE:", response)
print("RAW OUTPUTS:", getattr(response, "outputs", None))

RAW RESPONSE: Response(id='resp_683793512c8c8191bee5e5ea4a5d7f020823006e689be9ad', created_at=1748472657.0, error=None, incomplete_details=None, instructions='## IDENTITY\nYou are a natural language processer now performing both toponym resolution \nand emotion detection.  \n\n## INSTRUCTIONS\nFor each toponym instance below (with its position in the provided text), do all of the following:\n- Use context to disambiguate to the correct modern or historic entity and obtain coordinates (latitude, longitude).\n- Extract several blocks of text (100–600 chars each side). For each, run emotion detection (anger, disgust, fear, joy, sadness, surprise, neutral), select the emotion+window with highest confidence score, and record:\n    - toponym (as-found),\n    - resolved_name,\n    - latitude,\n    - longitude,\n    - detected emotion,\n    - confidence score,\n    - text block used,\n    - length of text block used (in characters, including spaces),\n    - original start & end positions.\nRet

In [27]:
# Take response output in json format, put into a dataframe, then assign numeric values 
# to the detected emotions.

df = pd.DataFrame(analysis_results)

conditions = [
    df["emotion"] == "anger",
    df["emotion"] == "disgust",
    df["emotion"] == "fear",
    df["emotion"] == "joy",
    df["emotion"] == "neutral",
    df["emotion"] == "sadness",
    df["emotion"] == "surprise"
]
values = ["0", "1", "2", "3", "4", "5", "6"]
df["emotion_numeric"] = np.select(conditions, values, default="Unknown")

df

Unnamed: 0,toponym,resolved_name,latitude,longitude,emotion,confidence_score,context,context_length,original_range,emotion_numeric
0,Paris,"Paris, France",48.8566,2.3522,neutral,0.8,I traveled from Paris to Berlin and saw New Yo...,62,17,4
1,Berlin,"Berlin, Germany",52.52,13.405,neutral,0.79,I traveled from Paris to Berlin and saw New Yo...,62,26,4
2,New York,"New York City, USA",40.7128,-74.006,joy,0.97,I traveled from Paris to Berlin and saw New Yo...,88,42,3


In [13]:
# Export results to csv

df.to_csv("Results8_ToponymsEmotions_smallSubCorpus.csv", encoding="utf-8-sig", index=False, header=True, mode="w+")