In [11]:
# Check whether PyTorch can see a CUDA-capable GPU (True means one is available).
# NOTE(review): none of the cells visible here use torch afterwards, so this looks like a
# pure environment check — confirm whether a later step actually relies on the GPU.

import torch
torch.cuda.is_available()

True

In [25]:
# Bring in the sample dataset, the smaller sub-corpus.

import os
# Directory whose .txt files make up the sub-corpus; it is passed to read_txt_files() below.
# The value looks like a placeholder — presumably it must be replaced with the real folder.
path = "YOUR_DATA_test"

def read_txt_files(directory):
    """
    Read every ``.txt`` file in ``directory`` and return their contents as one string.

    Files are read in sorted filename order so the combined text -- and therefore every
    character offset computed from it later -- is the same on every run and every machine
    (``os.listdir`` returns entries in arbitrary order).  Contents are concatenated with
    no separator.  Sub-directories are not searched, and directory entries whose name
    happens to end in ``.txt`` are skipped instead of crashing ``open``.

    Args:
        directory: Path of the folder to scan.

    Returns:
        The concatenated file contents, or ``""`` when the folder has no ``.txt`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    parts = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf8") as f:
            parts.append(f.read())

    # Joining once avoids the quadratic cost of repeated string concatenation.
    return "".join(parts)

# Combined contents of every .txt file under `path`; the later cells tokenize this string
# and slice context windows out of it by character index.
texts = read_txt_files(path)

In [29]:
"""
Identify and resolve the toponyms in the corpus, then detect the emotion expressed in the
context on either side of each toponym.  Several context lengths are tried for each
toponym; the length that is kept is the one whose most likely detected emotion has the
highest confidence score.

"""
# Access libraries
import os
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import tiktoken


# Make the OpenAI API key available to the client without writing the secret into the
# notebook: use the OPENAI_API_KEY environment variable when it is already set, otherwise
# prompt for it once (input is not echoed).  In a non-interactive run the variable must be
# set beforehand, because the prompt needs stdin.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
client = OpenAI()

# Alternative data for testing:
#texts = "I traveled from Paris to Berlin and saw New York on the way.  It was fantastic.  I was so happy."

# ========== Robust OpenAI Output Extraction ==========
def extract_json_from_arguments(response):
    """
    Extract the structured payload from an OpenAI response object.

    Every item of ``response.output`` is examined, not just the first one, so a
    function call that is preceded by another output item is still found.  Lookup order:

    1. the first output item carrying a non-None ``arguments`` attribute (function
       call); a ``str``/``bytes`` value is JSON-decoded, anything else is returned as is;
    2. otherwise the first output item with non-empty ``content`` whose first part has
       a truthy ``text`` attribute, which is JSON-decoded;
    3. otherwise ``response.outputs[0].arguments`` if present and not None.

    Returns:
        The decoded dict or list, or ``[]`` when nothing usable is found or the JSON
        cannot be parsed (a diagnostic is printed in both cases).
    """
    output_items = getattr(response, "output", None) or []

    # Case 1a: function-call items take priority over plain text.
    for item in output_items:
        arguments = getattr(item, "arguments", None)
        if arguments is None:
            continue
        if isinstance(arguments, (str, bytes)):
            try:
                return json.loads(arguments)
            except (ValueError, TypeError) as e:
                print(f"JSON parsing error: {e}\nARGUMENTS STRING: {arguments}")
                return []
        # Already parsed (rare): hand it back untouched.
        return arguments

    # Case 1b: classic text response.
    for item in output_items:
        content = getattr(item, "content", None)
        if not content:
            continue
        text_fragment = getattr(content[0], "text", None)
        if text_fragment:
            try:
                return json.loads(text_fragment)
            except (ValueError, TypeError) as e:
                print(f"JSON parsing error (text): {e}\nTEXT: {text_fragment}")
                return []

    # Case 2: tool-style `.outputs` attribute.
    outputs = getattr(response, "outputs", None)
    if outputs and hasattr(outputs[0], "arguments"):
        arguments = outputs[0].arguments
        if arguments is not None:
            return arguments

    print("No recognizable output format found in OpenAI response.")
    return []

# 2. Token-based chunking. Inputs under 3,000 tokens are sent as a single chunk; longer inputs are split into overlapping 3,000-token chunks (600-token overlap).

def chunk_text_by_tokens(text, max_tokens=3000, overlap_tokens=600):
    """
    Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    The text is tokenized with the ``cl100k_base`` encoding.  Consecutive chunks start
    ``max_tokens - overlap_tokens`` tokens apart.  Character offsets come from the
    tokenizer's own token-to-character mapping rather than from searching for the chunk
    text, so a chunk whose text also occurs earlier still gets the right offset, and a
    multi-byte character split across a token boundary can no longer produce ``-1``.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap_tokens: Tokens shared by consecutive chunks; must be smaller than
            ``max_tokens`` (otherwise the window would never advance).

    Returns:
        A list of ``(chunk_text, char_offset)`` tuples, where ``char_offset`` is the
        index of the chunk's first character in the decoded text (expected to be
        identical to ``text`` for ordinary Unicode input).  Empty input yields ``[]``.

    Raises:
        ValueError: If ``max_tokens <= 0`` or ``overlap_tokens >= max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if overlap_tokens >= max_tokens:
        raise ValueError("overlap_tokens must be smaller than max_tokens")
    if not text:
        return []

    enc = tiktoken.get_encoding("cl100k_base")
    all_tokens = enc.encode(text)
    # token_starts[k] is the character index at which token k begins in decoded_text.
    decoded_text, token_starts = enc.decode_with_offsets(all_tokens)

    total_tokens = len(all_tokens)
    stride = max_tokens - overlap_tokens
    chunks = []
    for start in range(0, total_tokens, stride):
        end = min(start + max_tokens, total_tokens)
        char_start = token_starts[start]
        char_end = token_starts[end] if end < total_tokens else len(decoded_text)
        chunks.append((decoded_text[char_start:char_end], char_start))
        if end == total_tokens:
            # The final window reached the end of the text; a further window would
            # only repeat its tail.
            break
    return chunks

# 3. API Call with Retry for Thread Use

def call_api_with_retry_chunk(chunk, offset, extraction_instructions, client, max_output_tokens=2048, retries=4):
    """
    Ask the model to extract toponyms from one text chunk, retrying on errors.

    The request forces a call to the ``recognize_toponyms`` function tool and the
    response is decoded with ``extract_json_from_arguments``.  Any exception raised by
    the request or the decoding triggers another attempt after an exponentially growing
    pause (1s, 2s, 4s, ...).  No pause is taken after the final attempt, since nothing
    follows it.

    Args:
        chunk: Text of the chunk to analyse.
        offset: Character offset of the chunk; it is passed through unchanged so the
            caller can match the result to its chunk.
        extraction_instructions: Instruction text sent as ``instructions``.
        client: Object exposing ``responses.create(...)``.
        max_output_tokens: Output-token cap for the request.
        retries: Maximum number of attempts.

    Returns:
        ``(parsed_output, offset)``; ``parsed_output`` is ``[]`` when every attempt failed.
    """
    # The tool schema is the same for every attempt, so build it once.
    toponym_tool = {
        "type": "function",
        "name": "recognize_toponyms",
        "description": "Given the user input text, identify all the toponyms in the text.",
        "parameters": {
            "type": "object",
            "required": ["input_text", "toponyms"],
            "properties": {
                "input_text": {"type": "string", "description": "The text string from which to recognize and identify toponyms."},
                "toponyms": {
                    "type": "array",
                    "description": "Array of recognized and identified toponyms.",
                    "items": {
                        "type": "object",
                        "properties": {
                            "toponym": {"type": "string"},
                            "start_idx": {"type": "integer"},
                            "end_idx": {"type": "integer"},
                        },
                        "required": ["toponym", "start_idx", "end_idx"],
                        "additionalProperties": False
                    }
                }
            },
            "additionalProperties": False
        },
        "strict": True
    }

    for attempt in range(retries):
        try:
            response = client.responses.create(
                model="gpt-4.1-2025-04-14",
                instructions=extraction_instructions,
                input=chunk,
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[toponym_tool],
                temperature=1.0,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            return extract_json_from_arguments(response), offset
        except Exception as e:
            if attempt == retries - 1:
                # Last attempt: report the error, but do not wait for a retry that
                # will never happen.
                print(f"[API] Error: {e} (chunk at char {offset})")
                break
            wait = 2 ** attempt
            print(f"[API] Error: {e}\nRetrying in {wait}s (chunk at char {offset})...")
            time.sleep(wait)
    print(f"[API] Failed after retries for chunk at {offset}")
    return [], offset

# 4. Stage 1: Parallel Toponym Extraction

# ====== Load Extraction Prompt ======
with open("openai_ToponymExtraction_prompt.txt", encoding="utf-8") as prompt_file:
    extraction_instructions = prompt_file.read()

# Count the tokens once to decide whether the corpus has to be split at all.
enc = tiktoken.get_encoding("cl100k_base")
input_tokens = len(enc.encode(texts))
if input_tokens >= 3000:
    chunks = chunk_text_by_tokens(texts, max_tokens=3000, overlap_tokens=600)
    print(f"Text split into {len(chunks)} chunks for extraction.")
else:
    chunks = [(texts, 0)]
    print("Text fits in one chunk for extraction.")

# Submit one extraction request per chunk and collect the results as they finish.
from concurrent.futures import ThreadPoolExecutor, as_completed

max_workers = 20   # number of chunks processed concurrently; can adjust up/down
extracted_toponyms = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for chunk, offset in chunks:
        futures.append(
            executor.submit(
                call_api_with_retry_chunk, chunk, offset, extraction_instructions, client, 16384
            )
        )

    for future in as_completed(futures):
        toponyms_chunk, offset = future.result()

        # The parsed output is either a bare list of toponyms or a dict that wraps
        # the list under "toponyms"; any other shape contributes nothing.
        if isinstance(toponyms_chunk, list):
            found = toponyms_chunk
        elif isinstance(toponyms_chunk, dict) and "toponyms" in toponyms_chunk:
            found = toponyms_chunk["toponyms"]
        else:
            found = None

        num_found = 0
        if found is not None:
            # Shift the chunk-relative indices by the chunk's offset in the full text.
            for t in found:
                for key in ("start_idx", "end_idx"):
                    if key in t:
                        t[key] += offset
            extracted_toponyms += found
            num_found = len(found)
        print(f"Extracted {num_found} toponyms from chunk at char {offset}.")

print(f"\nStage 1 complete: Extracted {len(extracted_toponyms)} total toponym instances.")
with open("extracted_toponyms.json", "w", encoding="utf-8") as out_file:
    json.dump(extracted_toponyms, out_file, ensure_ascii=False, indent=2)

# 5. Stage 2: Parallel Toponym Analysis

# ====== Load Analysis Prompt ======
# The Stage 2 instruction text lives in its own file; read it once up front.
with open("openai_ToponymEmotionAnalysis_prompt.txt", encoding="utf-8") as prompt_file:
    analysis_instructions = prompt_file.read()

def _locate_toponym(texts, toponym_str, hint=None):
    """Return the index of the case-insensitive occurrence of `toponym_str` in `texts` nearest to `hint` (first occurrence when `hint` is not an int), or -1 if absent."""
    haystack = texts.lower()
    needle = toponym_str.lower()
    if isinstance(hint, bool) or not isinstance(hint, int):
        return haystack.find(needle)

    hint = min(max(hint, 0), len(haystack))
    if haystack.startswith(needle, hint):
        return hint
    # Closest occurrence starting before the hint, and closest starting after it.
    before = haystack.rfind(needle, 0, hint + len(needle))
    after = haystack.find(needle, hint)
    candidates = [pos for pos in (before, after) if pos != -1]
    if not candidates:
        return -1
    return min(candidates, key=lambda pos: abs(pos - hint))


def call_api_with_retry_analysis(
    toponym_obj,
    texts,
    client,
    analysis_instructions,
    max_output_tokens=2048,
    retries=4,
):
    """
    Resolve one toponym instance and detect the emotion around it, retrying on errors.

    The instance is located in ``texts`` near its ``start_idx`` (when one is present),
    so repeated mentions of the same place each get their own surrounding context
    instead of all sharing the context of the first mention.  Up to 600 characters on
    either side are sent as ``original_text``, and ``original_range`` carries the
    start/end character position of the toponym itself in ``texts``.  If the toponym
    cannot be found, the whole text is sent and ``original_range`` is ``[0, 0]``.

    Any exception raised by the request or by decoding the response triggers another
    attempt after an exponentially growing pause (1s, 2s, 4s, ...); no pause is taken
    after the final attempt.

    Args:
        toponym_obj: Dict for one extracted toponym; must contain ``"toponym"`` and may
            contain ``"start_idx"``.
        texts: The full text the toponym was extracted from.
        client: Object exposing ``responses.create(...)``.
        analysis_instructions: Instruction text sent as ``instructions``.
        max_output_tokens: Output-token cap for the request.
        retries: Maximum number of attempts.

    Returns:
        ``(parsed_output, toponym_str)``.  When every attempt failed, ``parsed_output``
        is ``{"toponym": toponym_str, "error": "Failed after retries"}``.
    """
    toponym_str = toponym_obj["toponym"]
    idx = _locate_toponym(texts, toponym_str, toponym_obj.get("start_idx"))
    if idx == -1:
        context = ""  # fallback: the whole text is sent instead (see user_input below)
        original_range = [0, 0]
    else:
        # Start with a large context; the tool description asks the model to try
        # several context sizes within it.
        window = 600  # You may increase this further
        toponym_end = idx + len(toponym_str)
        context = texts[max(0, idx - window):min(len(texts), toponym_end + window)]
        original_range = [idx, toponym_end]
    user_input = {
        "original_text": context if context else texts,
        "toponym_instances": [{**toponym_obj, "original_range": original_range}]
    }

    # The tool schema is the same for every attempt, so build it once.
    analysis_tool = {
        "type": "function",
        "name": "resolve_toponyms_and_detect_emotions",
        "description": (
            "Given the user input of the original text and extracted toponyms, "
            "determine latitude and longitude of each toponym and perform emotion detection. "
            "Try multiple possible context window sizes (~different context lengths) for each toponym and "
            "return the window (context) that maximizes the confidence score for the most likely detected emotion."
        ),
        "parameters": {
            "type": "object",
            "required": ["original_text", "toponym_instances"],
            "properties": {
                "original_text": {"type": "string", "description": "The text string from which to disambiguate toponyms and utilize their surrounding context."},
                "toponym_instances": {
                    "type": "array",
                    "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
                    "items": {
                        "type": "object",
                        "required": [
                            "toponym", "resolved_name", "latitude",
                            "longitude", "emotion", "confidence_score",
                            "context", "context_length", "original_range"
                        ],
                        "properties": {
                            "toponym": {"type": "string", "description": "The name of the toponym as found in the previous step."},
                            "resolved_name": {"type": "string", "description": "The name of the resolved toponym as identified and disambiguated."},
                            "latitude": {"type": "number", "description": "The latitude coordinate of the toponym."},
                            "longitude": {"type": "number", "description": "The longitude coordinate of the toponym."},
                            "emotion": {"type": "string", "description": "The most likely detected emotion around the toponym.", "enum": [
                                "anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"
                            ]},
                            "confidence_score": {"type": "number", "description": "The confidence score for the detected emotion, on a scale of 0 to 1."},
                            "context": {"type": "string", "description": "The text block surrounding the toponym used for emotion detection, whose length is determined based on trying different lengths and seeing which one gives the highest confidence score for the most likely detected emotion."},
                            "context_length": {"type": "number", "description": "The length, in characters including spaces, of the final text block surrounding the toponym used for emotion detection."},
                            "original_range": {"type": "array", "description": "The original start and end position in the text of the toponym.", "items": {"type": "number"}}
                        },
                        "additionalProperties": False,
                    },
                }
            },
            "additionalProperties": False,
        },
        "strict": True
    }

    for attempt in range(retries):
        try:
            response = client.responses.create(
                model="gpt-4.1-2025-04-14",
                instructions=analysis_instructions,
                input=json.dumps(user_input),
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[analysis_tool],
                temperature=1,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            return extract_json_from_arguments(response), toponym_str
        except Exception as e:
            if attempt == retries - 1:
                # Last attempt: report the error, but do not wait for a retry that
                # will never happen.
                print(f"[API] Analysis error for '{toponym_str}': {e}")
                break
            wait = 2 ** attempt
            print(f"[API] Analysis error for '{toponym_str}': {e}\nRetrying in {wait}s...")
            time.sleep(wait)
    print(f"[API] Analysis failed after retries for '{toponym_str}'.")
    return {"toponym": toponym_str, "error": "Failed after retries"}, toponym_str

# Run Stage 2 in Parallel

# ---- Load the extracted_toponyms ----
with open("extracted_toponyms.json", encoding="utf-8") as toponyms_file:
    extracted_toponyms = json.load(toponyms_file)

analysis_results = []
max_workers_analysis = 20  # You can go higher if needed

# One analysis request per extracted toponym; results are gathered as they complete.
with ThreadPoolExecutor(max_workers=max_workers_analysis) as executor:
    futures = []
    for t in extracted_toponyms:
        futures.append(
            executor.submit(
                call_api_with_retry_analysis, t, texts, client, analysis_instructions, 2048
            )
        )

    for future in as_completed(futures):
        batch_result, toponym_str = future.result()
        wrapped = isinstance(batch_result, dict) and "toponym_instances" in batch_result
        if isinstance(batch_result, list):
            # A bare list of analysed instances.
            analysis_results += batch_result
            print(f"Analyzed: {toponym_str} (got list)")
        elif wrapped:
            # The usual shape: instances wrapped under "toponym_instances".
            analysis_results += batch_result["toponym_instances"]
            print(f"Analyzed: {toponym_str} (from .toponym_instances)")
        else:
            # Error records and anything unexpected are kept as a single entry.
            analysis_results.append(batch_result)
            print(f"Analyzed: {toponym_str} (error or unexpected shape)")

with open("analysis_results.json", "w", encoding="utf-8") as results_file:
    json.dump(analysis_results, results_file, ensure_ascii=False, indent=2)
print(f"\nStage 2 complete: Produced {len(analysis_results)} detailed toponym analyses.")


Text split into 5 chunks for extraction.
Extracted 0 toponyms from chunk at char 18995.
Extracted 0 toponyms from chunk at char 0.
Extracted 58 toponyms from chunk at char 9305.
Extracted 36 toponyms from chunk at char 37267.
Extracted 53 toponyms from chunk at char 28243.

Stage 1 complete: Extracted 147 total toponym instances.
Analyzed: Washington (from .toponym_instances)
Analyzed: France (from .toponym_instances)
Analyzed: Romania (from .toponym_instances)
Analyzed: Chambon (from .toponym_instances)
Analyzed: France (from .toponym_instances)
Analyzed: Haute-Loire (from .toponym_instances)
Analyzed: Chambon-sur-Lignon (from .toponym_instances)
Analyzed: Europe (from .toponym_instances)
Analyzed: Germany (from .toponym_instances)
Analyzed: Saint-Etienne (from .toponym_instances)
Analyzed: Eastern Europe (from .toponym_instances)
Analyzed: Maison des Roches (from .toponym_instances)
Analyzed: Maison des Roches (from .toponym_instances)
Analyzed: Belgium (from .toponym_instances)
Anal

In [30]:
# Take response output in json format, put into a dataframe, then assign numeric values 
# to the detected emotions.

df = pd.DataFrame(analysis_results)

# Emotion label -> code.  The codes are strings, and rows whose emotion matches none of
# the labels receive "Unknown".
emotion_codes = {
    "anger": "0",
    "disgust": "1",
    "fear": "2",
    "joy": "3",
    "neutral": "4",
    "sadness": "5",
    "surprise": "6",
}
conditions = [df["emotion"] == label for label in emotion_codes]
values = list(emotion_codes.values())
df["emotion_numeric"] = np.select(conditions, values, default="Unknown")

df

Unnamed: 0,toponym,resolved_name,latitude,longitude,emotion,confidence_score,context,context_length,original_range,emotion_numeric
0,Washington,,0.000000,0.000000,neutral,0.00,,0,"[10816, 10826]",4
1,France,,0.000000,0.000000,neutral,0.00,,0,"[954, 960]",4
2,Romania,Romania,45.943200,24.966800,sadness,0.91,biggest tragedy of that time was to have those...,691,"[9668, 9675]",5
3,Chambon,"Le Chambon-sur-Lignon, Haute-Loire, France",45.058600,4.293600,neutral,0.98,"At the same time, the thing from the UGIF [Uni...",600,"[10863, 10870]",4
4,France,France,46.603354,1.888334,neutral,0.81,We were told that we'll have to get out in two...,698,"[954, 2160]",4
...,...,...,...,...,...,...,...,...,...,...
142,Gurs,"Gurs internment camp, Pyrénées-Atlantiques, Fr...",43.293889,-0.676389,sadness,0.94,I found an Austrian with whom I get along well...,382,"[31697, 31701]",5
143,Swiss Red Cross,Swiss Red Cross,46.798562,8.231974,joy,0.84,Her name was Hirsch. And she took us by train ...,417,"[30234, 30249]",3
144,France,France,46.603354,1.888334,neutral,0.95,We were told that we'll have to get out in two...,635,"[954, 960]",4
145,University of Lyon,University of Lyon,45.750300,4.852700,neutral,0.97,who was also a professor of economics of the U...,588,"[30311, 30329]",4


In [32]:
# Export the results table to CSV: header row included, DataFrame index omitted.
# encoding="utf-8-sig" prefixes the file with a UTF-8 byte-order mark — presumably so that
# spreadsheet tools detect the encoding; confirm this is intended.

df.to_csv("Results9_ToponymsEmotions_smallSubCorpus.csv", encoding="utf-8-sig", index=False, header=True, mode="w+")