In [1]:
# Check whether PyTorch can see a CUDA-capable GPU; True means one is available.

import torch
torch.cuda.is_available()

True

In [9]:
# Bring in the sample dataset, the smaller sub-corpus.

import os
path = "YOUR_DATA_test"

def read_txt_files(directory):
    """Read every ``.txt`` file in ``directory`` and return their contents as one string.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the combined text (and therefore any
    chunk boundaries computed from it later) could differ between runs or
    machines. File contents are concatenated with no separator between them.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The concatenated contents of all ``.txt`` files, or ``""`` if there are none.
    """
    parts = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf8") as f:
                parts.append(f.read())
    # Join once at the end instead of growing a string inside the loop.
    return "".join(parts)

texts = read_txt_files(path)

In [10]:
"""
Developing function to identify and resolve toponyms, and detect emotions in context 
on either side of each toponym.  Context length is based on trying different lengths,
with the final context length chosen based on which gives the most likely detected emotion
with the highest confidence score.

"""
# Access libraries
import getpass
import hashlib
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel

# Make the OpenAI API key available to the client without hardcoding it in the
# notebook: reuse OPENAI_API_KEY when it is already set in the environment,
# otherwise prompt for it (the typed value is not echoed or saved in the file).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
client = OpenAI()

# Alternative data for testing and to DEBUG:
#texts = "I traveled from Paris to Berlin and saw New York on the way.  It was fantastic.  I was so happy."

# ========== Robust OpenAI Output Extraction ==========
def extract_json_from_arguments(response):
    """
    Robust extraction for OpenAI responses.

    Every item in ``response.output`` is examined, not just the first one, so a
    function call that is preceded by another output item is still found.
    Function-call items (those exposing ``.arguments``) take priority over
    plain text items.

    Args:
        response: Object returned by the OpenAI client (anything exposing
            ``.output`` or ``.outputs`` in the shapes handled below).

    Returns:
        The decoded JSON payload (dict or list), the ``arguments`` value as-is
        when it is not a string, or ``[]`` when nothing usable is found or the
        JSON cannot be parsed.
    """
    output_items = getattr(response, "output", None) or []

    # Case 1a: function call pattern -- first item that carries `.arguments`.
    for item in output_items:
        if hasattr(item, "arguments"):
            arguments_string = item.arguments
            if isinstance(arguments_string, (str, bytes)):
                try:
                    return json.loads(arguments_string)
                except (ValueError, TypeError) as e:
                    # Typically a truncated or otherwise malformed JSON string.
                    print(f"JSON parsing error: {e}")
                    return []
            # If already parsed (rare)
            return arguments_string

    # Case 1b: classic text response -- first item with non-empty text content.
    for item in output_items:
        content = getattr(item, "content", None)
        if content:
            text_fragment = getattr(content[0], "text", None)
            if text_fragment:
                try:
                    return json.loads(text_fragment)
                except (ValueError, TypeError) as e:
                    print(f"JSON parsing error (text): {e}\nTEXT: {text_fragment}")
                    return []

    # Case 2: Tool-style .outputs
    outputs = getattr(response, "outputs", None)
    if outputs and hasattr(outputs[0], "arguments"):
        arguments = outputs[0].arguments
        if arguments is not None:
            return arguments

    print("No recognizable output format found in OpenAI response.")
    return []

# 2. Character-based Chunking (if needed).
# Reduced number of characters to just 600, with no overlap at all.  

def chunk_text_by_chars(text, chunk_size=600, overlap=0):
    """Split ``text`` into consecutive character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous one;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of ``(chunk_text, start_char)`` tuples, where ``start_char`` is
        the offset of the chunk within ``text``. Empty text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap of at least ``chunk_size`` would never advance through the
            text, and a negative overlap would leave gaps between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_len = len(text)
    step = chunk_size - overlap
    for start_char in range(0, text_len, step):
        end_char = min(start_char + chunk_size, text_len)
        chunks.append((text[start_char:end_char], start_char))
        if end_char == text_len:
            # The final window already reaches the end; stop so that no
            # trailing chunk made purely of overlap is emitted.
            break
    return chunks

# 3. API Call with Retry for Thread Use

# JSON-schema definition of the function tool the model is required to call
# during toponym extraction. Built once because it is identical for every chunk.
_TOPONYM_EXTRACTION_TOOL = {
    "type": "function",
    "name": "recognize_toponyms",
    "description": "Given the user input text, identify all the toponyms in the text.",
    "parameters": {
        "type": "object",
        "required": ["input_text", "toponyms"],
        "properties": {
            "input_text": {
                "type": "string",
                "description": "The text string from which to recognize and identify toponyms."
            },
            "toponyms": {
                "type": "array",
                "description": "Array of recognized and identified toponyms.",
                "items": {
                    "type": "object",
                    "properties": {
                        "toponym": {"type": "string"}
                    },
                    "required": ["toponym"],
                    "additionalProperties": False
                }
            }
        },
        "additionalProperties": False
    },
    "strict": True
}


def call_api_with_retry_chunk(chunk, extraction_instructions, client, max_output_tokens=2048, retries=4,
                              model="gpt-4.1-2025-04-14"):
    """Send one text chunk to the model for toponym extraction, retrying on errors.

    Args:
        chunk: The text to analyse.
        extraction_instructions: Prompt passed as the request's ``instructions``.
        client: Client object exposing ``responses.create(...)``.
        max_output_tokens: Output token limit forwarded to the API.
        retries: Total number of attempts before giving up.
        model: Model identifier forwarded to the API.

    Returns:
        Whatever ``extract_json_from_arguments`` returns for the response, or
        ``[]`` when every attempt raised an exception.
    """
    # print("Chunk being sent:", repr(chunk))     # Can use this to DEBUG chunk issues
    for attempt in range(retries):
        try:
            response = client.responses.create(
                model=model,
                instructions=extraction_instructions,
                input=chunk,
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[_TOPONYM_EXTRACTION_TOOL],
                temperature=0,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            # To DEBUG responses (or the lack thereof), temporarily print
            # `response` and the extracted payload here before returning.
            return extract_json_from_arguments(response)
        except Exception as e:
            if attempt == retries - 1:
                # Last attempt: nothing left to retry, so do not sleep.
                print(f"[API] Error: {e}")
                break
            wait = 2 ** attempt  # exponential back-off: 1s, 2s, 4s, ...
            print(f"[API] Error: {e}\nRetrying in {wait}s...")
            time.sleep(wait)
    print("[API] Failed after retries.")
    return []

# 4. Stage 1: Parallel Toponym Extraction

# ====== Load Extraction Prompt ======
with open("openai_ToponymExtraction_prompt_complicated_25.txt", encoding="utf-8") as f:
    extraction_instructions = f.read()

# ====== Chunk Input ======
# Chunk via characters; the start offsets returned with each chunk are not used here.
chunks = chunk_text_by_chars(texts, chunk_size=600, overlap=0)
print(f"Text split into {len(chunks)} char-based chunks for extraction.")

# ====== Run Extraction in Parallel ======
max_workers = 20   # safe for modern high-tier; can adjust up/down
extracted_toponyms = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(
            call_api_with_retry_chunk, chunk, extraction_instructions, client
        )
        for chunk, _ in chunks
    ]
    # Collect results in submission (i.e. chunk) order rather than completion
    # order. All requests still run concurrently, but the resulting list is the
    # same on every run, which matters because the later deduplication step
    # depends on the order of its input.
    for future in futures:
        result = future.result()
        # print("DEBUG:", result)          # Can use this if need to DEBUG results
        if isinstance(result, dict) and "toponyms" in result:
            extracted_toponyms += result["toponyms"]
        elif isinstance(result, list):
            extracted_toponyms += result
        else:
            print("Warning: Unexpected result format", result)

print(f"\nInitial extraction stage complete: Got {len(extracted_toponyms)} toponym instances (with possible duplicates).")

# ------------------ DEDUPLICATION STEP --------------------
def get_local_context(text, name, window=50):
    """Return the text surrounding the first case-insensitive match of ``name``.

    The slice spans up to ``window`` characters before the match and up to
    ``window`` characters after it, clipped to the bounds of ``text``. When
    ``name`` does not occur at all, the first ``2 * window`` characters of
    ``text`` are returned instead.
    """
    position = text.lower().find(name.lower())
    if position < 0:
        # No match anywhere: fall back to the opening stretch of the text.
        return text[:2 * window]
    left = max(position - window, 0)
    right = min(position + len(name) + window, len(text))
    return text[left:right]

def deduplicate_by_fuzzy_context_and_longest(toponym_list, texts, window=50, similarity=0.90,
                                             keep_all_distinct=False):
    """
    Group extracted toponyms by fuzzy context similarity and collapse each group.

    Each toponym's context is the window around the FIRST occurrence of its
    name in ``texts`` (see ``get_local_context``). Entries are scanned in list
    order: the first unused entry starts a group and absorbs every later
    unused entry whose context has a ``SequenceMatcher`` ratio of at least
    ``similarity``.

    Can be more aggressive with wider window & lower similarity threshold!!

    Args:
        toponym_list: Iterable of dicts with a string ``"toponym"`` key.
            Entries that do not have that shape are skipped with a warning.
        texts: The full source text used to look up contexts.
        window: Characters of context taken on each side of the name.
        similarity: Minimum context similarity (0..1) for two entries to be
            treated as the same group.
        keep_all_distinct: When False (default), only the single longest name
            of each group is kept. NOTE: two different place names that sit
            close together in the text share most of their context and can
            therefore fall into one group, in which case the shorter name is
            dropped. When True, every name in the group is kept unless it is
            equal to, or a substring of, a longer name already kept.

    Returns:
        List of the surviving original toponym objects.
    """
    items = []
    for t in toponym_list:
        raw_name = t.get("toponym") if isinstance(t, dict) else None
        if not isinstance(raw_name, str):
            print(f"Warning: skipping malformed toponym entry {t!r}")
            continue
        name = raw_name.strip()
        items.append((name, get_local_context(texts, name, window), t))

    groups = []
    used = set()
    for i, (name_i, ctx_i, obj_i) in enumerate(items):
        if i in used:
            continue
        used.add(i)
        group = [(name_i, obj_i)]
        if ctx_i:
            for j in range(i + 1, len(items)):
                if j in used:
                    continue
                name_j, ctx_j, obj_j = items[j]
                if not ctx_j:
                    continue
                matcher = SequenceMatcher(None, ctx_i, ctx_j)
                # real_quick_ratio() and quick_ratio() are cheap upper bounds
                # on ratio(); checking them first skips the expensive exact
                # computation for clearly dissimilar pairs without changing
                # which pairs are accepted.
                if (matcher.real_quick_ratio() >= similarity
                        and matcher.quick_ratio() >= similarity
                        and matcher.ratio() >= similarity):
                    group.append((name_j, obj_j))
                    used.add(j)

        # Longest (most specific) name first; the sort is stable, so ties keep
        # their original encounter order.
        group.sort(key=lambda g: len(g[0]), reverse=True)

        if not keep_all_distinct:
            # Option 1: only keep the very longest name of the group.
            groups.append(group[0][1])
            continue

        # Option 2: keep every name that is not already covered by a kept name.
        kept_names = []
        for name, obj in group:
            # Kept names are at least as long as `name`, so `in` covers both
            # exact duplicates and substrings of a longer kept name.
            if any(name in kept for kept in kept_names):
                continue
            kept_names.append(name)
            groups.append(obj)
    return groups
    
# Record the list size before and after deduplication so the reduction can be reported.
before = len(extracted_toponyms)
extracted_toponyms = deduplicate_by_fuzzy_context_and_longest(
    extracted_toponyms,
    texts,
    window=50,
    similarity=0.90,
)
after = len(extracted_toponyms)
print(f"Deduplicated toponyms: {before} → {after}")

# ------------------ END DEDUPLICATION STEP --------------------


# Persist the deduplicated Stage 1 output as pretty-printed, non-ASCII-escaped JSON.
with open("extracted_toponyms.json", mode="w", encoding="utf-8") as f:
    json.dump(extracted_toponyms, f, ensure_ascii=False, indent=2)
print(f"\nStage 1 complete: Saved {len(extracted_toponyms)} unique toponym instances to file.")

# 5. Stage 2: Parallel Toponym Analysis

# ====== Load Analysis Prompt ======
with open(
    "openai_ToponymEmotionAnalysis_prompt_complicated_25.txt",
    mode="r",
    encoding="utf-8",
) as f:
    analysis_instructions = f.read()

# JSON-schema definition of the function tool the model is required to call
# during toponym resolution and emotion detection. Built once because it is
# identical for every request.
_TOPONYM_ANALYSIS_TOOL = {
    "type": "function",
    "name": "resolve_toponyms_and_detect_emotions",
    # Each fragment ends with a space so the sentences do not run together
    # when Python concatenates the adjacent string literals.
    "description": (
        "Given the user input of the original text and extracted toponyms, determine latitude and longitude of each toponym. "
        "If the toponym is in France then proceed and perform emotion detection. If not in France, then do no further action on that toponym and do not include it in your response. "
        "Try multiple possible context window sizes (~different context lengths) for each French toponym and "
        "return the window (context) that maximizes the confidence score for the most likely detected emotion."
    ),
    "parameters": {
        "type": "object",
        "required": ["original_text", "toponym_instances"],
        "properties": {
            "original_text": {"type": "string", "description": "The text string from which to disambiguate toponyms and utilize their surrounding context."},
            "toponym_instances": {
                "type": "array",
                "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
                "items": {
                    "type": "object",
                    "required": [
                        "toponym", "resolved_name", "latitude",
                        "longitude", "emotion", "confidence_score",
                        "context", "context_length", "sub_category_emotion"
                    ],
                    "properties": {
                        "toponym": {"type": "string", "description": "The name of the toponym as found in the previous step."},
                        "resolved_name": {"type": "string", "description": "The name of the resolved toponym as identified and disambiguated."},
                        "latitude": {"type": "number", "description": "The latitude coordinate of the toponym."},
                        "longitude": {"type": "number", "description": "The longitude coordinate of the toponym."},
                        "emotion": {"type": "string", "description": "The most likely detected emotion around the toponym.", "enum": [
                            "anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"
                        ]},
                        "confidence_score": {"type": "number", "description": "The confidence score for the detected emotion, on a scale of 0 to 1."},
                        "context": {"type": "string", "description": "The text block surrounding the toponym used for emotion detection, whose length is determined based on trying different lengths and seeing which one gives the highest confidence score for the most likely detected emotion."},
                        "context_length": {"type": "number", "description": "The length, in characters including spaces, of the final text block surrounding the toponym used for emotion detection."},
                        "sub_category_emotion": {
                            "type": "string",
                            "description": "For each of the emotions that you concluded (anger, disgust, fear, joy, sadness, surprise, neutral), tell us which of the following sub-category of that emotion is most likely, using the Ekman emotion classification system",
                            "enum": [
                                "Annoyance", "Frustration", "Exasperation", "Argumentativeness", "Bitterness", "Vengefulness", "Fury",
                                "Dislike", "Aversion", "Distaste", "Repugnance", "Revulsion", "Abhorrence", "Loathing",
                                "Trepidation", "Nervousness", "Anxiety", "Dread", "Desperation", "Panic", "Horror", "Terror",
                                "Sensory Pleasure", "Rejoicing", "Compassion/Joy", "Amusement", "Schadenfreude", "Relief", "Peace", "Pride", "Fiero", "Naches", "Wonder", "Excitement", "Ecstasy",
                                "Disappointment", "Discouragement", "Distraughtness", "Resignation", "Helplessness", "Hopelessness", "Misery", "Despair", "Grief", "Sorrow", "Anguish",
                                "Surprise",
                                "Neutral"
                            ]
                        }
                    },
                    "additionalProperties": False,
                },
            }
        },
        "additionalProperties": False,
    },
    "strict": True
}


def call_api_with_retry_analysis(
    toponym_obj,
    texts,
    client,
    analysis_instructions,
    max_output_tokens=32000,
    retries=4,
    model="gpt-4.1-2025-04-14",
):
    """Ask the model to resolve one toponym and detect the emotion around it.

    Args:
        toponym_obj: Dict with at least a ``"toponym"`` key; a shallow copy is
            sent as the single entry of ``toponym_instances``.
        texts: Text sent to the model as ``original_text``.
        client: Client object exposing ``responses.create(...)``.
        analysis_instructions: Prompt passed as the request's ``instructions``.
        max_output_tokens: Output token limit forwarded to the API.
        retries: Total number of attempts before giving up.
        model: Model identifier forwarded to the API.

    Returns:
        A ``(result, toponym_str)`` tuple. ``result`` is whatever
        ``extract_json_from_arguments`` returns for the response, or
        ``{"toponym": ..., "error": "Failed after retries"}`` when every
        attempt raised an exception.
    """
    toponym_str = toponym_obj["toponym"]
    user_input = {
        "original_text": texts,
        "toponym_instances": [{**toponym_obj}]
    }
    for attempt in range(retries):
        try:
            response = client.responses.create(
                model=model,
                instructions=analysis_instructions,
                input=json.dumps(user_input),
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[_TOPONYM_ANALYSIS_TOOL],
                temperature=1,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            return extract_json_from_arguments(response), toponym_str
        except Exception as e:
            if attempt == retries - 1:
                # Last attempt: nothing left to retry, so do not sleep.
                print(f"[API] Analysis error for '{toponym_str}': {e}")
                break
            wait = 2 ** attempt  # exponential back-off: 1s, 2s, 4s, ...
            print(f"[API] Analysis error for '{toponym_str}': {e}\nRetrying in {wait}s...")
            time.sleep(wait)
    print(f"[API] Analysis failed after retries for '{toponym_str}'.")
    return {"toponym": toponym_str, "error": "Failed after retries"}, toponym_str

# Run Stage 2 in Parallel

# ---- Load the extracted_toponyms ----
with open("extracted_toponyms.json", mode="r", encoding="utf-8") as f:
    extracted_toponyms = json.loads(f.read())

# The helper below limits the context sent to the model to `window` characters
# on each side of the toponym (600 by default). This avoids the sometimes
# random 5000-character context windows the model chose on its own when calling
# the function, even though it was told not to return more than 600 characters.
def get_context(text, toponym, window=600):
    """Return up to ``window`` characters on each side of a toponym's first occurrence.

    The toponym is first searched case-insensitively as a literal string. If
    that fails, a second search treats every run of whitespace in the toponym
    as "one or more whitespace characters", so a name that is broken across a
    line break or padded with extra spaces in ``text`` is still located instead
    of triggering the whole-text fallback.

    Args:
        text: The full source text.
        toponym: The place name to locate.
        window: Number of characters to include before and after the match.

    Returns:
        The matched span plus surrounding context, clipped to the bounds of
        ``text``. If the toponym cannot be found at all, a warning is printed
        and the entire ``text`` is returned.
    """
    start_idx = text.lower().find(toponym.lower())
    end_idx = start_idx + len(toponym)
    if start_idx == -1:
        match = None
        words = toponym.split()
        if words:
            pattern = r"\s+".join(re.escape(word) for word in words)
            match = re.search(pattern, text, flags=re.IGNORECASE)
        if match is None:
            print(f"Warning: Toponym {toponym} not found in text.")
            return text
        # The matched span can be longer than the toponym itself (e.g. a
        # newline plus indentation where the toponym has a single space).
        start_idx, end_idx = match.span()
    start = max(0, start_idx - window)
    end = min(len(text), end_idx + window)
    return text[start:end]

analysis_results = []
# Keep max_workers relatively low to prevent truncation of output (which results
# in "JSON parsing errors" due to attempting this on a truncated "list" rather
# than the actual dictionary that it is). Also keeps it below rate limits.
max_workers_analysis = 6

with ThreadPoolExecutor(max_workers=max_workers_analysis) as executor:
    # Remember each request's position so results can be stored in input order.
    future_to_index = {
        executor.submit(
            call_api_with_retry_analysis, t, get_context(texts, t["toponym"]), client, analysis_instructions, 32000
        ): index
        for index, t in enumerate(extracted_toponyms)
    }
    rows_by_index = {}
    # Progress is still reported as each request finishes...
    for future in as_completed(future_to_index):
        batch_result, toponym_str = future.result()
        # Handle lists/dicts as before
        if isinstance(batch_result, list):
            rows = batch_result
            print(f"Analyzed: {toponym_str} (got list)")
        elif isinstance(batch_result, dict) and "toponym_instances" in batch_result:
            rows = batch_result["toponym_instances"]
            print(f"Analyzed: {toponym_str} (from .toponym_instances)")
        else:
            rows = [batch_result]
            print(f"Analyzed: {toponym_str} (error or unexpected shape)")
        rows_by_index[future_to_index[future]] = rows

# ...but the rows are assembled in the order of `extracted_toponyms`, so the
# saved JSON (and anything built from it) has the same row order on every run
# instead of depending on which request happened to complete first.
for index in sorted(rows_by_index):
    analysis_results += rows_by_index[index]

with open("analysis_results.json", "w", encoding="utf-8") as f:
    json.dump(analysis_results, f, ensure_ascii=False, indent=2)
print(f"\nStage 2 complete: Produced {len(analysis_results)} detailed toponym analyses.")


Text split into 77 char-based chunks for extraction.

Initial extraction stage complete: Got 165 toponym instances (with possible duplicates).
Deduplicated toponyms: 165 → 58

Stage 1 complete: Saved 58 unique toponym instances to file.
Analyzed: Gers (from .toponym_instances)
Analyzed: Auch (from .toponym_instances)
Analyzed: Le Chambon-sur-Lignon (from .toponym_instances)
Analyzed: Eastern Europe (from .toponym_instances)
Analyzed: France (from .toponym_instances)
Analyzed: Germany (from .toponym_instances)
Analyzed: Marseilles (from .toponym_instances)
Analyzed: Les Grillons (from .toponym_instances)
Analyzed: Le Chambon sur Lignon (from .toponym_instances)
Analyzed: Romania (from .toponym_instances)
Analyzed: Haute-Loire (from .toponym_instances)
Analyzed: Hungary (from .toponym_instances)
Analyzed: Cheylard (from .toponym_instances)
Analyzed: Massif Central (from .toponym_instances)
Analyzed: La Rouvière (from .toponym_instances)
Analyzed: St. Agrève (from .toponym_instances)
Anal

In [11]:
# Take response output in json format, put into a dataframe, then assign numeric values 
# to the detected emotions.

df = pd.DataFrame(analysis_results)

# Code assigned to each emotion label (kept as strings so they share a dtype
# with the "Unknown" default used for any label not listed here).
emotion_codes = {
    "anger": "0",
    "disgust": "1",
    "fear": "2",
    "joy": "3",
    "neutral": "4",
    "sadness": "5",
    "surprise": "6",
}
conditions = [df["emotion"] == label for label in emotion_codes]
values = list(emotion_codes.values())
df["emotion_numeric"] = np.select(conditions, values, default="Unknown")

df

Unnamed: 0,toponym,resolved_name,latitude,longitude,emotion,confidence_score,context,context_length,sub_category_emotion,emotion_numeric
0,Gers,Gers,43.7167,0.6,neutral,0.5,No relevant context for Gers found in the text.,0,Neutral,4
1,Auch,Auch,43.6478,0.5867,neutral,0.73,I was given a \nFrench identity card with the ...,360,Neutral,4
2,Le Chambon-sur-Lignon,,0.0,0.0,neutral,0.0,,0,Neutral,4
3,Eastern Europe,Eastern Europe,54.52596,15.25512,neutral,0.0,,0,Neutral,4
4,France,France,46.603354,1.888334,neutral,0.7,"At the same time, the thing from the UGIF [Uni...",329,Neutral,4
5,Germany,Germany,51.165691,10.451526,neutral,0.0,,0,Neutral,4
6,Marseilles,Marseille,43.296482,5.36978,neutral,0.77,We were told that we'll have to get out in two...,312,Neutral,4
7,Les Grillons,Les Grillons,45.04585,4.31139,neutral,0.81,Mr. Trocmé came on his bicycle to meet me (sti...,545,Neutral,4
8,Le Chambon sur Lignon,Le Chambon-sur-Lignon,45.06081,4.302941,neutral,0.5,"Saturday, January 16, 1943 [ Le Chambon sur Li...",523,Neutral,4
9,Romania,Romania,45.9432,24.9668,neutral,0.0,,0,Neutral,4


In [12]:
# Export results to csv

# "utf-8-sig" is the UTF-8 codec that writes a byte-order mark at the start of the file.
df.to_csv(
    "Results25_ToponymsEmotions_smallSubCorpus.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)