In [1]:
# Check whether PyTorch can see a CUDA-capable GPU. The call below only
# reports availability (True/False); it does not itself move any work to the GPU.

import torch
torch.cuda.is_available()

True

In [2]:
# Bring in the sample dataset (the smaller sub-corpus).

import os
# Directory that is scanned for .txt files by read_txt_files below.
# NOTE(review): the value looks like a placeholder — replace with the real folder.
path = "YOUR_DATA_test"

def read_txt_files(directory):
    """Read all ``.txt`` files in ``directory`` and return their contents as one string.

    Files are read in sorted filename order so the combined string (and every
    character offset later computed against it) is identical from run to run;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        The contents of every regular ``*.txt`` file, concatenated without a
        separator. An empty string if the folder holds no such files.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    pieces = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; only regular files can be read.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf8") as f:
            pieces.append(f.read())
    # Single join instead of repeated string concatenation (which is quadratic).
    return "".join(pieces)

# Combined contents of every .txt file under `path`; the later cells chunk this
# string and validate toponym character offsets against it.
texts = read_txt_files(path)

In [4]:
"""
Developing function to identify and resolve toponyms, and detect emotions in context 
on either side of each toponym.  Context length is based on trying different lengths,
with the final context length chosen based on which gives the most likely detected emotion
with the highest confidence score.

"""
# Access libraries
import os
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import tiktoken
import difflib # <-- for similarity


# Make the OpenAI API key available to the client without hard-coding it in the
# notebook. A key already exported in the environment is kept as-is (the previous
# unconditional assignment overwrote it with a placeholder); otherwise the key is
# requested interactively so it never ends up saved in the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# OpenAI() is constructed without arguments, so the key must be in the environment by now.
client = OpenAI()

# Alternative data for testing:
#texts = "I traveled from Paris to Berlin and saw New York on the way.  It was fantastic.  I was so happy."

# ========== Robust OpenAI Output Extraction ==========
def extract_json_from_arguments(response):
    """Extract the JSON payload from an OpenAI response object.

    Every item of ``response.output`` is inspected, not only the first one, so
    a function call that is preceded by another output item is still found.
    Lookup order:

    1. The first output item carrying non-None ``arguments`` (function call).
       A str/bytes value is parsed as JSON; any other value is returned as-is.
    2. The first output item whose ``content[0].text`` is non-empty, parsed as JSON.
    3. ``response.outputs[0].arguments`` (tool-style responses), returned as-is.

    Args:
        response: Object exposing ``output`` (and/or ``outputs``) attributes.

    Returns:
        The parsed payload (normally a dict or list), or ``[]`` when nothing
        usable is found or the JSON cannot be parsed.
    """
    output_items = getattr(response, "output", None) or []

    # Pass 1: function-call arguments anywhere in the output list.
    for item in output_items:
        arguments = getattr(item, "arguments", None)
        if arguments is None:
            continue
        if isinstance(arguments, (str, bytes)):
            try:
                return json.loads(arguments)
            except (ValueError, TypeError) as e:
                print(f"JSON parsing error: {e}\nARGUMENTS STRING: {arguments}")
                return []
        # If already parsed (rare)
        return arguments

    # Pass 2: classic text response.
    for item in output_items:
        content = getattr(item, "content", None)
        if not content:
            continue
        text_fragment = getattr(content[0], "text", None)
        if text_fragment:
            try:
                return json.loads(text_fragment)
            except (ValueError, TypeError) as e:
                print(f"JSON parsing error (text): {e}\nTEXT: {text_fragment}")
                return []

    # Pass 3: tool-style .outputs
    outputs = getattr(response, "outputs", None)
    if outputs and hasattr(outputs[0], "arguments"):
        arguments = outputs[0].arguments
        if arguments is not None:
            return arguments

    print("No recognizable output format found in OpenAI response.")
    return []

# 2. Token_to_Character-based Chunking (if needed).
# For iteration 13 and following, reduced tokens to 1000 and overlap to 50 to try to improve performance and reduce duplicates.

def compute_token_to_char(text, encoding):
    """Tokenize ``text`` and map every token to the character offset where it starts.

    Offsets are derived from each token's raw UTF-8 bytes. Decoding tokens one
    at a time to ``str`` is not reliable: when a multi-byte character is split
    across tokens, each fragment decodes to a replacement character, which
    inflates the running length and shifts all later offsets.

    Args:
        text: The string to tokenize.
        encoding: Tokenizer exposing ``encode(text)`` and
            ``decode_single_token_bytes(token)`` (as tiktoken encodings do).

    Returns:
        ``(all_tokens, token_to_char)`` where ``token_to_char[i]`` is the index
        in ``text`` of the character in which token ``i`` begins. A token that
        starts in the middle of a character maps to that character's index.
    """
    all_tokens = encoding.encode(text)
    token_to_char = []
    chars_seen = 0  # characters whose first byte has been consumed so far
    for tok in all_tokens:
        piece = encoding.decode_single_token_bytes(tok)
        # UTF-8 continuation bytes are 0b10xxxxxx; a token starting with one
        # belongs to the character opened by the previous token.
        starts_mid_char = bool(piece) and 0x80 <= piece[0] < 0xC0
        token_to_char.append(max(0, chars_seen - 1) if starts_mid_char else chars_seen)
        # Every non-continuation byte opens exactly one new character.
        chars_seen += sum(1 for b in piece if not 0x80 <= b < 0xC0)
    return all_tokens, token_to_char
    
def chunk_text_by_tokens_precise(text, max_tokens=1000, overlap_tokens=50):
    """Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    Chunk boundaries are chosen on token positions (``cl100k_base`` encoding)
    and translated to character offsets, so every chunk is an exact slice of
    ``text``.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap_tokens: Tokens shared between consecutive chunks; must satisfy
            ``0 <= overlap_tokens < max_tokens``.

    Returns:
        List of ``(chunk_text, start_char, end_char, start_token, end_token)``
        tuples; empty when ``text`` produces no tokens.

    Raises:
        ValueError: If the arguments would give a non-positive step (which
            previously looped forever) or a negative overlap (which would
            leave gaps between chunks).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap_tokens < max_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_tokens")

    enc = tiktoken.get_encoding("cl100k_base")
    all_tokens, token_to_char = compute_token_to_char(text, enc)
    n = len(all_tokens)
    step = max_tokens - overlap_tokens
    chunks = []
    for start_token in range(0, n, step):
        end_token = min(start_token + max_tokens, n)
        start_char = token_to_char[start_token]
        # The last chunk runs to the end of the text; others end where the next token starts.
        end_char = token_to_char[end_token] if end_token < n else len(text)
        chunks.append((text[start_char:end_char], start_char, end_char, start_token, end_token))
        if end_token == n:
            # Everything is covered; stop so no redundant overlap-only chunk is added.
            break
    return chunks


# 3. API Call with Retry for Thread Use

def call_api_with_retry_chunk(chunk, offset, extraction_instructions, client, max_output_tokens=2048, retries=4):
    """Ask the model to extract toponyms from one text chunk, retrying on errors.

    Args:
        chunk: The chunk text sent as the request input.
        offset: Character offset of the chunk within the full text; it is not
            sent to the API, only echoed back so the caller can match results.
        extraction_instructions: Prompt passed as the request ``instructions``.
        client: Client object exposing ``responses.create(...)``.
        max_output_tokens: Output token cap forwarded to the API.
        retries: Total number of attempts.

    Returns:
        ``(payload, offset)`` where ``payload`` is whatever
        ``extract_json_from_arguments`` returns for the response, or ``[]``
        when every attempt raised.
    """
    for attempt in range(retries):
        try:
            response = client.responses.create(
                model="gpt-4.1-2025-04-14",
                instructions=extraction_instructions,
                input=chunk,
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[
                    {
                        "type": "function",
                        "name": "recognize_toponyms",
                        "description": "Given the user input text, identify all the toponyms in the text.",
                        "parameters": {
                            "type": "object",
                            "required": ["input_text", "toponyms"],
                            "properties": {
                                "input_text": {"type": "string", "description": "The text string from which to recognize and identify toponyms."},
                                "toponyms": {
                                    "type": "array",
                                    "description": "Array of recognized and identified toponyms.",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "toponym": {"type": "string"},
                                            "start_idx": {"type": "integer"},
                                            "end_idx": {"type": "integer"},
                                        },
                                        "required": ["toponym", "start_idx", "end_idx"],
                                        "additionalProperties": False
                                    }
                                }
                            },
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                ],
                temperature=0.25,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            return extract_json_from_arguments(response), offset
        except Exception as exc:
            # Broad catch is deliberate: any failure of this chunk (network,
            # rate limit, parsing) is retried instead of aborting the whole run.
            if attempt == retries - 1:
                # Last attempt: nothing left to retry, so do not sleep.
                print(f"[API] Error: {exc} (chunk at char {offset})")
                break
            wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            print(f"[API] Error: {exc}\nRetrying in {wait}s (chunk at char {offset})...")
            time.sleep(wait)
    print(f"[API] Failed after retries for chunk at {offset}")
    return [], offset

# 4. Stage 1: Parallel Toponym Extraction

# ====== Load Extraction Prompt ======
with open("openai_ToponymExtraction_prompt_complicated_18.txt", encoding="utf-8") as f:
    extraction_instructions = f.read()

# ====== Chunk Input ======
# Chunks are cut on token boundaries and mapped back to character offsets
# (never located with .find()). This iteration uses 1000-token chunks with a
# 50-token overlap.
enc = tiktoken.get_encoding("cl100k_base")
input_tokens = len(enc.encode(texts))
chunks = chunk_text_by_tokens_precise(texts, max_tokens=1000, overlap_tokens=50)
print(f"Text split into {len(chunks)} chunks for extraction (token-char mapped).")


# ====== Run Extraction in Parallel ======
max_workers = 20   # safe for modern high-tier; can adjust up/down
extracted_toponyms = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # One extraction request per chunk; only the chunk text and its starting
    # character offset are needed by the worker.
    futures = [
        executor.submit(
            call_api_with_retry_chunk, piece[0], piece[1], extraction_instructions, client
        )
        for piece in chunks
    ]
    for f in as_completed(futures):
        payload, offset = f.result()
        # The payload is either a bare list of toponyms or a dict wrapping it.
        if isinstance(payload, list):
            found = payload
        elif isinstance(payload, dict) and "toponyms" in payload:
            found = payload["toponyms"]
        else:
            continue
        # Shift chunk-relative indices to positions in the full text.
        for t in found:
            t["start_idx"] += offset
            t["end_idx"] += offset
        extracted_toponyms += found

print(f"\nInitial extraction stage complete: Got {len(extracted_toponyms)} toponym instances (with possible duplicates).")

# ------ STRICT INDEX/SUBSTRING VALIDATION & DEDUPLICATION STEP -------
# A duplicate is an identical toponym name found nearby (window adjustable).
def validate_and_deduplicate_toponyms(toponym_list, texts, window=40):
    """Keep toponyms whose indices really point at their text, then drop near-duplicates.

    Validation: an entry is accepted only when ``texts[start_idx:end_idx]``
    equals its ``toponym`` (ignoring case and surrounding whitespace).
    Entries with missing fields or unusable indices are reported and skipped
    instead of raising.

    Deduplication: accepted entries are processed in text order; an entry is
    dropped when an already-kept entry with the same (case-insensitive) name
    starts fewer than ``window`` characters before it. Sorting first makes the
    result independent of the order in which parallel workers delivered the
    entries.

    Args:
        toponym_list: Iterable of dicts with ``toponym``, ``start_idx`` and
            ``end_idx`` (indices into ``texts``).
        texts: The full text the indices refer to.
        window: Minimum distance, in characters, between two kept occurrences
            of the same name.

    Returns:
        List of the surviving entry dicts (the original objects), ordered by
        ``start_idx``.
    """
    # Filter for exact matches
    validated = []
    for t in toponym_list:
        try:
            name, s, e = t['toponym'], t['start_idx'], t['end_idx']
        except (KeyError, TypeError):
            print(f"BAD ENTRY (missing fields): {t!r}")
            continue
        if not (isinstance(name, str) and isinstance(s, int) and isinstance(e, int)) or s < 0 or e < s:
            # Negative or reversed indices would slice from the wrong end of the text.
            print(f"BAD INDEX: {name} @ {s}-{e} | Unusable indices")
            continue
        extract = texts[s:e]
        # Only accept if exact match:
        if extract.strip().lower() == name.strip().lower():
            validated.append(t)
        else:
            print(f"BAD INDEX: {name} @ {s}-{e} | Extracted: {repr(extract)}")

    # Dedupe: same toponym, overlapping window. In position order the closest
    # kept occurrence of a name is always the most recent one, so remembering
    # only that start index is sufficient.
    validated.sort(key=lambda t: (t['start_idx'], t['end_idx']))
    deduped = []
    last_kept_start = {}
    for t in validated:
        key = t['toponym'].strip().lower()
        previous = last_kept_start.get(key)
        if previous is not None and t['start_idx'] - previous < window:
            continue
        last_kept_start[key] = t['start_idx']
        deduped.append(t)
    return deduped

final_toponyms = validate_and_deduplicate_toponyms(extracted_toponyms, texts)
print(f"After validation + deduplication: {len(final_toponyms)} valid, unique entries.")

# ------------------ END DEDUPLICATION STEP --------------------

# Persist the validated toponyms to disk; Stage 2 reads them back from this file.
with open("extracted_toponyms.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(final_toponyms, ensure_ascii=False, indent=2))

# 5. Stage 2: Parallel Toponym Analysis

# ====== Load Analysis Prompt ======
with open("openai_ToponymEmotionAnalysis_prompt_complicated_18.txt", encoding="utf-8") as f:
    analysis_instructions = f.read()

# Load validated output (from Stage 1); this rebinds extracted_toponyms to the
# validated, de-duplicated list.
with open("extracted_toponyms.json", encoding="utf-8") as f:
    extracted_toponyms = json.loads(f.read())

def call_api_with_retry_analysis(
    toponym_obj,
    texts,
    client,
    analysis_instructions,
    max_output_tokens=2048,
    retries=4,
):
    """Resolve one toponym and detect the emotion around it, retrying on errors.

    The model receives a window of up to 300 characters on each side of the
    toponym together with the toponym's name and index range.

    Args:
        toponym_obj: Dict with ``toponym``, ``start_idx`` and ``end_idx``
            (indices into ``texts``).
        texts: The full text the indices refer to.
        client: Client object exposing ``responses.create(...)``.
        analysis_instructions: Prompt passed as the request ``instructions``.
        max_output_tokens: Output token cap forwarded to the API.
        retries: Total number of attempts.

    Returns:
        ``(payload, toponym_str)``. ``payload`` is whatever
        ``extract_json_from_arguments`` returns for the response, or
        ``{"toponym": ..., "error": "Failed after retries"}`` when every
        attempt raised.
    """
    toponym_str = toponym_obj["toponym"]
    # Use global indices, slice context
    start_idx, end_idx = toponym_obj["start_idx"], toponym_obj["end_idx"]
    # Define largest window (use LLM to narrow as needed)
    pre = 300
    post = 300
    text_len = len(texts)
    window_start = max(0, start_idx - pre)
    window_end = min(text_len, end_idx + post)
    context = texts[window_start:window_end]

    # NOTE(review): original_range holds offsets into the full text while
    # original_text is only the sliced window; the range is kept global because
    # validate_analysis_results checks the returned range against the full
    # text. Confirm the analysis prompt tells the model how to interpret it.
    user_input = {
        "original_text": context,
        "toponym_instances": [{
            "toponym": toponym_str,
            "original_range": [start_idx, end_idx]
        }]
    }
    for attempt in range(retries):
        try:
            response = client.responses.create(
                model="gpt-4.1-2025-04-14",
                instructions=analysis_instructions,
                input=json.dumps(user_input),
                text={"format": {"type": "text"}},
                reasoning={},
                tools=[{
                    "type": "function",
                    "name": "resolve_toponyms_and_detect_emotions",
                    "description": (
                        "Given the user input of the original text and extracted toponyms, "
                        "determine latitude and longitude of each toponym and perform emotion detection. "
                        "Try multiple possible context window sizes (~different context lengths) for each toponym and "
                        "return the window (context) that maximizes the confidence score for the most likely detected emotion."
                    ),
                    "parameters": {
                        "type": "object",
                        "required": ["original_text", "toponym_instances"],
                        "properties": {
                            "original_text": {"type": "string", "description": "The text string from which to disambiguate toponyms and utilize their surrounding context."},
                            "toponym_instances": {
                                "type": "array",
                                "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
                                "items": {
                                    "type": "object",
                                    "required": [
                                        "toponym", "resolved_name", "latitude",
                                        "longitude", "emotion", "confidence_score",
                                        "context", "context_length", "original_range"
                                    ],
                                    "properties": {
                                        "toponym": {"type": "string", "description": "The name of the toponym as found in the previous step."},
                                        "resolved_name": {"type": "string", "description": "The name of the resolved toponym as identified and disambiguated."},
                                        "latitude": {"type": "number", "description": "The latitude coordinate of the toponym."},
                                        "longitude": {"type": "number", "description": "The longitude coordinate of the toponym."},
                                        "emotion": {"type": "string", "description": "The most likely detected emotion around the toponym.", "enum": [
                                            "anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"
                                        ]},
                                        "confidence_score": {"type": "number", "description": "The confidence score for the detected emotion, on a scale of 0 to 1."},
                                        "context": {"type": "string", "description": "The text block surrounding the toponym used for emotion detection, whose length is determined based on trying different lengths and seeing which one gives the highest confidence score for the most likely detected emotion."},
                                        "context_length": {"type": "number", "description": "The length, in characters including spaces, of the final text block surrounding the toponym used for emotion detection."},
                                        "original_range": {"type": "array", "description": "The original start and end position in the text of the toponym.", "items": {"type": "number"}}
                                    },
                                    "additionalProperties": False,
                                },
                            }
                        },
                        "additionalProperties": False,
                    },
                    "strict": True
                }],
                temperature=1,
                tool_choice="required",
                max_output_tokens=max_output_tokens,
                top_p=1,
                store=True
            )
            return extract_json_from_arguments(response), toponym_str
        except Exception as exc:
            # Broad catch is deliberate: one failing toponym must not abort the run.
            if attempt == retries - 1:
                # Last attempt: nothing left to retry, so do not sleep.
                print(f"[API] Analysis error for '{toponym_str}': {exc}")
                break
            wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            print(f"[API] Analysis error for '{toponym_str}': {exc}\nRetrying in {wait}s...")
            time.sleep(wait)
    print(f"[API] Analysis failed after retries for '{toponym_str}'.")
    return {"toponym": toponym_str, "error": "Failed after retries"}, toponym_str

# Define function to validate results for Stage 2
def validate_analysis_results(results, texts):
    """Keep Stage 2 entries whose ``original_range`` still points at their toponym.

    For every entry that has ``context``, ``toponym`` and ``original_range``,
    the text at that range must equal the toponym (ignoring case and
    surrounding whitespace); otherwise the entry is reported and dropped.
    Range values may arrive as floats because the tool schema declares them
    as "number"; whole-number values are converted to ints before slicing,
    and anything else is treated as a mismatch rather than raising.

    Entries lacking one of the three fields (for example error placeholders)
    are passed through unchanged.

    Args:
        results: Iterable of result dicts.
        texts: The full text the ranges refer to.

    Returns:
        List of the accepted entries (the original objects, in input order).
    """
    valid = []
    for entry in results:
        if "context" in entry and "toponym" in entry and "original_range" in entry:
            raw_range = entry["original_range"]
            try:
                raw_start, raw_end = raw_range
                s, e = int(raw_start), int(raw_end)
                # Reject values that are not whole numbers (e.g. 3.5 or "3").
                if s != raw_start or e != raw_end:
                    raise ValueError("non-integer index")
            except (TypeError, ValueError, OverflowError):
                print(f"Mismatch after phase 2: {entry['toponym']} | Unusable original_range: {raw_range!r}")
                continue
            text_snip = texts[s:e]
            if text_snip.strip().lower() == entry["toponym"].strip().lower():
                valid.append(entry)
            else:
                print(f"Mismatch after phase 2: {entry['toponym']} @ {s}-{e} | Extracted: {repr(text_snip)}")
        else:
            valid.append(entry)
    return valid

# Run Stage 2 in Parallel

analysis_results = []
max_workers_analysis = 20  # You can go higher if needed

with ThreadPoolExecutor(max_workers=max_workers_analysis) as executor:
    futures = [
        executor.submit(
            call_api_with_retry_analysis, t, texts, client, analysis_instructions, 2048
        )
        for t in extracted_toponyms
    ]
    for f in as_completed(futures):
        outcome = f.result()
        # call_api_with_retry_analysis returns (payload, toponym_str). Testing
        # the tuple itself against list/dict never matched, so every result
        # used to be dropped silently.
        payload = outcome[0] if isinstance(outcome, tuple) else outcome
        if isinstance(payload, dict) and isinstance(payload.get("toponym_instances"), list):
            # The tool schema wraps the per-toponym dicts in "toponym_instances";
            # flatten them so each row is one analysed toponym.
            analysis_results += [item for item in payload["toponym_instances"] if isinstance(item, dict)]
        elif isinstance(payload, list):
            analysis_results += [item for item in payload if isinstance(item, dict)]
        elif isinstance(payload, dict):
            # e.g. the {"toponym": ..., "error": ...} placeholder for failed calls
            analysis_results.append(payload)
        else:
            print(f"Unexpected analysis result ignored: {payload!r}")

# Validate indices and context after analysis, so they always match
final_results = validate_analysis_results(analysis_results, texts)

with open("analysis_results.json", "w", encoding="utf-8") as f:
    json.dump(final_results, f, ensure_ascii=False, indent=2)
print(f"\nStage 2 complete: Produced {len(final_results)} final, validated toponym analyses.")


Text split into 13 chunks for extraction (token-char mapped).

Initial extraction stage complete: Got 159 toponym instances (with possible duplicates).
BAD INDEX: Le Chambon @ 11685-11695 | Extracted: ', because '
BAD INDEX: Washington @ 11757-11767 | Extracted: 'many times'
BAD INDEX: Chambon @ 11803-11810 | Extracted: 'lves. Y'
BAD INDEX: Saint-Etienne @ 11849-11863 | Extracted: 'getting imposs'
BAD INDEX: Le Chambon @ 12654-12664 | Extracted: ' people? W'
BAD INDEX: Chambon-sur-Lignon @ 12795-12814 | Extracted: 'stor took a whole b'
BAD INDEX: Haute-Loire @ 12822-12833 | Extracted: 'people, the'
BAD INDEX: Chambon-sur-Lignon @ 12995-13014 | Extracted: "ey \ncame to us, it'"
BAD INDEX: Chambon @ 13126-13133 | Extracted: 'u somet'
BAD INDEX: Maison des Roches @ 13175-13192 | Extracted: 'o \ndefend them. W'
BAD INDEX: Europe @ 13266-13272 | Extracted: "it's g"
BAD INDEX: Spain @ 13275-13280 | Extracted: 'g to '
BAD INDEX: Belgium @ 13282-13289 | Extracted: 've to a'
BAD INDEX: Austria 

In [8]:
# Put the validated Stage 2 results into a dataframe, then add a coded version
# of the detected emotion.

# Use final_results (the entries that passed validate_analysis_results), not the
# raw analysis_results, so rows with mismatched indices are excluded.
df = pd.DataFrame(final_results)

# Codes are kept as strings so they can share a column with the "Unknown" fallback.
emotion_codes = {
    "anger": "0",
    "disgust": "1",
    "fear": "2",
    "joy": "3",
    "neutral": "4",
    "sadness": "5",
    "surprise": "6",
}
if "emotion" in df.columns:
    # Missing or unrecognised emotions (e.g. failed API calls) become "Unknown".
    df["emotion_numeric"] = df["emotion"].map(emotion_codes).fillna("Unknown")
else:
    # No analysed rows at all: still create the column instead of raising KeyError.
    df["emotion_numeric"] = "Unknown"

df

Unnamed: 0,toponym,resolved_name,latitude,longitude,emotion,confidence_score,context,context_length,original_range,emotion_numeric
0,Drancy,Drancy,48.925278,2.445556,fear,0.98,because already in July the French started col...,393,"[37547, 37553]",2
1,France,France,46.603354,1.888334,neutral,0.76,"When I returned from Marseilles at noon, Mrs. ...",427,"[19261, 19267]",4
2,College Cevenol,Collège Cévenol,45.063200,4.302300,neutral,0.77,"I was sent to school, to a school that they ha...",287,"[4777, 4792]",4
3,Vichy,Vichy,46.126400,3.426500,joy,0.85,His father [is] in [the French internment camp...,184,"[18956, 18961]",3
4,Les Grillons,"Les Grillons, guesthouse near Le Chambon-sur-L...",45.065000,4.317000,joy,0.91,Mr. Trocmé came on his bicycle to meet me (sti...,501,"[3897, 3909]",3
...,...,...,...,...,...,...,...,...,...,...
124,Russia,Russia,61.524010,105.318756,neutral,0.66,was the home for students from all \ncountries...,359,"[228, 234]",4
125,Drancy,Drancy,48.925799,2.445120,fear,0.97,"This idyll didn't last too long, because alrea...",278,"[13393, 13399]",2
126,Chambon-sur-Lignon,Le Chambon-sur-Lignon,45.060810,4.302941,fear,0.93,"And on New Year's Eve, \nthey explained to me ...",535,"[12836, 12855]",2
127,Spain,Spain,40.463667,-3.749220,neutral,0.67,which was the home for students from all \ncou...,307,"[13110, 13115]",4


In [5]:
# Write the results dataframe to a CSV file (UTF-8 with BOM, header row, no index column).

df.to_csv(
    "Results18_ToponymsEmotions_smallSubCorpus.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)