In [1]:
# Report whether PyTorch can see a CUDA-capable GPU; the cell displays True or False.
# NOTE(review): no other cell visible here uses torch after this check, so
# confirm whether a GPU is actually needed for this notebook.

import torch
torch.cuda.is_available()

True

In [2]:
# Bring in the sample dataset, the smaller sub-corpus.

import os
# Directory that is scanned for .txt files by read_txt_files(path).
# NOTE(review): "YOUR_DATA_test" looks like a placeholder -- point it at the
# folder that holds the sub-corpus before running.
path = "YOUR_DATA_test"

def read_txt_files(directory, separator=""):
    """Return the combined contents of every ``.txt`` file in ``directory``.

    Files are read as UTF-8 and concatenated in sorted filename order, so the
    combined text is the same on every run and every machine
    (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.
        separator: String placed between the contents of consecutive files.
            Defaults to ``""``, i.e. files are joined back to back; pass
            ``"\\n"`` to keep the last word of one file from fusing with the
            first word of the next.

    Returns:
        One string with the contents of all matching files, or ``""`` when
        the directory contains no ``.txt`` files.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf8") as f:
            chunks.append(f.read())

    # A single join is linear in the total text size; repeated ``+`` on a
    # growing string re-copies everything read so far on each iteration.
    return separator.join(chunks)

# Combine every .txt file found in `path` into one string; this string is
# what is later passed to the model as its input.
texts = read_txt_files(path)

In [3]:
"""
Developing function to identify and resolve toponyms, and detect emotions in context 
on either side of each toponym.  Context length is based on trying different lengths,
with the final context length chosen based on which gives the most likely detected emotion
with the highest confidence score.

"""

# Make the OpenAI API key available through the OPENAI_API_KEY environment
# variable so that the model can be accessed, without storing the secret in
# the notebook itself.  A key that is already set in the environment is kept
# as is; otherwise it is requested interactively (getpass does not echo the
# typed value, so it does not end up in the cell output).

import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Access libraries
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json
import numpy as np

# Create the API client used for the model request below.

client = OpenAI()

# Load the text of the "Instructions" prompt file; it is supplied to the
# model as the `instructions` of the request.

with open("openai_EmotionSpatialDetect4_prompt.txt", encoding="utf-8") as prompt_file:
    instructions = prompt_file.read()

# Ask the model to recognize and resolve the toponyms in `texts` and to detect
# the emotion in the context on either side of each toponym.  The context
# length is not fixed: per the schema below, the model reports the context it
# settled on together with its length in characters.

# Function-tool definition: the JSON schema the model has to fill in.
# NOTE(review): "input_text" is a required field, so the model has to repeat
# the input text inside its answer, which counts against max_output_tokens --
# confirm this is intended for larger corpora.
toponym_emotion_tool = {
    "type": "function",
    "name": "identify_toponyms_and_emotions",
    "description": "Read the user input text string and identify all the toponyms in the text, disambiguate their locations, and perform emotion detection on surrounding context for all toponyms.",
    "parameters": {
        "type": "object",
        "required": [
            "input_text",
            "toponyms"
        ],
        "properties": {
            "input_text": {
                "type": "string",
                "description": "The text string from which to identify toponyms and their surrounding context."
            },
            "toponyms": {
                "type": "array",
                "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "The name of the toponym identified."
                        },
                        "latitude": {
                            "type": "number",
                            "description": "The latitude coordinate of the toponym."
                        },
                        "longitude": {
                            "type": "number",
                            "description": "The longitude coordinate of the toponym."
                        },
                        "emotion": {
                            "type": "string",
                            "description": "The most likely detected emotion around the toponym.",
                            "enum": [
                                "anger",
                                "disgust",
                                "fear",
                                "joy",
                                "sadness",
                                "surprise",
                                "neutral"
                            ]
                        },
                        "confidence_score": {
                            "type": "number",
                            "description": "The confidence score for the detected emotion, on a scale of 0 to 1."
                        },
                        "context": {
                            "type": "string",
                            "description": "The block of text surrounding the toponym used for emotion detection, whose length is determined based on trying different lengths and seeing which one gives the highest confidence score for the most likely detected emotion."
                        },
                        "context_length": {
                            "type": "number",
                            "description": "The length, in characters including spaces, of the final block of text surrounding the toponym used for emotion detection."
                        }
                    },
                    "required": [
                        "name",
                        "latitude",
                        "longitude",
                        "emotion",
                        "confidence_score",
                        "context",
                        "context_length"
                    ],
                    "additionalProperties": False
                }
            }
        },
        "additionalProperties": False
    },
    "strict": True
}

response = client.responses.create(
    model="gpt-4.1-2025-04-14",
    instructions=instructions,
    input=texts,
    text={"format": {"type": "text"}},
    reasoning={},
    tools=[toponym_emotion_tool],
    # Force the model to answer through the function.  Without this it is
    # free to reply with a plain text message, in which case the response
    # carries no function-call arguments to parse afterwards.
    tool_choice={"type": "function", "name": toponym_emotion_tool["name"]},
    temperature=1,
    max_output_tokens=32768,
    top_p=1,
    store=True,
)

In [4]:
# Take the function call from the response, parse its JSON arguments, and put
# the toponym records into a dataframe.  Then add a code for each detected
# emotion.

# Find the function call by its type instead of assuming it is the first
# output item: a response can also contain other item types (e.g. a message).
function_call = next(
    (item for item in response.output if getattr(item, "type", None) == "function_call"),
    None,
)
if function_call is None:
    raise ValueError(
        "The model response contains no function call, so there are no "
        "toponym results to parse."
    )

output = json.loads(function_call.arguments)

# The records are flat dictionaries, so a plain DataFrame is enough.  Naming
# the columns keeps their order fixed and keeps them present even when the
# model found no toponyms at all.
toponym_columns = [
    "name",
    "latitude",
    "longitude",
    "emotion",
    "confidence_score",
    "context",
    "context_length",
]
df = pd.DataFrame(output["toponyms"], columns=toponym_columns)

# Emotion label -> code, numbered in alphabetical order of the labels.  The
# codes are kept as strings, and anything outside the mapping becomes
# "Unknown".
emotion_codes = {
    "anger": "0",
    "disgust": "1",
    "fear": "2",
    "joy": "3",
    "neutral": "4",
    "sadness": "5",
    "surprise": "6",
}
df["emotion_numeric"] = df["emotion"].map(emotion_codes).fillna("Unknown")

df

Unnamed: 0,name,latitude,longitude,emotion,confidence_score,context,context_length,emotion_numeric
0,Haute-Loire,45.0667,3.91667,sadness,0.91,"Monday, September 21, 1942 I am sad and weary....",323,5
1,La Rouvière,44.036585,4.422371,sadness,0.85,"Monday, January 4, 1943 Mr. Brémond came to La...",208,5
2,Les Caillols,43.31667,5.46667,sadness,0.82,"Tuesday, January 5, 1943 [Les Caillols] Mrs. B...",410,5
3,Les Grillons,45.059511,4.283306,neutral,0.74,"She gave this letter to a young girl, Simone F...",357,4
4,Marseilles,43.296482,5.36978,fear,0.89,"When I returned from Marseilles at noon, Mrs. ...",165,2
5,Le Chambon sur Lignon,45.06081,4.302941,fear,0.86,"Saturday, January 16, 1943 [ Le Chambon sur Li...",564,2
6,Marseilles,43.296482,5.36978,sadness,0.85,It is much colder here than in Marseilles. I a...,227,5
7,Gurs,43.32383,-0.734628,sadness,0.92,I found an Austrian with whom I get along well...,278,5
8,Vichy,46.12868,3.42643,neutral,0.6,His father [is] in [the French internment camp...,339,4


In [5]:
# Save the results dataframe as a CSV file: header row included, index
# column left out.

results_csv = "Results4_ToponymsEmotions_smallSubCorpus.csv"
df.to_csv(
    results_csv,
    mode="w+",
    encoding="utf-8-sig",
    header=True,
    index=False,
)