In [1]:
# Environment sanity check: report whether PyTorch can see a CUDA device.
# NOTE(review): nothing else in the visible cells uses torch — confirm this check is still needed.
import torch
torch.cuda.is_available()

True

In [5]:
# Bring in the sample dataset, the smaller sub-corpus.

import os
# Directory containing the sub-corpus .txt files; it is passed to read_txt_files() below.
# NOTE(review): "YOUR_DATA_test" looks like a redacted placeholder — set it to the real folder.
path = "YOUR_DATA_test"

def read_txt_files(directory, separator=""):
    """Concatenate the contents of every ``.txt`` file directly inside ``directory``.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the combined text (and anything derived
    from it) could differ between machines or runs.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.
        separator: String inserted between the contents of consecutive files.
            Defaults to ``""``, i.e. files are joined back to back. Pass e.g.
            ``"\n"`` to keep the last word of one file from fusing with the
            first word of the next.

    Returns:
        A single string with the contents of all matching files, or ``""`` when
        the directory contains no ``.txt`` files.

    Raises:
        OSError: If the directory or one of the files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    txt_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".txt")
    )

    # Collect the pieces and join once; repeated `str + str` is quadratic in the corpus size.
    contents = []
    for name in txt_names:
        filepath = os.path.join(directory, name)
        with open(filepath, "r", encoding="utf8") as f:
            contents.append(f.read())
    return separator.join(contents)

# Load all .txt files under `path` into one string; `texts` is the model input in a later cell.
texts = read_txt_files(path)

In [15]:
# Identify and resolve toponyms, and detect emotions in the 150-character context
# on either side of each toponym.

# Access libraries
import getpass
import json
import os

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel

# Credentials: take the OpenAI API key from the environment instead of hardcoding it in the
# notebook, where it would be leaked as soon as the notebook is shared or committed.
# An already-configured OPENAI_API_KEY is left untouched; if none is set, prompt for it
# (getpass does not echo the key, so it does not end up in the cell output).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# No key is passed explicitly: the client is configured through OPENAI_API_KEY.
client = OpenAI()

# Load the prompt text into memory; it is passed as `instructions` to the API request below.
# The relative filename is opened as-is, so the file must be reachable from the kernel's
# current working directory.

with open("openai_EmotionSpatialDetect_prompt_22.txt", "r", encoding="utf-8") as f:
    instructions = f.read()

# Alternative data for testing:
#texts = "I know them by sight. We're becoming friends now. I'm glad not to have to travel alone. Age 16, November 1941, Le Chambon-sur-Lignon, France. I'm in Le Chambon-sur-Lignon. It's way up on a high plateau in the mountains. It's beautiful here. What an amazing difference from Gurs! I'm staying at La Guespy, which is a home for refugees.  My friend just arrived, and before she came she saw her mother for an hour that morning through the fence, and then they put them all on cattle wagons and they were deported. And now they're gone.  We are not sure, but we think they took them to Drancy.  Martin and I are working in the new carpentry shop now. I don't really like carpentry, but at least I know how to do it, and at least I am safe."

# Send the corpus to the model and have it answer through the
# `identify_toponyms_and_emotions` function tool, so the reply is structured JSON
# (toponym name, coordinates, emotion, confidence score, 150-character context)
# instead of free text. The next cell decodes the function-call arguments.

response = client.responses.create(
  model="gpt-4.1-2025-04-14",
  instructions=instructions,
  input=texts,
  text={
    "format": {
      "type": "text"
    }
  },
  reasoning={},
  tools=[
    {
      "type": "function",
      "name": "identify_toponyms_and_emotions",
      "description": "Read the input text string and identify the toponyms in the text, disambiguate their locations, and perform emotion detection on surrounding context.",
      "parameters": {
        "type": "object",
        "required": [
          "input_text",
          "toponyms"
        ],
        "properties": {
          # NOTE(review): requiring `input_text` asks the model to echo the corpus back inside
          # the arguments, which counts against max_output_tokens. Only `toponyms` is read
          # downstream — consider dropping it if the prompt file does not rely on it (confirm).
          "input_text": {
            "type": "string",
            "description": "The text string from which to identify toponyms and their surrounding context."
          },
          "toponyms": {
            "type": "array",
            "description": "Array of identified toponyms, each containing properties of location details and emotional context.",
            "items": {
              "type": "object",
              "properties": {
                "name": {
                  "type": "string",
                  "description": "The name of the toponym identified."
                },
                "latitude": {
                  "type": "number",
                  "description": "The latitude coordinate of the toponym."
                },
                "longitude": {
                  "type": "number",
                  "description": "The longitude coordinate of the toponym."
                },
                "emotion": {
                  "type": "string",
                  "description": "The most likely detected emotion around the toponym.",
                  "enum": [
                    "anger",
                    "disgust",
                    "fear",
                    "joy",
                    "sadness",
                    "surprise",
                    "neutral"
                  ]
                },
                "confidence_score": {
                  "type": "number",
                  "description": "The confidence score for the detected emotion, on a scale of 0 to 1."
                },
                "context": {
                  "type": "string",
                  "description": "The block of text (150 characters on either side) surrounding the toponym used for emotion detection."
                }
              },
              "required": [
                "name",
                "latitude",
                "longitude",
                "emotion",
                "confidence_score",
                "context"
              ],
              "additionalProperties": False
            }
          }
        },
        "additionalProperties": False
      },
      "strict": True
    }
  ],
  # Force the function call. Without this the model is free to reply with a plain text
  # message, in which case there are no function-call arguments for the next cell to parse.
  tool_choice={"type": "function", "name": "identify_toponyms_and_emotions"},
  temperature=0,
  max_output_tokens=32768,
  top_p=0.5,
  store=True
)

In [16]:
# Take the function call out of the response, decode its JSON arguments, and put the
# toponym records into a dataframe.

# A response cut off (e.g. by max_output_tokens) carries truncated JSON arguments; fail here
# with a clear message rather than with a JSON decoding error further down.
if response.status != "completed":
    raise RuntimeError(
        f"OpenAI response did not complete (status={response.status!r}, "
        f"details={response.incomplete_details!r})"
    )

# The output list may hold items other than the function call (e.g. a text message),
# so select by type instead of assuming it is at position 0.
function_calls = [item for item in response.output if item.type == "function_call"]
if not function_calls:
    raise ValueError("The model response contains no function call to parse.")

output = json.loads(function_calls[0].arguments)

# Fix the columns and their order explicitly. (`meta=` has no effect in json_normalize
# without `record_path`.) This also keeps the header when no toponyms are returned.
toponym_columns = ["name", "latitude", "longitude", "emotion", "confidence_score", "context"]
df = pd.DataFrame(output["toponyms"], columns=toponym_columns)
df

Unnamed: 0,name,latitude,longitude,emotion,confidence_score,context
0,Haute-Loire,45.1333,3.9167,neutral,0.7,During rest hour Mrs. Cavailhon called me. She...
1,La Rouvière,44.0497,4.4181,sadness,0.8,"Monday, January 4, 1943 Mr. Brémond came to La..."
2,Les Caillols,43.3047,5.4447,neutral,0.6,"Tuesday, January 5, 1943 [Les Caillols] Mrs. B..."
3,Les Grillons,45.0625,4.3042,joy,0.85,"She gave this letter to a young girl, Simone F..."
4,Marseilles,43.2965,5.3698,neutral,0.7,"Friday, January 15, 1943 We were told that we'..."
5,Le Chambon sur Lignon,45.06081,4.302941,joy,0.9,"Saturday, January 16, 1943 [ Le Chambon sur Li..."
6,St. Agrève,45.0117,4.3706,neutral,0.7,I left aboard the Marseilles to Paris express ...
7,Les Grillons,45.0625,4.3042,joy,0.9,Now we still had to cover fifteen kilometers t...
8,Marseilles,43.2965,5.3698,neutral,0.7,Every day we have to cover twelve kilometers i...
9,Le Chambon-sur-Lignon,45.06081,4.302941,fear,0.85,"And on New Year's Eve, they explained to me th..."


In [17]:
# Write the toponym/emotion table to disk, without the index column and with a header row.
# The "utf-8-sig" codec prefixes the file with a UTF-8 byte-order mark.
df.to_csv(
    "Results22C_ToponymsEmotions_smallSubCorpus.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)