In [None]:
"""
The following set of scripts is for geoparsing directly using OpenAI's GPT-4.1 for both toponym 
recognition and resolution, with NO need for separate entity linking via a knowledge base.

After enabling the GPU, it begins with loading the data for analysis.

Then there are two different cells that essentially do the same thing; the first cell (commented
out) is for when you need to stream the response because of response size issues.

Each of these cells loads the libraries, instantiates the model, loads the instructions file,
then creates and runs the function using the parameter settings indicated.

Then the responses are json normalized and then put into a pandas dataframe.

These results are then exported to a csv file.

"""

In [None]:
# Check for a CUDA-enabled GPU. The cell displays True when one is available.
# NOTE(review): nothing else in the visible notebook uses torch -- the geoparsing
# below is sent to the OpenAI API -- so this check looks optional; confirm
# before treating a False result as a problem.

import torch
torch.cuda.is_available()

In [None]:
"""
Load your data...
This creates a combined string from all .txt files in a directory, which
represents your data.
You have to use "f.read" (which returns one string), not "f.readlines"
(which returns a list of lines and cannot be appended to a string).
"""

# Establish the path to your data
# The "......" segment below is a placeholder: replace the whole value with the
# folder that holds your .txt files before running this cell.
import os
path = "C:/Users/....../Data/"

def read_txt_files(directory):
    """Read every ``.txt`` file in ``directory`` and return their contents as one string.

    Files are read in sorted filename order so that the combined text is the
    same on every run and on every machine (``os.listdir`` alone returns the
    entries in arbitrary order). The contents are concatenated directly; no
    separator is inserted between files.

    Args:
        directory: Path of the folder to scan. Only entries directly inside it
            whose names end in ".txt" are read; sub-folders are not searched.

    Returns:
        str: The concatenated text of all matching files, decoded as UTF-8, or
        an empty string when no file matches.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []

    # sorted() pins the read order; os.listdir() gives no ordering guarantee.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf8") as f:
                # read() returns the whole file as a single string.
                chunks.append(f.read())

    # Join once at the end instead of rebuilding the string for every file.
    return "".join(chunks)

# Combined contents of every .txt file found in `path`; this string is what the
# request cell further down passes to the model as `input`.
texts = read_txt_files(path)

In [None]:
"""
Use this cell if you need to stream the response because of its size.
##########      OTHERWISE - USE THE CELL BELOW     ########
"""

"""
# Set a global variable for my OpenAI API key so that the model can be accessed.

os.environ["OPENAI_API_KEY"] = "Your_OpenAI_API_Key"

# Import libraries
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json

# Instantiate the model

client = OpenAI()

# Bring "Instructions" txt file into memory for the response function to access

with open("C:/Users/....../openai_toponym_prompt.txt", "r", encoding="utf-8") as f:
    instructions = f.read()

# Create response function to recognize and resolve toponyms

stream = client.responses.create(
  model="gpt-4.1-2025-04-14",
  instructions=instructions,
  input=texts,
  text={
    "format": {
      "type": "text"
    }
  },
  reasoning={},
  tools=[
    {
      "type": "function",
      "name": "toponym_recognition_resolution",
      "strict": True,
      "parameters": {
        "type": "object",
        "required": [
          "input_string",
          "toponyms"
        ],
        "properties": {
          "toponyms": {
            "type": "array",
            "items": {
              "type": "object",
              "required": [
                "name",
                "latitude",
                "longitude"
              ],
              "properties": {
                "name": {
                  "type": "string",
                  "description": "The name of the toponym."
                },
                "latitude": {
                  "type": "number",
                  "description": "The latitude of the toponym."
                },
                "longitude": {
                  "type": "number",
                  "description": "The longitude of the toponym."
                }
              },
              "additionalProperties": False
            },
            "description": "Array of all identified and disambiguated toponyms with their geographical coordinates."
          },
          "input_string": {
            "type": "string",
            "description": "The input string containing potential toponyms to recognize and resolve."
          }
        },
        "additionalProperties": False
      },
      "description": "Model takes string as input and performs toponym recognition and resolution using logic and context, with output response being all of the identified and correctly disambiguated toponyms and their latitude and longitude, even duplicates.  ALL instances of the identified and correctly disambiguated toponyms and their latitude and longitude are included in the response, even when there are duplicates or many instances of the same toponyms in the input text, including variations of spelling of the same toponymns."
    }
  ],
  temperature=1,
  max_output_tokens=32768,
  top_p=1,
  store=True,
  stream=True
)


# This streams the responses into a dictionary
final_tool_calls = {}

for event in stream:
    if event.type == 'response.output_item.added':
        final_tool_calls[event.output_index] = event.item;
    elif event.type == 'response.function_call_arguments.delta':
        index = event.output_index

        if final_tool_calls[index]:
            final_tool_calls[index].arguments += event.delta

print(final_tool_calls)
"""

In [None]:
"""
##########      USE THIS CELL (UNLESS YOU NEED STREAMING OF RESPONSES)     ########
"""

# Import libraries
import getpass
import json
import os

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel

# Make the OpenAI API key available through the OPENAI_API_KEY environment
# variable so that the model can be accessed. A key that is already set in the
# environment is kept as it is; otherwise it is requested interactively, so the
# secret is never typed into (and saved or shared with) the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Instantiate the model client

client = OpenAI()

# Bring "Instructions" txt file into memory for the response function to access
# (the "......" segment of the path is a placeholder to replace with your own folder)

with open("C:/Users/....../openai_toponym_prompt.txt", "r", encoding="utf-8") as f:
    instructions = f.read()

# Create response function to recognize and resolve toponyms

# Definition of the single function tool offered to the model. Its arguments
# are an object with two required members: "toponyms" (an array of
# {name, latitude, longitude} objects) and "input_string". Strict mode is
# requested and neither object level allows additional properties.
# NOTE(review): because "input_string" is required, the model presumably has to
# echo the input text back inside the arguments -- confirm this is wanted given
# the max_output_tokens cap below.
toponym_tool = {
    "type": "function",
    "name": "toponym_recognition_resolution",
    "strict": True,
    "parameters": {
        "type": "object",
        "required": [
            "input_string",
            "toponyms"
        ],
        "properties": {
            "toponyms": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": [
                        "name",
                        "latitude",
                        "longitude"
                    ],
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "The name of the toponym."
                        },
                        "latitude": {
                            "type": "number",
                            "description": "The latitude of the toponym."
                        },
                        "longitude": {
                            "type": "number",
                            "description": "The longitude of the toponym."
                        }
                    },
                    "additionalProperties": False
                },
                "description": "Array of all identified and disambiguated toponyms with their geographical coordinates."
            },
            "input_string": {
                "type": "string",
                "description": "The input string containing potential toponyms to recognize and resolve."
            }
        },
        "additionalProperties": False
    },
    "description": "Model takes string as input and performs toponym recognition and resolution using logic and context, with output response being all of the identified and correctly disambiguated toponyms and their latitude and longitude, even duplicates.  ALL instances of the identified and correctly disambiguated toponyms and their latitude and longitude are included in the response, even when there are duplicates or many instances of the same toponyms in the input text, including variations of spelling of the same toponyms."
}

# Send the combined text to the model together with the instructions file.
response = client.responses.create(
    model="gpt-4.1-2025-04-14",
    instructions=instructions,
    input=texts,
    text={
        "format": {
            "type": "text"
        }
    },
    reasoning={},
    tools=[toponym_tool],
    # Force a call to the function above. The parsing cell reads the function
    # call's arguments, so a plain-text answer from the model would break it.
    tool_choice={"type": "function", "name": toponym_tool["name"]},
    temperature=1,
    max_output_tokens=32768,
    top_p=1,
    store=True
)


In [None]:
# Take the function-call arguments from the response, parse them as JSON, and
# put the toponym records into a dataframe.

# Look the function call up by item type instead of assuming it is the first
# output item: a text message item has no `.arguments` attribute, so indexing
# by position fails with an unhelpful AttributeError when the model answers
# in plain text.
function_calls = [
    item for item in response.output
    if getattr(item, "type", None) == "function_call"
]
if not function_calls:
    raise ValueError(
        "The response contains no function call, so there are no toponyms to parse."
    )

output = json.loads(function_calls[0].arguments)

# Fixed column list: keeps the column order stable and still yields the three
# (empty) columns -- and therefore a CSV header -- when no toponym was found.
df = pd.DataFrame(output["toponyms"], columns=["name", "latitude", "longitude"])
df

In [None]:
# Write the resolved toponyms to a csv file (no index column, header row
# included, UTF-8 with a byte-order mark).

results_csv = "C:/Users/....../Results_ResolvedToponyms.csv"
df.to_csv(
    results_csv,
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)