In [None]:
"""
This set of scripts is for using the Irchel Geoparser with its pre-trained model, 
not the fine-tuned model that you can create! 

(see "Training_IrchelFineTunedModel" and "Geoparsing_IrchelTrainedModel" Jupyter notebook files 
if you want to use the fine-tuned model, which is highly recommended!)
"""

In [None]:
# Report whether PyTorch can see a CUDA-enabled GPU.
# When this prints True, the Geoparser will automatically use the GPU.

import torch

cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
"""
Load the library and instantiate the geoparser.  Again, this is just the standard pre-trained model, not 
the fine-tuned model.
"""

from geoparser import Geoparser

# Build the parser with no arguments, i.e. with the library's defaults.
# Per this notebook's notes, that is the standard pre-trained model rather than
# a fine-tuned one. `geo` is reused by the parsing cell further down.
geo = Geoparser()

In [None]:
"""
Load your data...
This creates a single "list of strings" from all .txt files in a directory (one string per file), which 
represents your data.
You have to use "f.read", not "f.readlines".
If you use "f.readlines", each file becomes its own list of lines, so you get a list within a list, and the geoparser cannot read it!
"""

# Directory that holds the input .txt files.
# NOTE: "YOUR_PATH" is a placeholder - replace it with your own folder before running.
import os
path = "C:/Users/YOUR_PATH/Data/"

def read_txt_files(directory):
    """Read every ``.txt`` file in ``directory`` and return their contents.

    Parameters
    ----------
    directory : str
        Path of the folder to scan. Only its direct children are considered;
        sub-folders are not searched.

    Returns
    -------
    list of str
        One string per ``.txt`` file (the whole file, read as UTF-8), ordered
        by file name. The list is empty when the folder has no ``.txt`` files.

    Raises
    ------
    OSError
        If ``directory`` does not exist or a file cannot be opened.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    file_contents = []

    # os.listdir() returns names in arbitrary order; sorting makes the order of
    # the returned texts (and of everything derived from them) reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        # A sub-directory may also be named "something.txt"; open() would fail on it.
        if not os.path.isfile(filepath):
            continue
        # f.read() yields the file as ONE string; f.readlines() would yield a
        # list per file and produce a nested list.
        with open(filepath, "r", encoding="utf8") as f:
            file_contents.append(f.read())
    return file_contents

# Load every .txt file under `path`; `texts` holds one string per file and is
# the input for the parsing cell below.
texts = read_txt_files(path)

# Show a short summary instead of dumping every document in full, which would
# flood the notebook output for any realistically sized data set.
print(f"Loaded {len(texts)} text file(s) from {path}")
if not texts:
    # Nothing to parse: make the problem visible here rather than later on.
    print("WARNING: no .txt files were found - check that 'path' points to your data.")
for number, text in enumerate(texts, start=1):
    # The !r conversion keeps line breaks visible as \n so each preview stays on one line.
    print(f"[{number}] {len(text)} characters: {text[:200]!r}")


In [None]:
"""
Calls the Geoparser's parse method on 'texts'
and limits toponym resolution to locations in France.
"""
# Parse all loaded texts in one call. country_filter=['FR'] limits toponym
# resolution to France. `docs` is iterated by every reporting cell below.
docs = geo.parse(texts, country_filter=['FR'])

In [None]:
"""
Iterates over the toponyms, printing each one with its starting and ending character positions in the text,
but limits the results to toponyms that were resolved to a location by checking 'if toponym.location'.
"""
for document in docs:
    # Lazily keep only the toponyms that were resolved to a location.
    resolved_places = (place for place in document.toponyms if place.location)
    for place in resolved_places:
        print(place, place.start_char, place.end_char)

In [None]:
# Prints each resolved toponym with its GeoNames ID, latitude, and longitude.

for document in docs:
    for place in document.toponyms:
        match = place.location
        if not match:
            # Unresolved toponyms have nothing to report here.
            continue
        print(place, match['geonameid'], match['latitude'], match['longitude'])

In [None]:
"""
Returns the results with the toponym and its resolved name, alternative names, administrative divisions,
country, feature type, and population.
"""

# Attributes requested from each document's locations, in display order.
wanted_fields = (
    'name', 'alternatenames', 'admin2_name', 'admin1_name',
    'country_name', 'feature_name', 'population',
)

for document in docs:
    details = document.locations[wanted_fields]
    # Pair every toponym with the attribute record at the same position.
    for place, detail in zip(document.toponyms, details):
        if not place.location:
            continue
        print(place, "->", detail)

In [None]:
"""
Returns the results with the toponym, resolved location, feature type, coordinates, and
confidence score of the resolved location.

Toponyms that could not be resolved are reported as well.
"""

for document in docs:
    # The paired item from document.locations is not read; the zip is kept so
    # the iteration is exactly the same as before.
    for place, _paired_location in zip(document.toponyms, document.locations):
        print(f"- Toponym: {place.text}")
        resolved = place.location
        if not resolved:
            print("Location could not be resolved.")
            print("  ")
            continue
        print(f"  Resolved Location: {resolved['name'], resolved['admin2_name'], resolved['admin1_name'], resolved['country_name']}")
        print(f"  Feature Type: {resolved['feature_name']}")
        print(f"  Coordinates: {resolved['latitude'], resolved['longitude']}")
        print(f"  Score: {place.score}")
        print(f"  ")
    # Blank line between documents.
    print()

In [None]:
"""
This collects each resolved toponym with its latitude and longitude coordinates, as well
as any unresolved toponyms.

It then creates separate CSV files for the resolved and unresolved toponyms via separate pandas DataFrames.
"""

import pandas as pd

# Column layouts are passed explicitly so that each CSV still gets its header
# row when one of the two lists ends up empty (e.g. every toponym was resolved).
RESOLVED_COLUMNS = ["Toponym", "Latitude", "Longitude"]
UNRESOLVED_COLUMNS = ["Toponym UnResolved"]

Toponyms_Resolved = []
Toponyms_UnResolved = []
for doc in docs:
    for toponym in doc.toponyms:
        location = toponym.location
        # Store the text form explicitly rather than the parser object, so the
        # records hold plain values and do not rely on pandas to convert them.
        if location:
            Toponyms_Resolved.append({
                "Toponym": str(toponym),
                "Latitude": location['latitude'],
                "Longitude": location['longitude'],
            })
        else:
            Toponyms_UnResolved.append({"Toponym UnResolved": str(toponym)})

# dtype="string" writes every value (including coordinates) as text.
df = pd.DataFrame(Toponyms_Resolved, columns=RESOLVED_COLUMNS, dtype="string")
df_UnResolved = pd.DataFrame(Toponyms_UnResolved, columns=UNRESOLVED_COLUMNS, dtype="string")

# NOTE: the "......" in both paths is a placeholder - point it at your own output folder.
# utf-8-sig adds a byte-order mark to the output.
# mode="w" (overwrite) is the documented write mode; the previous "w+" is not
# among the values listed for to_csv.
df.to_csv("C:/Users/....../Results_Geoparser_Resolved.csv", encoding="utf-8-sig", index=False, header=True, mode="w")
df_UnResolved.to_csv("C:/Users/....../Results_Geoparser_UnResolved.csv", encoding="utf-8-sig", index=False, header=True, mode="w")