In [None]:
"""
Build the training corpus used to create a fine-tuned model.

The corpus is a list of (text, annotations) tuples. Each annotation is a tuple
of (toponym string, start character, end character, location id) that marks
one toponym within the text.

The toponym string is written exactly as it occurs in the text, so that
text[start:end] == toponym string. Spelling variants of the same place
("Saint Agrève", "St. Agrève") therefore keep their surface form and are tied
together by sharing the same location id, not by normalising the string.

NOTE: This is just a tiny sample of the training corpus, and represents just
one iteration of many.
"""

train_corpus = [
    (
        "She received a letter asking whether she wanted to send some children to a very nice school in Haute-Loire at an elevation of one thousand meters.",
        [("Haute-Loire", 95, 106, 3013760)],
    ),
    (
        "I left aboard the Marseilles to Paris express train, crossed the La Voulte bridge.  I took the Le Cheylard local train, and I made my way to Saint Agrève.",
        # Surface form has a space, not a hyphen; the id identifies the place.
        [("Saint Agrève", 141, 153, 2981901)],
    ),
    (
        "I arrived at St. Agrève at 11:49 p.m. The train did not go beyond there.",
        # Abbreviated surface form; same location id as the entry above.
        [("St. Agrève", 13, 23, 2981901)],
    ),
    (
        "And I was told to get off at a place called Le Chambon-sur-Lignon.",
        [("Le Chambon-sur-Lignon", 44, 65, 3004774)],
    ),
]

In [None]:
# Report whether a CUDA-enabled GPU is visible to PyTorch.
# If the printed value is True, Geoparser will automatically use the GPU.

import torch

cuda_available = torch.cuda.is_available()
print(cuda_available)

In [None]:
"""
With the dataset in place, pass the texts and annotations to the trainer's
annotate method to turn them into gold GeoDoc objects suitable for training.

The trainer is built on the "dguzh/geo-all-distilroberta-v1" transformer
model, as I found this most effective. Other models are available (see the
Irchel Geoparser documentation on the library's website).
"""

from geoparser import GeoparserTrainer

# Name of the transformer model the trainer starts from.
TRANSFORMER_MODEL = "dguzh/geo-all-distilroberta-v1"

trainer = GeoparserTrainer(transformer_model=TRANSFORMER_MODEL)
train_docs = trainer.annotate(train_corpus)

In [None]:
# Train a model on the prepared documents. output_path is the location of the
# model being trained.
# NOTE(review): the path below looks like an elided placeholder ("......");
# presumably it must be replaced with a real directory before running.

output_path = "C:/Users/....../TrainedModel/"

trainer.train(
    train_docs,
    output_path=output_path,
)

In [None]:
"""
After training, create a test corpus, use the fine-tuned model to resolve the
toponyms in it, and evaluate how well the model performed.

The test corpus has the same shape as the training corpus: a list of
(text, annotations) tuples whose annotations are
(toponym string, start character, end character, location id), with the
toponym string written exactly as it occurs in the text
(text[start:end] == toponym string).

The evaluation calculates the model's overall Accuracy, Accuracy at 161 km,
Mean error distance, and Area Under the Curve, all calculated using
traditional metrics in the field.

NOTE: again, this is just a tiny sample. The sentences below are the same ones
used in the training sample, so the scores printed for this sample are not a
held-out estimate; a real evaluation should use texts the model was not
trained on.
"""

test_corpus = [
    (
        "She received a letter asking whether she wanted to send some children to a very nice school in Haute-Loire at an elevation of one thousand meters.",
        [("Haute-Loire", 95, 106, 3013760)],
    ),
    (
        "I left aboard the Marseilles to Paris express train, crossed the La Voulte bridge.  I took the Le Cheylard local train, and I made my way to Saint Agrève.",
        # Surface form has a space, not a hyphen; the id identifies the place.
        [("Saint Agrève", 141, 153, 2981901)],
    ),
    (
        "I arrived at St. Agrève at 11:49 p.m. The train did not go beyond there.",
        # Abbreviated surface form; same location id as the entry above.
        [("St. Agrève", 13, 23, 2981901)],
    ),
    (
        "And I was told to get off at a place called Le Chambon-sur-Lignon.",
        [("Le Chambon-sur-Lignon", 44, 65, 3004774)],
    ),
]

# Convert the test texts and annotations into gold documents.
test_docs = trainer.annotate(test_corpus)

# Resolve the toponyms in the test documents with the fine-tuned model.
trainer.resolve(test_docs)

# Compare the resolved toponyms against the gold annotations.
evaluation_results = trainer.evaluate(test_docs)

print(evaluation_results)