In [None]:
"""
The following set of scripts allows geoparsing using spaCy's NLP capabilities:
It begins with loading different datasets which have csv files with names of locations 
and their coordinates.
Then it creates "Entity Ruler" pipes and sets them within the spaCy pipeline, notably BEFORE the 
normal spaCy NER (Named Entity Recognition).
Note:  Besides the Entity Rulers created from the loaded datasets, I show here how to directly create
additional pipes just using a typed list.  Some of these are not geographical, but are useful for me
and I will be using them in future research.
Then it brings in the data, and runs it through the spaCy pipeline which now has the Entity Rulers.
Then it creates dataframes of all the locations extracted by the Entity Rulers. Of note, here I also
included an instance of an error I received when doing this and my work-around.
Then it joins the dataframes of the extracted locations to the original dataframes with the coordinates,
resulting in a list of resolved toponyms, by dataset.
These results are then exported to csv files.
I have also included the additional scripts to return the results on the other Entity Rulers 
(both non-geographic and for unnamed geographic features) as well as the results from the 
normal spaCy NER categories, and export them to a csv.

"""

In [None]:
"""
Reading in the csv file with the names and coordinates of the towns, villages, and named features
of the Plateau and creating a pandas dataframe with these locations and their coordinates.
"""

import pandas as pd

# Load the Plateau gazetteer, discard every row that has a missing value, and keep only
# the place name together with its latitude / longitude / altitude columns.
df = pd.read_csv("C:/Users/....../PlateauLocationCoord.csv").dropna()
df = df.loc[:, ["Plateau_Location", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df[coord_col] = pd.to_numeric(df[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "Plateau_Locations").
df.columns = ["Plateau_Locations", "LAT", "LON", "ALT"]
df

In [None]:
"""
Reading in the csv file with the names and coordinates of the childrens homes
of the Plateau and creating a pandas dataframe with these locations and their coordinates.
"""

# Load the children's-homes gazetteer, discard every row that has a missing value, and keep
# only the home name together with its latitude / longitude / altitude columns.
df_homes = pd.read_csv("C:/Users/....../homesCoord.csv").dropna()
df_homes = df_homes.loc[:, ["homes", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df_homes[coord_col] = pd.to_numeric(df_homes[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "Childrens_Homes").
df_homes.columns = ["Childrens_Homes", "LAT", "LON", "ALT"]
df_homes

In [None]:
"""
Reading in the csv file with the names and coordinates of the schools on the Plateau
for refugees and creating a pandas dataframe with these locations and their coordinates.
"""

# Load the schools gazetteer, discard every row that has a missing value, and keep only
# the school name together with its latitude / longitude / altitude columns.
df_schools = pd.read_csv("C:/Users/....../schoolsCoord.csv").dropna()
df_schools = df_schools.loc[:, ["schools", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df_schools[coord_col] = pd.to_numeric(df_schools[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "Schools").
df_schools.columns = ["Schools", "LAT", "LON", "ALT"]
df_schools

In [None]:
"""
Reading in the csv file with the names and coordinates of the farms named in the testimonies
and creating a pandas dataframe with these locations and their coordinates.
"""

# Load the farms gazetteer, discard every row that has a missing value, and keep only
# the farm name together with its latitude / longitude / altitude columns.
df_farms = pd.read_csv("C:/Users/....../farmsCoord.csv").dropna()
df_farms = df_farms.loc[:, ["farms", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df_farms[coord_col] = pd.to_numeric(df_farms[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "Farms").
df_farms.columns = ["Farms", "LAT", "LON", "ALT"]
df_farms

In [None]:
"""
Reading in the csv file with the names and coordinates of the hotels used by German soldiers
for convalescence and creating a pandas dataframe with these locations and their coordinates.
"""

# Load the gazetteer of hotels used for German convalescence, discard every row that has a
# missing value, and keep only the hotel name with its latitude / longitude / altitude columns.
df_hotelsGer = pd.read_csv("C:/Users/....../hotelsGermanConvalesCoord.csv").dropna()
df_hotelsGer = df_hotelsGer.loc[:, ["hotels", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df_hotelsGer[coord_col] = pd.to_numeric(df_hotelsGer[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "Hotels_GermanConvalescence").
df_hotelsGer.columns = ["Hotels_GermanConvalescence", "LAT", "LON", "ALT"]
df_hotelsGer

In [None]:
"""
Reading in the csv file with the names and coordinates of the other French cities, towns, and camps
that are frequently mentioned and creating a pandas dataframe with these locations and their coordinates.
"""

# Load the gazetteer of other French locations, discard every row that has a missing value,
# and keep only the location name with its latitude / longitude / altitude columns.
df_otherFreLoc = pd.read_csv("C:/Users/....../otherFrenchLocationsCoord.csv").dropna()
df_otherFreLoc = df_otherFreLoc.loc[:, ["other_French_Location", "Lat", "Long", "Alt"]]

# Coerce the three coordinate columns to numeric, downcast to the smallest float dtype.
for coord_col in ("Lat", "Long", "Alt"):
    df_otherFreLoc[coord_col] = pd.to_numeric(df_otherFreLoc[coord_col], downcast="float")

# Rename to the labels used later on (the merge cell joins on "other_French_Locations").
df_otherFreLoc.columns = ["other_French_Locations", "LAT", "LON", "ALT"]
df_otherFreLoc

In [None]:
#  Load libraries.

import spacy
## spacy.require_gpu()    # Could do this if desired, but need to install PyTorch and CuPy.  
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.language import Language
import re
import unidecode
from spacy import displacy

In [None]:
# Instantiate the spaCy model.
# The entity-ruler cells that follow attach their pipes to this `nlp` object with
# before="ner", so the loaded pipeline must contain a component named "ner".

# The following is for the English trf model.
nlp = spacy.load("en_core_web_trf")

## You would use the following for the French model...
# nlp = spacy.load("fr_core_news_lg")

In [None]:
# Entity ruler for named locations (communes, hamlets, mountains, rivers, etc.) on the Plateau.
Plateau_Locations = df["Plateau_Locations"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
Plateau_Location_patterns = [
    {"pattern": spelling, "label": "PLATEAU LOCATION"}
    for place_name in Plateau_Locations
    for spelling in (place_name, unidecode.unidecode(place_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
Plateau_Location_ruler = nlp.add_pipe("entity_ruler", name="Plateau_Location_ruler", before="ner")
Plateau_Location_ruler.add_patterns(Plateau_Location_patterns)

In [None]:
# Entity ruler for named children's homes on the Plateau.
homes = df_homes["Childrens_Homes"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
home_patterns = [
    {"pattern": spelling, "label": "CHILDREN'S HOME"}
    for home_name in homes
    for spelling in (home_name, unidecode.unidecode(home_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
home_ruler = nlp.add_pipe("entity_ruler", name="home_ruler", before="ner")
home_ruler.add_patterns(home_patterns)

In [None]:
# Entity ruler for named schools on the Plateau.
schools = df_schools["Schools"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
school_patterns = [
    {"pattern": spelling, "label": "SCHOOL"}
    for school_name in schools
    for spelling in (school_name, unidecode.unidecode(school_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
school_ruler = nlp.add_pipe("entity_ruler", name="school_ruler", before="ner")
school_ruler.add_patterns(school_patterns)

In [None]:
# Entity ruler for named farms on the Plateau.
farms = df_farms["Farms"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
farm_patterns = [
    {"pattern": spelling, "label": "FARM"}
    for farm_name in farms
    for spelling in (farm_name, unidecode.unidecode(farm_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
farm_ruler = nlp.add_pipe("entity_ruler", name="farm_ruler", before="ner")
farm_ruler.add_patterns(farm_patterns)

In [None]:
# Entity ruler for named hotels used for German convalescent soldiers on the Plateau.
hotels = df_hotelsGer["Hotels_GermanConvalescence"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
hotel_patterns = [
    {"pattern": spelling, "label": "HOTEL USED BY GERMANS"}
    for hotel_name in hotels
    for spelling in (hotel_name, unidecode.unidecode(hotel_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
hotel_ruler = nlp.add_pipe("entity_ruler", name="hotel_ruler", before="ner")
hotel_ruler.add_patterns(hotel_patterns)

In [None]:
# Entity ruler for other named French locations mentioned often in the testimonies.
other_French_Locations = df_otherFreLoc["other_French_Locations"].to_list()

# Every gazetteer name is registered twice -- as written and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
other_French_Location_patterns = [
    {"pattern": spelling, "label": "OTHER FRENCH LOCATION"}
    for location_name in other_French_Locations
    for spelling in (location_name, unidecode.unidecode(location_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
other_French_Location_ruler = nlp.add_pipe(
    "entity_ruler", name="other_French_Location_ruler", before="ner"
)
other_French_Location_ruler.add_patterns(other_French_Location_patterns)

In [None]:
# Creating an entity ruler to find Jewish aid organizations active in France during the Holocaust
# The list is typed by hand rather than loaded from a csv. It mixes full French names, English
# translations, acronyms, and spelling/case variants (e.g. 'rue Amelot' and 'Rue Amelot',
# 'Oeuvre de Secours aux Enfants' and 'Oeuvre de Secour aux Enfants').
# NOTE(review): short or common-word entries such as 'AJ', 'CAR', 'Joint', and 'Scouts' may also
# match unrelated text -- check the extracted results for false positives.
aid_orgs_jewish = ['Oeuvre de Secours aux Enfants', 'OSE', 'OSÉ', "Children's Aid Society", 'Mouvement de la Jeunesse Sioniste', 
                  'MJS', 'Eclaireuses et Eclaireurs Israélites de France', 'Eclaireurs Israélites de France','EIF',
                  'Jewish Scouts', 'Boy Scouts', 'Scouts', 'La Sixième', 'Sixième', 'Armée Juive', 'AJ', 'WIZO',
                  'Womens International Zionist Organization', 'Union Générale des Israélites de France', 'UGIF',
                  'Comité de la rue Amelot', 'rue Amelot', 'Rue Amelot', 'Le Comité Amelot', 'Comité Rue Amelot', 
                  'Amelot', 'Solidarite', "Union des Juifs pour la Resistance et l'Entraide", 'Service André', 
                  'Groupe Maurice Cachoud', 'Scout', 'Boy Scout', 'Girl Scouts', 'Girl Scout', 'Oeuvre de Secour aux Enfants', 
                  'Jewish Joint Distribution Committee', 'JDC', 'Joint', 'Circuit Garel', 'Garel Circuit', 'Garel Network',
                  "Children's Aid Rescue Society", "Commission Central des Organizations Juives d'Assistance", 'CCOJA', 
                  'Central Commission of Jewish Assistance Organizations', 'Camps Commission', 'Jewish Zionist Youth Movement',
                  'General Union of French Jews', 'Organization for Rehabilitation and Training of Jews', 'ORT',
                  'Zionist Youth Movement', "Comité d'Assistance aux Refugiés", 'CAR', 'Committee for Assistance to Refugees',
                  'Sixth Division', 'HIAS', 'Hebrew Immigrant Aid Society', 'HICEM', 'Jewish Colonisation Association', 
                  'Emigdirect', 'Bass Network', 'Bass rescue network', "Children's Aid", 'Aid Society']
# Every organization name is registered twice -- as typed and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
aid_org_jewish_patterns = [
    {"pattern": spelling, "label": "JEWISH AID ORGANIZATION"}
    for org_name in aid_orgs_jewish
    for spelling in (org_name, unidecode.unidecode(org_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
aid_org_jewish_ruler = nlp.add_pipe("entity_ruler", name="aid_org_jewish_ruler", before="ner")
aid_org_jewish_ruler.add_patterns(aid_org_jewish_patterns)

In [None]:
# Creating an entity ruler to find other, non-Jewish aid organizations active in France during the Holocaust
# The list is typed by hand rather than loaded from a csv. It mixes full French names, English
# translations, acronyms, and spelling/case variants (e.g. 'Cimade', 'CIMADE', and 'CIMAD';
# 'Fleury Rescue Network' and 'Fleury rescue network').
# NOTE(review): short entries such as 'USC' and 'ERC' may also match unrelated text -- check the
# extracted results for false positives.
aid_orgs_other = ['Swiss Aid for Children', 'Swiss Aid', 'Cimade', 'Comité inter-mouvements auprès des évacués', 
                  'Salvation Army', 'Armée du Salut', 'Red Cross', 'Swiss Red Cross', 'Croix-Rouge', 
                  'Croix-Rouge suisse', 'Quaker', 'Quakers', 'Secours Suisse aux Enfants', 'Secours Suisse',
                 'International Protestant Loan Association', 'European Student Relief', 'Nîmes Committee', 
                 'American Friends Service Committee', 'American Friends', 'Friends Service Committee', 'AFSC',
                 'Unitarian Service Committee', 'USC', 'Emergency Rescue Committee', 'ERC', 'Centre Américain de Secours',
                 'Fédération Protestante de France', 'French Protestant Federation', 'Amitié Chretienne', 'Rescue Committee',
                 'YMCA', 'International Migration Service', "Service Social d'Aide aux Émigrants", 'SSAE', 
                  'Mouvement National Contre le Racisme', 'American Red Cross', 'CIMADE', 'Comité Intermouvement après des Evacuees',
                 'Boegner Rescue Network', "Comite d'Inter Mouvement après des Evacues", 'American Federation of Labor',
                 'Jousselin Rescue Network', 'Nîmes Coordinating Committee', 'Camps Committee', 
                  "Young Men's Christian Association", 'Young Mens Christian Association', 'Unitarians', 
                 "American Friends' Service Committee", 'French Red Cross', 'Croix-Rouge française', 
                 'International Rescue Committee', 'Fleury Rescue Network', 'Fleury rescue network', 
                  "L’Aide Chrétienne aux Israélites", 'Swiss Aid Society', 'Reconciliation Alliance', 'Swiss Friends',
                 'CIMAD', 'Swiss Coalition for Relief to Child War Victims', 'le Comité Inter-Movements Auprès des Evacuées',
                 'the Swiss Coalition for Relief to Child War Victims', 'American Quakers',
                 'Secours aux Enfants', 'Comité Inter- Movements Auprès des Evacuées', 'The Swiss Coalition for Relief to Child War Victims']
# Every organization name is registered twice -- as typed and with accents stripped by
# unidecode -- so accented and unaccented spellings both receive the label.
aid_org_other_patterns = [
    {"pattern": spelling, "label": "OTHER AID ORGANIZATION"}
    for org_name in aid_orgs_other
    for spelling in (org_name, unidecode.unidecode(org_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
aid_org_other_ruler = nlp.add_pipe("entity_ruler", name="aid_org_other_ruler", before="ner")
aid_org_other_ruler.add_patterns(aid_org_other_patterns)

In [None]:
# Creating an entity ruler to find non-named features
# Hand-typed vocabulary of generic (unnamed) geographic and built features, singular and plural.
# Fixes relative to the earlier version of this list:
#   * a missing comma after 'street' silently fused it with the next literal into the single
#     entry 'streetrailroad station', so neither "street" nor "railroad station" was matched;
#   * 'beaaches' corrected to 'beaches';
#   * the standard spellings 'synagogue(s)' and 'cemetery'/'cemeteries' added. The misspelled
#     variants are kept on purpose in case they occur in the transcribed testimonies.
features = ['bay', 'bays', 'bog', 'bogs', 'canal', 'canals', 'aqueduct', 'aqueducts', 'cove', 'coves', 'dock',
            'docks', 'ditch', 'ditches', 'estuary', 'waterfall', 'mud flat', 'tidal flat', 'glacier', 'glaciers',
            'snowfield', 'snowfields', 'gulf', 'harbor', 'harbors', 'inlet', 'inlets', 'lake', 'lake bed', 'lakes',
            'marsh', 'ocean', 'pond', 'ponds', 'reservoir', 'reservoirs', 'ravine', 'ravines', 'sea', 'stream',
            'streams', 'river', 'rivers', 'swamp', 'swamps', 'tunnel', 'wetland', 'wetlands', 'clearing',
            'clearings', 'coast', 'coastline', 'field', 'fields', 'pasture', 'pastures', 'port', 'ports', 'farm',
            'farms', 'causeway', 'causeways', 'portage', 'road', 'roads', 'railroad', 'railroads', 'street',
            'railroad station', 'railroad stations', 'streets', 'tunnel', 'tunnels', 'trail', 'trails', 'airfield',
            'airfields', 'airport', 'airports', 'bridge', 'bridges', 'apartment', 'apartments',
            'office', 'offices', 'bank', 'banks', 'barrack', 'barracks', 'boatyard', 'boatyards', 'bus stop',
            'bus stops', 'bus station', 'bus stations', 'bus', 'buses', 'train', 'trains', 'cave', 'caves', 'camp',
            'camps', 'church', 'churches', 'school', 'schools', 'castle', 'castles', 'synagogue', 'synagogues',
            'synogogue', 'synogogues', 'cemetery', 'cemeteries',
            'cemetary', 'cemetaries', 'corral', 'corrals', 'courthouse', 'courthouses', 'town square', 'town squares',
            'convent', 'convents', 'monastery', 'monasteries', 'dock', 'docks', 'customs house', 'guard house',
            'guard houses', 'guard tower', 'guard towers', 'estate', 'estates', 'factory', 'factories', 'facility',
            'facilities', 'farmstead', 'farmsteads', 'fort', 'forts', 'ferry', 'ferries', 'gate', 'gates', 'garden',
            'gardens', 'grave', 'graves', 'homestead', 'homesteads', 'home', 'homes', 'house', 'houses', 'hospital',
            'hospitals', 'clinic', 'clinics', 'hotel', 'hotels', 'city hall', 'jetty', 'library', 'libraries',
            'landfill', 'lighthouse', 'lighthouses', 'marina', 'marinas', 'cannery', 'canneries', 'munitions plant',
            'market', 'markets', 'mill', 'mills', 'sawmill', 'sawmills', 'windmill', 'windmills', 'water mill',
            'quarry', 'quarries', 'metro station', 'museum', 'museums', 'novitiate', 'palace', 'palaces', 'chateau',
            'chateaus', 'pier', 'piers', 'post office', 'police station', 'police post', 'park', 'parks', 'prison',
            'prisons', 'reformatory', 'reformatories', 'concentration camp', 'concentration camps', 'customs post',
            'border post', 'border house', 'border', 'borders', 'patrol house', 'patrol post', 'restaurant',
            'restaurants', 'grocery store', 'store', 'college', 'university', 'universities', 'sheepfold', 'barn',
            'barns', 'haystack', 'haystacks', 'sheath', 'sheathes', 'shrine', 'shrines', 'storehouse', 'storehouses',
            'sanatorium', 'sanatoriums', 'village square', 'stable', 'stables', 'stadium', 'stadiums',
            'military base', 'outpost', 'station', 'stations', 'theater', 'theatre', 'theaters', 'theatres', 'tomb',
            'tombs', 'temple', 'temples', 'toll gate', 'barrier', 'barriers', 'fence', 'fences', 'tower', 'towers',
            'tram', 'trams', 'trolley', 'trolleys', 'wall', 'walls', 'zoo', 'zoos', 'beach', 'beaches', 'cliff',
            'cliffs', 'canyon', 'canyons', 'corridor', 'corridors', 'cirque', 'crater', 'craters', 'delta', 'dune',
            'desert', 'gorge', 'gorges', 'fissure', 'headland', 'headlands', 'hill', 'hills', 'island', 'islands',
            'levee', 'levees', 'mound', 'mounds', 'mountain', 'mountains', 'pass', 'peninsula', 'peak', 'peaks',
            'plateau', 'ridge', 'ridges', 'escarpment', 'shore', 'shoreline', 'coastline', 'slope', 'terrace',
            'upland', 'uplands', 'valley', 'valleys', 'furrow', 'furrows', 'knoll', 'knolls', 'ledge', 'ledges',
            'moat', 'shoal', 'shoals', 'bush', 'bushes', 'forest', 'forests', 'woods', 'glade', 'glades', 'tree',
            'trees', 'grove', 'groves', 'grass', 'grasses', 'grassland', 'grasslands', 'meadow', 'meadows',
            'orchard', 'orchards', 'scrubland', 'scrublands', 'scrub', 'tundra', 'vineyard', 'vineyards', 'village',
            'barnyard', 'courtyard', 'crop land', 'crop lands', 'cropland', 'croplands', 'brushland', 'brush land',
            'silo', 'railway', 'railway station', 'railway stations', 'railways', 'wagon', 'wagons', 'car', 'cars',
            'truck', 'trucks']
# Every feature word is registered twice -- as typed and after unidecode ASCII-folding --
# under the single label "FEATURE".
feature_patterns = [
    {"pattern": spelling, "label": "FEATURE"}
    for feature_name in features
    for spelling in (feature_name, unidecode.unidecode(feature_name))
]

# The ruler is inserted ahead of the statistical "ner" component.
feature_ruler = nlp.add_pipe("entity_ruler", name="feature_ruler", before="ner")
feature_ruler.add_patterns(feature_patterns)

In [None]:
"""
Load your data...
This creates a single combined string from all .txt files in a directory; that string
represents your data.
You have to use "f.read" not "f.readlines".
"""

# Establish the path to your data
# (the folder whose .txt files read_txt_files() below will combine into one string).
import os
path = "C:/Users/....../Data/"

def read_txt_files(directory):
    """Concatenate the contents of every ``.txt`` file directly inside ``directory``.

    Files are read in sorted filename order so that the combined text (and therefore
    the token positions in the parsed document) is the same on every run and platform;
    ``os.listdir`` on its own returns names in arbitrary order. Each file's text is
    terminated with a newline when it does not already end with one, so the last word
    of one file cannot fuse with the first word of the next.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        One string holding the UTF-8 decoded contents of all matching files, or an
        empty string when the folder contains no ``.txt`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf8") as f:
            text = f.read()
        # Guarantee a boundary between consecutive files.
        if text and not text.endswith("\n"):
            text += "\n"
        chunks.append(text)

    # A single join avoids repeatedly copying an ever-growing string.
    return "".join(chunks)

# Build the single combined corpus string that the next cell passes through the spaCy pipeline.
texts = read_txt_files(path)

In [None]:
"""
Runs the combined txt files through the NLP pipeline, looking for specific entities that you set up
via the Entity Rulers, and then afterwards the normal entities extracted by the spaCy NER as well,
and then visualizing them.
"""

# One highlight colour per entity label. The keys are also used (in the same order) as the
# list of labels that displaCy is asked to show.
entity_colors = {
    "PLATEAU LOCATION": "pink",
    "CHILDREN'S HOME": "#bf94e4",
    "SCHOOL": "yellow",
    "FARM": "green",
    "HOTEL USED BY GERMANS": "#ff033e",
    "OTHER FRENCH LOCATION": "#bfff00",
    "JEWISH AID ORGANIZATION": "lightgreen",
    "OTHER AID ORGANIZATION": "#9966cc",
    "PERSON": "coral",
    "GPE": "#b5651d",
    "LOC": "orange",
    "ORG": "teal",
    "LANGUAGE": "#f56991",
    "NORP": "#ace5ee",
    "TIME": "white",
    "DATE": "gray",
    "QUANTITY": "tan",
    "PERCENT": "gold",
    "FAC": "lavender",
    "FEATURE": "fuchsia",
}
options = {"ents": list(entity_colors), "colors": entity_colors}

# Run the whole combined text through the pipeline (entity rulers first, then spaCy's NER).
doc = nlp(texts)

displacy.render(doc, style="ent", jupyter=True, options=options)

In [None]:
# Tell me how many tokens spaCy processed (len() of a Doc is its token count).

len(doc)

In [None]:
"""
Creates dataframe of all named Plateau towns, villages, and features extracted from the testimonies, 
with datatype as string to avoid commas between words, which is what it does if you use default 
object datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
Plateau_Locations = [ent.text for ent in doc.ents if ent.label_ == "PLATEAU LOCATION"]

df_PlatLocExtract = pd.DataFrame({"Plateau_Locations": Plateau_Locations}, dtype="string")

df_PlatLocExtract

In [None]:
"""
Creates dataframe of all named Plateau childrens homes extracted from the testimonies, 
with datatype as string to avoid commas between words, which is what it does if you use 
default object datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
homes = [ent.text for ent in doc.ents if ent.label_ == "CHILDREN'S HOME"]

df_homesExtract = pd.DataFrame({"Childrens_Homes": homes}, dtype="string")

df_homesExtract

In [None]:
"""
Creates dataframe of all named Plateau schools extracted from the testimonies, 
with datatype as string to avoid commas between words, which is what it does if you use 
default object datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
schools = [ent.text for ent in doc.ents if ent.label_ == "SCHOOL"]

df_schoolsExtract = pd.DataFrame({"Schools": schools}, dtype="string")

df_schoolsExtract

In [None]:
"""
Creates dataframe of all named Plateau farms extracted from the testimonies, 
with datatype as string to avoid commas between words, which is what it does if you use 
default object datatype..

##  However, different from the others, there was an error that was occuring with this data.  For an unknown
## reason, the list from the doc entities was trying to create a dataframe, using two different columns 
## for "La" and "Bergerie". When I tried using the method I did above with a dictionary for {"Farms": farms},
## it was then throwing a "ValueError: Buffer has wrong number of dimensions (expected 1, got 2)"
## To fix, I create a simple dataframe from the list, then aggregate/join the two columns to combine "La" and "Bergerie".
## Then I dropped the original two columns, leaving just the aggregated column that I want.
"""

# Keep each entity's plain text (ent.text) instead of the Span object. A Span is a sequence of
# tokens, so a list of two-token Spans such as "La Bergerie" is read by pandas as a 2-D table
# (one column per token) -- which matches the two-column behaviour and ValueError described in
# the note above. With plain strings the same dictionary form used by the other extraction
# cells works, so the join of columns 0 and 1 is no longer needed, and farm names of any length
# (one word, three words, ...) or an empty result are handled as well.
farms = [ent.text for ent in doc.ents if ent.label_ == "FARM"]

df_farmsExtract = pd.DataFrame({"Farms": farms}, dtype="string")
df_farmsExtract

In [None]:
"""
Creates dataframe of all named Plateau hotels used by German convalescent soldiers extracted 
from the testimonies, with datatype as string to avoid commas between words, which is what it 
does if you use default object datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
hotels = [ent.text for ent in doc.ents if ent.label_ == "HOTEL USED BY GERMANS"]

df_hotelsGerExtract = pd.DataFrame({"Hotels_GermanConvalescence": hotels}, dtype="string")

df_hotelsGerExtract

In [None]:
"""
Creates dataframe of all other named French locations extracted from the testimonies, 
with datatype as string to avoid commas between words, which is what it does if you use 
default object datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
other_French_Locations = [ent.text for ent in doc.ents if ent.label_ == "OTHER FRENCH LOCATION"]

df_othFreLocExtract = pd.DataFrame({"other_French_Locations": other_French_Locations}, dtype="string")

df_othFreLocExtract

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named Plateau locations and their coordinates 
and the second is the dataframe which contains the extracted Plateau locations from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_merged = df.merge(df_PlatLocExtract, how="right", on="Plateau_Locations")

df_merged.to_csv(
    "C:/Users/....../Results_ExtractedPlateauLocations.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named children's homes and their coordinates 
and the second is the dataframe which contains the extracted children's homes from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_mergedHomes = df_homes.merge(df_homesExtract, how="right", on="Childrens_Homes")

df_mergedHomes.to_csv(
    "C:/Users/....../Results_ExtractedChildrensHomes.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named schools and their coordinates 
and the second is the dataframe which contains the extracted schools from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_mergedSchools = df_schools.merge(df_schoolsExtract, how="right", on="Schools")

df_mergedSchools.to_csv(
    "C:/Users/....../Results_ExtractedSchools.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named farms and their coordinates 
and the second is the dataframe which contains the extracted farms from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_mergedFarms = df_farms.merge(df_farmsExtract, how="right", on="Farms")

df_mergedFarms.to_csv(
    "C:/Users/....../Results_ExtractedFarms.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named hotels used by the Germans and their coordinates 
and the second is the dataframe which contains the extracted hotels from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_mergedHotelsGer = df_hotelsGer.merge(
    df_hotelsGerExtract, how="right", on="Hotels_GermanConvalescence"
)

df_mergedHotelsGer.to_csv(
    "C:/Users/....../Results_ExtractedHotelsGermans.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
This merges two dataframes: the first is the dataframe with the named Other French locations and their coordinates 
and the second is the dataframe which contains the extracted Other French locations from the spaCy entity recognition.
Then it exports the results to a csv file.
"""

# Right join: every extracted mention is kept, and coordinates are attached wherever the
# extracted text exactly equals a gazetteer name.
df_mergedOthFreLoc = df_otherFreLoc.merge(
    df_othFreLocExtract, how="right", on="other_French_Locations"
)

df_mergedOthFreLoc.to_csv(
    "C:/Users/....../Results_ExtractedOtherFrenchLocations.csv",
    mode="w+",
    header=True,
    index=False,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all non-named features found on the Plateau, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
features = [ent.text for ent in doc.ents if ent.label_ == "FEATURE"]

df_feat = pd.DataFrame({"Features": features}, dtype="string")

df_feat

In [None]:
"""
Take output from "Feature" dataframe, give count of each feature name, then export to a csv.
Note: you need the "Index = true" to get the feature name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct feature string was extracted.
df_feat_summary = df_feat.loc[:, "Features"].value_counts()

# index=True writes the feature names (the index of the counts) into the file.
df_feat_summary.to_csv(
    "C:/Users/....../Results_ExtractedFeatures.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all GPEs extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
gpes = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

df_gpe = pd.DataFrame({"GPEs": gpes}, dtype="string")
df_gpe

In [None]:
"""
Take output from "GPE" dataframe, give count of each GPE name, then export to a csv.
Note: you need the "Index = true" to get the GPE name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct GPE string was extracted.
df_gpe_summary = df_gpe.loc[:, "GPEs"].value_counts()

# index=True writes the GPE names (the index of the counts) into the file.
df_gpe_summary.to_csv(
    "C:/Users/....../Results_ExtractedGPEs.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all locations extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
locs = [ent.text for ent in doc.ents if ent.label_ == "LOC"]

df_loc = pd.DataFrame({"LOCs": locs}, dtype="string")
df_loc

In [None]:
"""
Take output from "Location" dataframe, give count of each location name, then export to a csv.
Note: you need the "Index = true" to get the location name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct LOC string was extracted.
df_loc_summary = df_loc.loc[:, "LOCs"].value_counts()

# index=True writes the location names (the index of the counts) into the file.
df_loc_summary.to_csv(
    "C:/Users/....../Results_ExtractedLOCs.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all facilities extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
facs = [ent.text for ent in doc.ents if ent.label_ == "FAC"]

df_fac = pd.DataFrame({"FACs": facs}, dtype="string")
df_fac

In [None]:
"""
Take output from "Facilities" dataframe, give count of each facilities name, then export to a csv.
Note: you need the "Index = true" to get the facilities name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct FAC string was extracted.
df_fac_summary = df_fac.loc[:, "FACs"].value_counts()

# index=True writes the facility names (the index of the counts) into the file.
df_fac_summary.to_csv(
    "C:/Users/....../Results_ExtractedFACs.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all Jewish Aid Organizations extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
jewishAidOrgans = [ent.text for ent in doc.ents if ent.label_ == "JEWISH AID ORGANIZATION"]

df_jewishAidOrgan = pd.DataFrame({"Jewish Aid Organizations": jewishAidOrgans}, dtype="string")
df_jewishAidOrgan

In [None]:
"""
Take output from "Jewish Aid Organizations" dataframe, give count of each organizations name, then export to a csv.
Note: you need the "Index = true" to get the organizations name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct Jewish aid organization string was extracted.
df_jewishAidOrgan_summary = df_jewishAidOrgan.loc[:, "Jewish Aid Organizations"].value_counts()

# index=True writes the organization names (the index of the counts) into the file.
df_jewishAidOrgan_summary.to_csv(
    "C:/Users/....../Results_ExtractedJewishAidOrgans.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all Other Aid Organizations extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
otherAidOrgans = [ent.text for ent in doc.ents if ent.label_ == "OTHER AID ORGANIZATION"]

df_otherAidOrgan = pd.DataFrame({"Other Aid Organizations": otherAidOrgans}, dtype="string")
df_otherAidOrgan

In [None]:
"""
Take output from "Other Aid Organizations" dataframe, give count of each organizations name, then export to a csv.
Note: you need the "Index = true" to get the organizations name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct other-aid-organization string was extracted.
df_otherAidOrgan_summary = df_otherAidOrgan.loc[:, "Other Aid Organizations"].value_counts()

# index=True writes the organization names (the index of the counts) into the file.
df_otherAidOrgan_summary.to_csv(
    "C:/Users/....../Results_ExtractedOtherAidOrgans.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)

In [None]:
"""
Creates dataframe of all Persons extracted from testimonies, with datatype as string to avoid commas between
words, which is what it does if you use default "Object" datatype..
"""

# Keep each entity's plain text (ent.text) rather than the spaCy Span object itself: a Span is
# a sequence of tokens, which pandas can unpack token-by-token when it builds the frame.
persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

df_person = pd.DataFrame({"Persons": persons}, dtype="string")
df_person

In [None]:
"""
Take output from "Persons" dataframe, give count of each Persons name, then export to a csv.
Note: you need the "Index = true" to get the Persons name to go to the csv file.  
Also note, you need the "encoding = utf-8-sig" to get the French words correct.
"""

# Count how many times each distinct person string was extracted.
df_person_summary = df_person.loc[:, "Persons"].value_counts()

# index=True writes the person names (the index of the counts) into the file.
df_person_summary.to_csv(
    "C:/Users/....../Results_ExtractedPersons.csv",
    mode="w+",
    header=True,
    index=True,
    encoding="utf-8-sig",
)