## Regex zur Erkennung von Orten innerhalb von Organisationen aus amtlichem Ortschaftsverzeichnis und Kantonsliste erzeugen

In [1]:
import re
import pandas as pd

# Source: ortschaftenverzeichnis_plz_2056.csv.zip @ https://www.swisstopo.admin.ch/de/amtliches-ortschaftenverzeichnis#Ortschaftenverzeichnis--Download
# ©swisstopo

# Modify path depending on working directory of Jupyter environment
df = pd.read_csv("../resources/AMTOVZ_CSV_LV95.csv", sep=";")

Kantonsnamen

In [2]:
cantons = {
    "Aargau",
    "Appenzell Innerrhoden",
    "Innerrhoden",
    "Appenzell Ausserrhoden",
    "Ausserrhoden",
    "Bern",
    "Basel-Landschaft",
    "Basel-Land",
    "Basel Land",
    "Baselland",
    "Baselbiet",
    "Basel-Stadt",
    "Basel Stadt",
    "Baselstadt",
    "Fribourg",
    "Freiburg",
    "Genève",
    "Genf",
    "Glarus",
    "Graubünden",
    "Grischun",
    "Jura",
    "Luzern",
    "Neuchâtel",
    "Neuenburg",
    "Nidwalden",
    "Obwalden",
    "St. Gallen",
    "Sankt Gallen",
    "Schaffhausen",
    "Solothurn",
    "Schwyz",
    "Thurgau",
    "Ticino",
    "Tessin",
    "Uri",
    "Vaud",
    "Waadt",
    "Valais",
    "Wallis",
    "Zug",
    "Zürich",
}

In [3]:
df

Unnamed: 0,Ortschaftsname,PLZ,Zusatzziffer,Gemeindename,BFS-Nr,Kantonskürzel,E,N,Sprache,Validity
0,Aeugst am Albis,8914,0,Aeugst am Albis,1,ZH,2679402.872,1235842.010,de,2008-07-01
1,Aeugstertal,8914,2,Aeugst am Albis,1,ZH,2679815.372,1237404.310,de,2008-07-01
2,Zwillikon,8909,0,Affoltern am Albis,2,ZH,2675280.133,1238108.286,de,2008-07-01
3,Affoltern am Albis,8910,0,Affoltern am Albis,2,ZH,2676852.012,1236929.718,de,2008-07-01
4,Bonstetten,8906,0,Bonstetten,3,ZH,2677412.150,1241078.278,de,2008-07-01
...,...,...,...,...,...,...,...,...,...,...
5729,Mauren FL,9493,0,Mauren,7008,,2759637.820,1231945.121,de,2008-07-01
5730,Nendeln,9485,0,Gamprin,7009,,2759978.270,1229651.126,de,2008-07-01
5731,Gamprin-Bendern,9487,0,Gamprin,7009,,2757125.670,1232514.364,de,2008-07-01
5732,Ruggell,9491,0,Ruggell,7010,,2758648.719,1235172.319,de,2008-07-01


Analyse der kürzesten Ortschaft- und Gemeindesnamen in Hinblick auf allfällige False Positives.

In [4]:
sorted(list(set(df["Ortschaftsname"])), key=lambda x: len(x))

['Lü',
 'Gy',
 'Vex',
 'Vnà',
 'Loc',
 'Fey',
 'Auw',
 'Sur',
 'Oey',
 'Sax',
 'Lax',
 'Asp',
 'Zug',
 'Fex',
 'Rue',
 'Nax',
 'Naz',
 'Ins',
 'Bex',
 'Elm',
 'Juf',
 'Mon',
 'Binz',
 'Yens',
 'Chur',
 'Nivo',
 'Mels',
 'Erde',
 'Agno',
 'Vrin',
 'Brig',
 'Gudo',
 'Caux',
 'Baar',
 'Onex',
 'Ursy',
 'Tann',
 'Orny',
 'Bôle',
 'Dizy',
 'Horw',
 'Glis',
 'Lyss',
 'Aran',
 'Aven',
 'Rain',
 'Nyon',
 'Maur',
 'Pfyn',
 'Boll',
 'Embd',
 'Mund',
 'Sent',
 'Birr',
 'Sarn',
 'Riex',
 'Iseo',
 'Gals',
 'Eriz',
 'Worb',
 'Mols',
 'Trey',
 'Engi',
 'Dino',
 'Port',
 'Riom',
 'Croy',
 'Says',
 'Wila',
 'Etoy',
 'Cimo',
 'Ftan',
 'Aïre',
 'Pura',
 'Ayer',
 'Jens',
 'Vals',
 'Agra',
 'Watt',
 'Zuoz',
 'Lüen',
 'Noës',
 'Giez',
 'Visp',
 'Nohl',
 'Gohl',
 'Vich',
 'Bure',
 'Präz',
 'Alle',
 'Buus',
 'Arzo',
 'Igis',
 'Pomy',
 'Seon',
 'Miex',
 'Pizy',
 'Trun',
 'Loco',
 'Riaz',
 'Suhr',
 'Font',
 'Gais',
 'Elgg',
 'Sutz',
 'Murg',
 'Mies',
 'Thal',
 'Eich',
 'Thun',
 'Pany',
 'Höri',
 'Arch',
 'Matt'

In [5]:
sorted(list(set(df["Gemeindename"])), key=lambda x: len(x))

['Gy',
 'Egg',
 'Vex',
 'Fey',
 'Auw',
 'Lax',
 'Zug',
 'Rue',
 'Ins',
 'Bex',
 'Yens',
 'Chur',
 'Mels',
 'Agno',
 'Prez',
 'Baar',
 'Onex',
 'Ursy',
 'Orny',
 'Dizy',
 'Horw',
 'Lyss',
 'Rain',
 'Bühl',
 'Nyon',
 'Maur',
 'Pfyn',
 'Embd',
 'Birr',
 'Gals',
 'Eriz',
 'Worb',
 'Trey',
 'Port',
 'Croy',
 'Avry',
 'Wila',
 'Etoy',
 'Pura',
 'Oron',
 'Jens',
 'Vals',
 'Zuoz',
 'Giez',
 'Visp',
 'Lenk',
 'Vich',
 'Bure',
 'Alle',
 'Buus',
 'Pomy',
 'Seon',
 'Leuk',
 'Trun',
 'Riaz',
 'Suhr',
 'Gais',
 'Elgg',
 'Mies',
 'Thal',
 'Eich',
 'Thun',
 'Höri',
 'Arch',
 'Bern',
 'Fiez',
 'Arth',
 'Sion',
 'Binn',
 'Laax',
 'Lens',
 'Belp',
 'Orbe',
 'Trub',
 'Cama',
 'Broc',
 'Goms',
 'Dorf',
 'Rafz',
 'Fahy',
 'Cham',
 'Gams',
 'Horn',
 'Root',
 'Nods',
 'Trin',
 'Jaun',
 'Sins',
 'Erlen',
 'Furna',
 'Jonen',
 'Féchy',
 'Syens',
 'Gland',
 'Nidau',
 'Grône',
 'Wängi',
 'Soral',
 'Curio',
 'Augst',
 'Realp',
 'Flims',
 'Vaduz',
 'Arosa',
 'Fully',
 'Wäldi',
 'Sévaz',
 'Elsau',
 'Grabs',
 'Jussy',

In [6]:
localities = set(df["Ortschaftsname"])
municipalities = set(df["Gemeindename"])
places = list(localities | municipalities | cantons)

In [7]:
len(localities), len(municipalities), len(localities | municipalities), len(places)

(3973, 2143, 4425, 4455)

Ortschaften und Gemeinden werden anhand folgender Regeln expandiert:

1. b. -> bei
2. S. -> Sogn, San, Sant', Sankt, Saint, Saint-
3. Prefixes: Au ZH, Au (SG), ... -> Au
4. Slash-getrennte Schreibweisen innerhalb von Prefixes: Brienz/Brinzauls -> Brienz, Brinzauls

In [8]:
places_aug = []
for place in places:
    # Expanding abbreviations
    if "b." in place:
        places_aug.append(place.replace("b.", "bei"))
    if "S. " in place:
        for subs in ["Sogn ", "San ", "Sant'", "Sankt ", "Saint ", "Saint-"]:
            places_aug.append(place.replace("S. ", subs))

    # Extract prefixes
    prefix = re.sub("(.*?) (?:(?:\(.*\)|[A-Z][A-Z]|b(?:\.|ei)|im).*)", r"\1", place)
    places_aug.append(prefix)

    # Does not handle suffixes
    if "/" in prefix:
        for subplace in prefix.split("/"):
            places_aug.append(subplace.strip())

places = list(set(places) | set(places_aug))

# Sort in reverse to match longer names first
places = sorted(places, key=lambda x: len(x), reverse=True)

In [9]:
places[:100]

['Comunanza Cadenazzo/Monteceneri',
 'Sankt Antonio (Val Morobbia)',
 "Crête-à-l'Oeil (Les Agettes)",
 'Saint-Antonio (Val Morobbia)',
 'Saint Antonio (Val Morobbia)',
 "Sant'Antonio (Val Morobbia)",
 'Deisswil bei Münchenbuchsee',
 'Sogn Antonio (Val Morobbia)',
 'Flumserberg Tannenbodenalp',
 'Weissenstein bei Solothurn',
 'Schwendi im Weisstannental',
 'Rudolfstetten-Friedlisberg',
 'Aeschlen bei Oberdiessbach',
 'Deisswil b. Münchenbuchsee',
 'Röthenbach Herzogenbuchsee',
 'San Antonio (Val Morobbia)',
 'Weissenstein b. Solothurn',
 'Finsterwald bei Entlebuch',
 'Les Geneveys-sur-Coffrane',
 'Fontaine Dessous (Liddes)',
 'Niederried bei Interlaken',
 'Aeschlen b. Oberdiessbach',
 'Vuisternens-devant-Romont',
 'S. Antonio (Val Morobbia)',
 'Bleiken bei Oberdiessbach',
 'Chavannes-sous-Orsonnens',
 'Hettiswil bei Hindelbank',
 'Reichenbach im Kandertal',
 'Saint Carlo (Val Bavona)',
 'Bangerten bei Dieterswil',
 'Bleiken b. Oberdiessbach',
 'Welschenrohr-Gänsbrunnen',
 'Fontaine Dess

In [10]:
places[-100:]

['Gals',
 'Worb',
 'Trey',
 'Engi',
 'Dino',
 'Port',
 'Riom',
 'Croy',
 'Avry',
 'Etoy',
 'Ftan',
 'Aïre',
 'Pura',
 'Ayer',
 'Oron',
 'Riet',
 'Vals',
 'Watt',
 'Noës',
 'Giez',
 'Praz',
 'Visp',
 'Lenk',
 'Gohl',
 'Jura',
 'Vira',
 'Vich',
 'Präz',
 'Alle',
 'Arzo',
 'Niva',
 'Igis',
 'Seon',
 'Pizy',
 'Gabi',
 'Uors',
 'Trun',
 'Loco',
 'Bänk',
 'Grüt',
 'Sutz',
 'Thal',
 'Thun',
 'Pany',
 'Höri',
 'Burg',
 'Mase',
 'Arth',
 'Laax',
 'Lens',
 'Pont',
 'Osco',
 'Orbe',
 'Trub',
 'Cama',
 'Broc',
 'Goms',
 'Dorf',
 'Rafz',
 'Genf',
 'Cham',
 'Buix',
 'Muri',
 'Nods',
 'Lohn',
 'Trin',
 'Jaun',
 'Sins',
 'Egg',
 'Vex',
 'Vnà',
 'Auw',
 'Oey',
 'Sax',
 'Lax',
 'Ems',
 'Asp',
 'Zug',
 'Fex',
 'Mur',
 'Rue',
 'Vaz',
 'Bex',
 'Elm',
 'Uri',
 'Loc',
 'Lai',
 'Fey',
 'Sur',
 'Mex',
 'Nax',
 'Naz',
 'Ins',
 'Juf',
 'Luc',
 'Mon',
 'Wil',
 'Au',
 'Lü',
 'Gy']

Orte escapen und zu Regex-Konjunktion zusammensetzen.

In [11]:
places = [re.escape(place) for place in places]
loc_regex = f"\\b(?:{'|'.join(places)})\\b"
print(loc_regex)

\b(?:Comunanza\ Cadenazzo/Monteceneri|Sankt\ Antonio\ \(Val\ Morobbia\)|Crête\-à\-l'Oeil\ \(Les\ Agettes\)|Saint\-Antonio\ \(Val\ Morobbia\)|Saint\ Antonio\ \(Val\ Morobbia\)|Sant'Antonio\ \(Val\ Morobbia\)|Deisswil\ bei\ Münchenbuchsee|Sogn\ Antonio\ \(Val\ Morobbia\)|Flumserberg\ Tannenbodenalp|Weissenstein\ bei\ Solothurn|Schwendi\ im\ Weisstannental|Rudolfstetten\-Friedlisberg|Aeschlen\ bei\ Oberdiessbach|Deisswil\ b\.\ Münchenbuchsee|Röthenbach\ Herzogenbuchsee|San\ Antonio\ \(Val\ Morobbia\)|Weissenstein\ b\.\ Solothurn|Finsterwald\ bei\ Entlebuch|Les\ Geneveys\-sur\-Coffrane|Fontaine\ Dessous\ \(Liddes\)|Niederried\ bei\ Interlaken|Aeschlen\ b\.\ Oberdiessbach|Vuisternens\-devant\-Romont|S\.\ Antonio\ \(Val\ Morobbia\)|Bleiken\ bei\ Oberdiessbach|Chavannes\-sous\-Orsonnens|Hettiswil\ bei\ Hindelbank|Reichenbach\ im\ Kandertal|Saint\ Carlo\ \(Val\ Bavona\)|Bangerten\ bei\ Dieterswil|Bleiken\ b\.\ Oberdiessbach|Welschenrohr\-Gänsbrunnen|Fontaine\ Dessus\ \(Liddes\)|Saint\-Carlo\ \

Export

In [12]:
# Modify path depending on working directory of Jupyter environment
with open("../resources/locs_within_orgs_regex.txt", "w") as f:
    f.write(loc_regex)