This notebook produces the list of location names used to synthetically fill the anonymized FSCD decision documents.

In [42]:
import pandas as pd
import json
import os

In [109]:
# Country codes whose place names are collected; each has its own
# sub-directory below raw_loc_path.
countries = [
    'de',
    'ch',
    'fr',
    'at',
    'it',
]

# Input directory with the raw per-country place files, and output directory
# for the final merged list.
raw_loc_path = "auxiliary_data/locations/"
save_path = "auxiliary_data/final/"

# Place files per country, ordered from largest to smallest settlement type.
file_names = [
    'place-city.ndjson',
    'place-town.ndjson',
    'place-village.ndjson',
    'place-hamlet.ndjson',
]


In [58]:
# Collect the place names of every country into `data` (country code -> Series
# of names) and record how many names were kept in `data_stats['size']`.
data = {}

data_stats = {}
data_stats['size'] = {}

for country in countries:

    country_path = os.path.join(raw_loc_path, country)

    country_dfs = []

    # file_names is ordered city, town, village, hamlet: Swiss places use the
    # first three types, all other countries only cities and towns.
    types = file_names[:3] if country == 'ch' else file_names[:2]

    for file_name in types:
        path = os.path.join(country_path, file_name)

        try:
            country_dfs.append(pd.read_json(path, lines=True)["name"])

        # Best effort: a missing/unreadable file (OSError), malformed JSON
        # (ValueError) or a file without a "name" column (KeyError) is reported
        # and skipped. Unlike a bare `except`, this lets KeyboardInterrupt and
        # genuine programming errors propagate.
        except (OSError, ValueError, KeyError) as err:
            print(f"error in {country} - {file_name}: {err}")

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that names the affected country instead.
    if not country_dfs:
        raise ValueError(f"no location files could be loaded for {country} from {country_path}")

    # Merge all place types of the country and drop entries without a name.
    data[country] = pd.concat(country_dfs, axis=0, ignore_index=True).dropna()
    data_stats['size'][country] = len(data[country])
        

In [60]:
# Inspect the Swiss names collected above (from the city, town and village files).
data['ch']

0          Zürich
1          Genève
2        Lausanne
3            Bern
4          Luzern
          ...    
3763      Samnaun
3764        Laret
3765     Ravaisch
3766         Plan
3767    Compatsch
Name: name, Length: 3757, dtype: object

In [59]:
# Number of names kept per country after dropping missing values.
data_stats['size']

{'de': 2433, 'ch': 3757, 'fr': 1231, 'at': 241, 'it': 1316}

In [97]:
# Stack the per-country name Series (in the order of `countries`) into one
# single-column DataFrame with a fresh 0..n-1 index.
loc_data = pd.concat(
    [data[code] for code in countries],
    ignore_index=True,
).to_frame(name="name")

In [98]:
# Spot check: 30 randomly drawn rows, with a fixed seed so the sample is reproducible.
loc_data.sample(30,random_state=42)


Unnamed: 0,name
6683,Saint-Maximin-la-Sainte-Baume
93,Leipzig
8447,Loreto
7657,Poysdorf
994,Kamp-Lintfort
2683,Genthod
4985,Gerra
8746,Treviolo
6159,Madulain
3460,Middes


In [99]:
# Total number of names across all countries (no de-duplication has been applied yet).
print(f"number of locations in loc_data : {len(loc_data)}")

number of locations in loc_data : 8978


In [100]:
# Load the additional Swiss place names. The path is built with os.path.join
# from raw_loc_path instead of a hard-coded backslash string: backslashes only
# work as separators on Windows, and "\l" / "\s" are invalid escape sequences
# in a normal string literal.
municipalities = pd.read_csv(os.path.join(raw_loc_path, "swiss_place_names.csv"))

print(f"length of swiss municipalites datset : {len(municipalities)}")
municipalities.head()

length of swiss municipalites datset : 2175


Unnamed: 0,name
0,Aeugst am Albis
1,Affoltern am Albis
2,Bonstetten
3,Hausen am Albis
4,Hedingen


In [107]:
# Extend loc_data with the Swiss municipalities and remove duplicate rows.
# The concatenated index is renumbered first (ignore_index=True);
# drop_duplicates then keeps the first occurrence and its index label, so the
# resulting index has gaps.
all_loc_data = pd.concat([loc_data, municipalities], ignore_index=True, axis=0).drop_duplicates()

# Only the last expression of a cell is displayed, so the former intermediate
# `all_loc_data.head()` call had no effect and was removed.
all_loc_data.sample(10, random_state=42)

Unnamed: 0,name
9851,Eppenberg-Wöschnau
7244,Saint-Amand-les-Eaux
2394,Marlow
3413,Couvet
533,Eislingen/Fils
3069,Saint-Prex
7734,Brindisi
7611,St. Pölten
7684,Reggio Emilia
1421,Halberstadt


In [108]:
# Number of rows left after merging in the Swiss municipalities and dropping duplicates.
print(f"length of final dataset : {len(all_loc_data)}")

length of final dataset : 8965


Save the dataset

In [113]:
# Create the output directory if needed; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

# Write the final list as a CSV without the index column.
all_loc_data.to_csv(os.path.join(save_path, "location_names.csv"), index=False)