# Place names

Data from Pleiades and Wiki

In [1]:
# TODO: add https://en.wikipedia.org/wiki/List_of_Greek_place_names

In [2]:
import json

In [3]:
with open("pleiades-places-latest.json", "rb") as fo:
    pleiades = json.load(fo)

In [4]:
len(pleiades["@graph"])

37500

In [5]:
def is_greek_char(char: str):
    if 880 <= ord(char) <= 1023 or 7936 <= ord(char) <= 8191:
        return True
    return False

is_greek_char("ά")

True

In [6]:
pleiades_names = list()
for item in pleiades["@graph"]:
    for name in item["names"]:
        # "ancient greek" only in name["description"] 55 times, so not reliable; "greek" 129
        # https://www.ssec.wisc.edu/~tomw/java/unicode.html
        # Ancient Greek hex range: 0370—03FF == 880-1023
        # Greek Extended: 1F00—1FFF == 7936-8191
        # if all(), then 1329 Greek names are found
        # if any(), then 1585; not a big diff, use the ones we know are right
        if name["attested"] and all([is_greek_char(x) for x in name["attested"]]):
                pleiades_names.append(name["attested"])
print("Total:", len(pleiades_names))

Total: 1329


In [7]:
# also "place names" from wiki: https://en.wikipedia.org/wiki/List_of_Greek_place_names
with open("place-names.txt") as fo:
    wiki_names = fo.readlines()
wiki_names = [s.strip() for s in pleiades_names]

In [8]:
len(wiki_names)

1329

In [9]:
print(wiki_names[:3])

['Μαινάκῃ', 'Ὠρητανοί', 'Πιτυοῦσσαι']


In [10]:
names = pleiades_names + wiki_names
print(len(names))

2658


In [11]:
# remove multiword place names
# this can be reenabled later
names = [n for n in names if len(n.split()) == 1]
print(len(names))

2658


In [12]:
# trim suffix of each word unless short
# doing this as a naive match to declined forms in real texts
def bad_stemming(names):
    names_new = list()
    for name in names:
        femenine_endings = ["ά", "α", "η", "ή"]
        masculine_neuter_endings = ["ος", "ός", "ον", "όν"]
        consonants = list("βγδζθκλμνξπρσςτυφχψ")
        if not len(name) > 3:
            names_new.append(name)
        elif any([end for end in femenine_endings if name.endswith(end)]):
            names_new.append(name[:-1])
        elif any([end for end in masculine_neuter_endings if name.endswith(end)]):
            names_new.append(name[:-2])
        elif any([end for end in consonants if name.endswith(end)]):
            names_new.append(name)
        else:
            names_new.append(name)
    return names_new

In [15]:
names_stemmed = bad_stemming(names=names)
print(names_stemmed[:25])

['Μαινάκῃ', 'Ὠρητανοί', 'Πιτυοῦσσαι', 'Σκομβραρί', 'Ἀβίλυχ', 'Ἥρας', 'Λίξ', 'Καρμανί', 'Ζουχάββαρι', 'Κεδρωσί', 'Γαδρωσί', 'Γεδρωσί', 'Ὀάρακτ', 'Προφθασί', 'Σάρνι', 'Κώφας', 'Ὠγυρις', 'Ὀργάν', 'Νόαρ', 'Βήθηλά', 'Βήθηγ', 'Κουκονήσι', 'Δισπηλιό', 'Ἰάνουκλ', 'Στροφάδες']


In [16]:
with open("place-names-stemmed.txt", "w") as fo:
    fo.write("\n".join(names_stemmed))

# Personal names

Data from The Lexicon of Greek Personal Names (LGPN).

http://clas-lgpn5.classics.ox.ac.uk:8080/exist/apps/lgpn1-search/index.html?name=%CE%92*

In [18]:
with open("personal-names.txt") as fo:
    personal_names = fo.readlines()
personal_names = [s.strip() for s in personal_names]

In [19]:
print(len(personal_names))

39484


In [20]:
# ~1k of these words have english apostrophes in them; don't know why don't care
personal_names_grk = list()
for name in personal_names:
    if all([is_greek_char(x) for x in name]):
        personal_names_grk.append(name)
print(len(personal_names_grk))

38508


In [21]:
personal_names_stemmed = bad_stemming(names=personal_names_grk)

In [22]:
print(len(personal_names_stemmed))

38508


In [23]:
personal_names_stemmed[:5]

['', '῎Αβ', 'Αβα', '῎Αβαβ', 'Ἀβαεόδωρ']

In [24]:
with open("personal-names-stemmed.txt", "w") as fo:
    fo.write("\n".join(personal_names_stemmed))