## Create language data

In [242]:
# Import dependencies
import pandas as pd
import regex as re
import numpy as np

# Input directory holding the Glottolog CSV files. File names are appended with
# plain string concatenation (path + '<file>.csv'), so the trailing slash matters.
# Note: `path` is reassigned to the output directory in the final "Save" cell.
path = "data/glottolog/"

In [243]:
# Load the languoid dataset, keep only the columns needed downstream (under
# shorter names) and tag every row with the source it came from.
languoid_columns = {
    'id': 'id',
    'name': 'name',
    'iso639P3code': 'iso',
    'level': 'level',
    'family_id': 'family',
    'country_ids': 'country',
    'latitude': 'lat',
    'longitude': 'lon',
}

df = (
    pd.read_csv(path + 'languoid.csv', delimiter=',', encoding="utf-8")
    .loc[:, list(languoid_columns)]      # select, in this order
    .rename(columns=languoid_columns)    # original name -> short name
    .assign(source='glottolog')          # provenance marker
)

# Number of rows loaded
print(len(df))

26416


In [244]:
# Load the geo dataset, which supplies the macroarea column.
# The new column names are applied positionally, so the file must contain
# exactly these seven columns in this order.
geo_columns = ['id', 'name', 'iso', 'level', 'macroarea', 'lat', 'lon']

df_geo = (
    pd.read_csv(path + 'languages_and_dialects_geo.csv', delimiter=',', encoding="utf-8")
    .set_axis(geo_columns, axis=1)
    .assign(source='glottolog')  # provenance marker, same value as in `df`
)

# Number of rows loaded
print(len(df_geo))

21746


In [245]:
# Outer-join the two tables on every column they share, so that `macroarea`
# from df_geo is attached to the matching rows of df.
# NOTE(review): the join keys include the float columns 'lat' and 'lon'; rows
# whose coordinates (or any other shared value) differ between the two files
# will not match and end up as two separate rows - confirm this is intended.
shared_columns = ["name", "iso", "level", "id", "lat", "lon", "source"]
df = df.merge(df_geo, how='outer', on=shared_columns)

# Keep only the first row for each (id, name) pair and renumber the index
df = df.drop_duplicates(subset=['id', 'name'], keep='first', ignore_index=True)
print(len(df))

26416


In [246]:
# Remove rows belonging to the bookkeeping, sign language, unattested and
# artificial families. Rows with a missing family are kept.
excluded_families = ['book1242', 'sign1238', 'unat1236', 'arti1236']
df = df[~df['family'].isin(excluded_families)]

# Number of rows remaining
print(len(df))

25590


In [247]:
# Manually name families?
# Maps selected Glottolog family codes to readable family names. Codes that are
# not listed here (e.g. 'sout2845') are left unchanged.
FAMILY_NAMES = {
    'sino1245': "Sino-Tibetan",
    'drav1251': "Dravidian",
    'atla1278': "Atlantic-Congo",
    'aust1307': "Austronesian",
    'indo1319': "Indo-European",
    'afro1255': "Afro-Asiatic",
    'aust1305': "Austroasiatic",
    'nucl1709': "Nuclear Trans New Guinea",
    'pama1250': "Pama-Nyungan",
    'tupi1275': "Tupian",
    'tuuu1241': "Tuu",
    'ural1272': "Uralic",
    'turk1311': "Turkic",
    'khoe1240': "Khoe-Kwadi",
    'lake1255': "Lakes Plain",
    'japo1237': "Japonic",
    'kxaa1236': "Kxa",
    'utoa1244': "Uto-Aztecan",
    'mong1349': "Mongolic-Khitan",
    'koia1260': "Koiarian",
    'zapa1251': "Zaparoan",
    'jara1244': "Jarawa-Onge",
}

# One exact-match lookup per row. Missing family values stay missing: converting
# every value with str() would turn them into the literal string 'nan'.
# Using assign() builds a new frame instead of writing into a filtered slice.
df = df.assign(family=df['family'].map(lambda code: FAMILY_NAMES.get(code, code)))

In [248]:
# Drop if no code
# df = df[df['iso'].notna()]
# print(df.shape)

# Drop dialects
# df = df[df['level'] != 'dialect']

In [249]:
# Put the columns into their final order
column_order = ['id', 'name', 'iso', 'level', 'family', 'macroarea', 'country', 'lat', 'lon', 'source']
df = df.loc[:, column_order]

df

Unnamed: 0,id,name,iso,level,family,macroarea,country,lat,lon,source
0,3adt1234,3Ad-Tekles,,dialect,Afro-Asiatic,Africa,,,,glottolog
1,aala1237,Aalawa,,dialect,Austronesian,Papunesia,,,,glottolog
2,aant1238,Aantantara,,dialect,Nuclear Trans New Guinea,Papunesia,,,,glottolog
3,aari1238,Aari-Gayil,aiz,family,sout2845,,,,,glottolog
4,aari1239,Aari,aiw,language,sout2845,Africa,ET,5.95034,36.5721,glottolog
...,...,...,...,...,...,...,...,...,...,...
26411,zuti1239,Guajajára of Zutiua,,dialect,Tupian,South America,,,,glottolog
26412,zuwa1238,Zuwadza,,dialect,Koiarian,Papunesia,,,,glottolog
26413,zwal1238,Zwall,,dialect,Atlantic-Congo,Africa,,,,glottolog
26414,zyph1238,Zyphe,zyp,language,Sino-Tibetan,Eurasia,IN MM,22.52400,93.2640,glottolog


In [250]:
# Add coordinates by hand
# name -> (lat, lon). Every row whose name matches gets these coordinates,
# overwriting whatever was there; names that match no row are silently skipped.
MANUAL_COORDINATES = {
    'Arabic': (27.96, 43.85),          # Coordinates of Standard Arabic, OK
    'West Germanic': (48, 9),
    'Semitic': (29, 40),
    'Anglo-Norman': (51, -1),
    'Romance': (44, 12),
    'Medieval Latin': (46.2, 1.3),     # Western Europe coordinates
    'Late Latin': (46.2, 1.3),         # Western Europe coordinates
    'Germanic': (48.25, 10.60),        # Middle High German
    'New Latin': (46.2, 1.3),          # Western Europe coordinates
    'Slavic': (43.72, 22.84),          # Church Slavic???
    'Brunei Malay': (4.53, 114.72),    # Coordinates of Brunei, OK?
    'Fante': (6.35, -1.33),            # Coordinates of hyperglot Akan, OK
    'Twi': (6.35, -1.33),              # Coordinates of hyperglot Akan, OK
    'Mongolian': (48.32, 106.29),      # Coordinates of hypoglot, Halh Mongolian (most spoken dialect) OK
    'Volapük': (49, 8.24),             # Coordinates of Karlsruhe, Baden OK
    'Luxemburgish': (49.68, 6.15),     # Coordinates of hyperglot, Moselle Franconian OK
    # NOTE(review): this entry used to repeat the Luxemburgish coordinates
    # (49.68, 6.15), which placed Armenian in Luxembourg. Replaced with an
    # approximate location in Armenia - confirm the exact value to use.
    'Armenian': (40.0, 45.0),
    'Dravidian': (11, 78),             # Coordinates of Old Tamil
    'Aramaic': (33.91, 42.19),         # Coordinates of Imperial Aramaic (700-300 BCE)
    'Turkic': (55.49, 47.16),          # Coordinates of Chuvash, cognate of bors.
    'Iranian': (39.70, 66.98),         # Coordinates of Sogdian, cognate bors
}

for language, coordinates in MANUAL_COORDINATES.items():
    df.loc[df['name'] == language, ['lat', 'lon']] = coordinates

# Modify names
# Strips the date ranges from these names (exact match on the full old name).
NAME_REPLACEMENTS = {
    'Old English (ca. 450-1100)': 'Old English',
    'Old French (842-ca. 1400)': 'Old French',
    'Jewish Babylonian Aramaic (ca. 200-1200 CE)': 'Jewish Babylonian Aramaic',
}

for old_name, new_name in NAME_REPLACEMENTS.items():
    df.loc[df['name'] == old_name, 'name'] = new_name
# df.loc[df.name == 'Pahlavi', ['name']] = 'Middle Persian'
# Late Middle Indo-Aryan in the new instead of Middle Indo Aryan
# Late Middle Indo-Aryan in the new instead of Middle Indo Aryan

In [251]:
# Add language
# Hand-written proto-language rows (source='manual'). Values are given in the
# current column order of `df`:
# id, name, iso, level, family, macroarea, country, lat, lon, source.
# np.nan is used for missing values; the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): the macroarea values shown for the Glottolog rows in this
# notebook are e.g. 'Eurasia', 'Africa', 'Papunesia'; 'Asia' below does not
# match any of them - confirm this is intended.
manual_rows = pd.DataFrame(
    [
        [np.nan, "Proto-Dravidian", np.nan, "proto", "Dravidian", "Asia", np.nan, 11.00, 78.00, "manual"],
        [np.nan, "Proto-Iranian", np.nan, "proto", "Indo-European", "Asia", np.nan, 45, 65, "manual"],  # Loc of Andronovo culture
        [np.nan, "Proto-Sino-Tibetan", np.nan, "proto", "Sino-Tibetan", "Asia", np.nan, 25, 103, "manual"],  # Loc of Yunnan
    ],
    columns=df.columns,
).astype(df.dtypes.to_dict())  # match df's dtypes so all-missing columns do not alter them on concat

# Append all rows in one step instead of growing the frame one row at a time
df = pd.concat([df, manual_rows], ignore_index=True)

# Sort by alphabetical order (name first, family as tie-breaker) and renumber
df = df.sort_values(['name', 'family']).reset_index(drop=True)

df

Unnamed: 0,id,name,iso,level,family,macroarea,country,lat,lon,source
0,gane1238,!Gã!ne,,language,Tuu,Africa,ZA,-31.3200,28.7500,glottolog
1,kwii1241,!Ui,,family,Tuu,,,,,glottolog
2,abda1238,'Abd Al-Kuri,,dialect,Afro-Asiatic,Africa,YE,12.1959,52.2282,glottolog
3,aden1242,'Aden,,dialect,Afro-Asiatic,Eurasia,,,,glottolog
4,alga1234,'Algaden,,dialect,Afro-Asiatic,Africa,,,,glottolog
...,...,...,...,...,...,...,...,...,...,...
25588,oita1237,Ōita-ben,,dialect,Japonic,Eurasia,,,,glottolog
25589,omut1237,Ōmuta-ben,,dialect,Japonic,Eurasia,,,,glottolog
25590,osak1237,Ōsaka,,dialect,Japonic,Eurasia,,,,glottolog
25591,suma1275,Šumadija-Vojvodina,,dialect,Indo-European,Eurasia,,,,glottolog


In [252]:
# The 'name' column is called 'language' from here on
df = df.rename({"name": "language"}, axis="columns")

In [253]:
# Save
# From here on `path` refers to the output directory, not the input directory.
path = "data/languages/"

# Write data out: keep an independent copy under the name `languages` and store
# it as CSV. With to_csv's defaults the index is written as the first,
# unnamed column.
languages = df.copy()
output_file = path + 'languages.csv'
languages.to_csv(output_file)