# Wiktionary extractor

In [358]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objs as go

# response = requests.get("https://en.wiktionary.org/wiki/cinnamon")
# soup = BeautifulSoup(response.text, 'html.parser')
# soup

## Crawl and organize data

In [359]:
# Variables
key = 'cinnamon'
path = 'data\\wiktionary\\'

In [360]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL
url = f"https://en.wiktionary.org/wiki/cinnamon"

# Get the HTML content of the page
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Define the list of ids
ids = ['Translations-spice', 'Translations-\'\'Cinnamomum_verum\'\'']

# Create a list of tuples containing the language, translation, sense, and level
translations = []
for id in ids:
    # Get the translations for the specific noun sense
    translations_div = soup.find('div', {'id': id})

    if translations_div is not None:
        for li in translations_div.find_all('li'):
            split_text = li.get_text().split(":", 1)
            lang = split_text[0]
            trans = split_text[1].split('\n')[0] if len(split_text) > 1 else ''
            sense = id.split('-')[1]  # Extract the sense from the id
            level = 'main'  # This row is not from a level
            translations.append((lang, trans, sense, level))

            # Check if there are sub-items (dl and dd tags)
            dl = li.find('dl')
            if dl is not None:
                for dd in dl.find_all('dd'):
                    # Extract language and translation from the dd tag
                    split_text = dd.get_text().split(":", 1)
                    lang = split_text[0]
                    trans = split_text[1].split('\n')[0] if len(split_text) > 1 else ''
                    level = 'sub'  # This row is from a level
                    translations.append((lang, trans, sense, level))

# Create a DataFrame from the list of tuples
df = pd.DataFrame(translations, columns=['language', 'item', 'sense', 'level'])
df

Unnamed: 0,language,item,sense,level
0,Afrikaans,kaneel,spice,main
1,Albanian,kanellë (sq) f,spice,main
2,Amharic,ቀረፋ (ḳäräfa),spice,main
3,Arabic,⁧قِرْفَة⁩ f (qirfa),spice,main
4,Egyptian Arabic,⁧قرفة⁩ f (ʔerfa),spice,sub
...,...,...,...,...
169,Thai,อบเชย (th) (òp-chəəi),''Cinnamomum_verum'',main
170,Tibetan,ཤིང་ཚ (shing tsha),''Cinnamomum_verum'',main
171,Turkish,tarçın ağacı,''Cinnamomum_verum'',main
172,Vietnamese,cây quế,''Cinnamomum_verum'',main


In [361]:
# # Replace "" with NaN
# df['item'] = df['item'].replace('', pd.NA)

# # Use backfill for NA in items
# df['item'] = df['item'].fillna(method='bfill')

# Define a function to split on commas not inside parentheses
def split_not_in_parentheses(s):
    return re.split(r',\s*(?![^()]*\))', s)

# Apply the function to the 'item' column
df['item'] = df['item'].apply(split_not_in_parentheses)

# Explode the 'item' column
df = df.explode('item')

# Drop duplicates (keep first with spice sense)
df = df.drop_duplicates(subset = ['language', 'item'], keep = 'first').reset_index(drop = True)

# Cleaning
df['item'] = [re.sub('\xa0', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' m ', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' f ', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' n ', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' c ', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' pl ', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' m,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r' f,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r' n,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r' c,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r' pl,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r' m$', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r' f$', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r' n$', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r' c$', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r' pl$', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(bcl\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(nds\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(scn\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(ast\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(Föhr-Amrum\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\s?\(\w\w\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\(please verify\)', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\s+', " ", str(x)) for x in df['item']]
df['item'] = [re.sub(r' ,', ",", str(x)) for x in df['item']]
df['item'] = [re.sub(r'^\s', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\s$', "", str(x)) for x in df['item']]

# Other
df['item'] = [re.sub(r"\(taraškievica\)", "", str(x)) for x in df['item']]
df['item'] = [re.sub(r"\(collective\)", "", str(x)) for x in df['item']]
df['item'] = [re.sub(r"class 9/10", "", str(x)) for x in df['item']]

# Change ( and ) to * and * 
df['item'] = [re.sub(r'\(', "*", str(x)) for x in df['item']]
df['item'] = [re.sub(r'\)', "*", str(x)) for x in df['item']]

# Remove ⁧ and ⁩ from item
df['item'] = [re.sub(r'⁧', "", str(x)) for x in df['item']]
df['item'] = [re.sub(r'⁩', "", str(x)) for x in df['item']]

# drop NA
df = df[df.item != "please add this translation if you can"]

# If there is a star in the item column, split the column into 'script' and 'transliteration' columns
df['term'] = df['item'].apply(lambda x: x.split('*')[0].strip())
df['transliteration'] = df['item'].apply(lambda x: x.split('*')[1].strip() if len(x.split('*')) > 1 else None)

# CLean '' in the sense column
df['sense'] = [re.sub(r"''", "", str(x)) for x in df['sense']]

# Reorder by alphabetizing the language column
# df = df.sort_values('language').reset_index(drop=True)

# Create source
df['source'] = 'Wiktionary'
df['group'] = ''
df['skip'] = ''

# reorder
df = df[['skip', 'language', 'term', 'transliteration', 'item', 'group', 'sense', 'source']]

# Glottologize
# glottologize()

# Save the DataFrame to file
# df.to_excel(path + f'{key}_gen.xlsx', sheet_name='wiktionary', index=None)

# Print
df.head(60)

Unnamed: 0,skip,language,term,transliteration,item,group,sense,source
0,,Afrikaans,kaneel,,kaneel,,spice,Wiktionary
1,,Albanian,kanellë,,kanellë,,spice,Wiktionary
2,,Amharic,ቀረፋ,ḳäräfa,ቀረፋ *ḳäräfa*,,spice,Wiktionary
3,,Arabic,قِرْفَة,qirfa,قِرْفَة *qirfa*,,spice,Wiktionary
4,,Egyptian Arabic,قرفة,ʔerfa,قرفة *ʔerfa*,,spice,Wiktionary
5,,Gulf Arabic,دارسين,dārsīn,دارسين *dārsīn*,,spice,Wiktionary
6,,Hijazi Arabic,قرفة,girfa,قرفة *girfa*,,spice,Wiktionary
7,,Aramaic,,,,,spice,Wiktionary
8,,Jewish Babylonian Aramaic,דַּרְצִינִי,darṣīnī,דַּרְצִינִי *darṣīnī*,,spice,Wiktionary
9,,Armenian,դարչին,darčʿin,դարչին *darčʿin*,,spice,Wiktionary


## Manual work

Now the manual work: Use the generated file `key_generated.xlsx` to fix, amend, append, group, and organize the names, with the addition of other sources and create a `key.xlsx` master list.

Steps:
 1. Manual check transliterations, especially Dhivehi, Burmese.
 2. Analyze and group words/names.
 3. Mark uncertain ones for skipping with 'yes'.

Recommended sources:
* Katzer (needs serious checking)
* WOLD
* Max Planck databases \\\\(CLIC3, etc.)
* others... 

After that, more preprocessing, cleaning, and merging with language data and coordinates.

## Preprocessing

### Language data

In [362]:
# Read in Glottolog data
df_languoid = pd.read_csv('data\\glottolog\\languoid.csv', delimiter=',', encoding="utf-8")

# Extract desired columns
df_languoid = df_languoid[['id', 'name', 'iso639P3code', 'level', 'family_id', 'country_ids', 'latitude', 'longitude']]

# Rename columns
df_languoid.columns = ['id', 'name', 'iso', 'level', 'family', 'country', 'lat', 'lon']

# Add source
df_languoid = df_languoid.assign(source='glottolog')
df_languoid

Unnamed: 0,id,name,iso,level,family,country,lat,lon,source
0,3adt1234,3Ad-Tekles,,dialect,afro1255,,,,glottolog
1,aala1237,Aalawa,,dialect,aust1307,,,,glottolog
2,aant1238,Aantantara,,dialect,nucl1709,,,,glottolog
3,aari1238,Aari-Gayil,aiz,family,sout2845,,,,glottolog
4,aari1239,Aari,aiw,language,sout2845,ET,5.95034,36.5721,glottolog
...,...,...,...,...,...,...,...,...,...
26664,zuti1239,Guajajára of Zutiua,,dialect,tupi1275,,,,glottolog
26665,zuwa1238,Zuwadza,,dialect,koia1260,,,,glottolog
26666,zwal1238,Zwall,,dialect,atla1278,,,,glottolog
26667,zyph1238,Zyphe,zyp,language,sino1245,IN MM,22.52400,93.2640,glottolog


In [363]:
# Load in geodataset for macroarea
df_geo = pd.read_csv('data\\glottolog\\languages_and_dialects_geo.csv', delimiter=',', encoding="utf-8")

# Rename columns
df_geo.columns = ['id', 'name', 'iso', 'level', 'macroarea', 'lat', 'lon']

# From
df_geo = df_geo.assign(source='glottolog')
df_geo

Unnamed: 0,id,name,iso,level,macroarea,lat,lon,source
0,3adt1234,3Ad-Tekles,,dialect,Africa,,,glottolog
1,aala1237,Aalawa,,dialect,Papunesia,,,glottolog
2,aant1238,Aantantara,,dialect,Papunesia,,,glottolog
3,aari1239,Aari,aiw,language,Africa,5.95034,36.5721,glottolog
4,aari1240,Aariya,aay,language,Eurasia,,,glottolog
...,...,...,...,...,...,...,...,...
21952,zuwa1238,Zuwadza,,dialect,Papunesia,,,glottolog
21953,zwal1238,Zwall,,dialect,Africa,,,glottolog
21954,zyph1238,Zyphe,zyp,language,Eurasia,22.52400,93.2640,glottolog
21955,zyud1238,Zyuzdin,,dialect,Eurasia,,,glottolog


In [364]:
# Merge
df_languages = pd.merge(df_languoid, df_geo, on=["name", "iso", "level", "id", "lat", "lon", "source"], how='outer')

# Drop duplicates
df_languages.drop_duplicates(subset=['id', 'name'], keep='first', inplace=True, ignore_index=True)

# Reassign
df = df_languages

# Drop sign languages, bookkeeping, unattested, artificial
df = df[df['family'] != 'book1242']
df = df[df['family'] != 'sign1238']
df = df[df['family'] != 'unat1236']
df = df[df['family'] != 'arti1236']

# Manually name families?
df['family'] = [re.sub('afro1255', "Afro-Asiatic", str(x)) for x in df['family']]
df['family'] = [re.sub('atla1278', "Atlantic-Congo", str(x)) for x in df['family']]
df['family'] = [re.sub('aust1305', "Austroasiatic", str(x)) for x in df['family']]
df['family'] = [re.sub('aust1307', "Austronesian", str(x)) for x in df['family']]
df['family'] = [re.sub('drav1251', "Dravidian", str(x)) for x in df['family']]
df['family'] = [re.sub('indo1319', "Indo-European", str(x)) for x in df['family']]
df['family'] = [re.sub('japo1237', "Japonic", str(x)) for x in df['family']]
df['family'] = [re.sub('jara1244', "Jarawa-Onge", str(x)) for x in df['family']]
df['family'] = [re.sub('khoe1240', "Khoe-Kwadi", str(x)) for x in df['family']]
df['family'] = [re.sub('koia1260', "Koiarian", str(x)) for x in df['family']]
df['family'] = [re.sub('kxaa1236', "Kxa", str(x)) for x in df['family']]
df['family'] = [re.sub('lake1255', "Lakes Plain", str(x)) for x in df['family']]
df['family'] = [re.sub('mong1349', "Mongolic-Khitan", str(x)) for x in df['family']]
df['family'] = [re.sub('nucl1709', "Nuclear Trans New Guinea", str(x)) for x in df['family']]
df['family'] = [re.sub('pama1250', "Pama-Nyungan", str(x)) for x in df['family']]
df['family'] = [re.sub('sino1245', "Sino-Tibetan", str(x)) for x in df['family']]
df['family'] = [re.sub('sout2845', "South Omotic", str(x)) for x in df['family']]
df['family'] = [re.sub('tupi1275', "Tupian", str(x)) for x in df['family']]
df['family'] = [re.sub('turk1311', "Turkic", str(x)) for x in df['family']]
df['family'] = [re.sub('tuuu1241', "Tuu", str(x)) for x in df['family']]
df['family'] = [re.sub('ural1272', "Uralic", str(x)) for x in df['family']]
df['family'] = [re.sub('utoa1244', "Uto-Aztecan", str(x)) for x in df['family']]
df['family'] = [re.sub('zapa1251', "Zaparoan", str(x)) for x in df['family']]

# Reorder columns
df = df[['id', 'name', 'iso', 'level', 'family', 'macroarea', 'country', 'lat', 'lon', 'source']]

# Rename name column to language in df_languages
df = df.rename(columns={'name': 'language'})

# Reassign
df_languages = df

### Merge language data and spice data

In [365]:
# Read in dataframe from xlsx
df = pd.read_excel(path + f'{key}.xlsx', sheet_name='wiktionary')

# Extract only desired columns
selectlist = ['skip', 'language', 'term', 'transliteration', 'group']
df = df[selectlist]

# Drop rows with 'skip' == 'yes'
print("Before skipping: ", df.shape[0])
df=df[df['skip'] != "yes"]
print("After skipping: ", df.shape[0])

# Reset index
df = df.reset_index(drop=True)

# Reassign
df_words = df

Before skipping:  203
After skipping:  166


In [366]:
# Change languages to Glottolog name
def glottologize():
    df['language'] = [re.sub("^Albanian$", "Northern Tosk Albanian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Arabic$", "Standard Arabic", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Aramaic$", "Classical Syriac", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Armenian$", "Eastern Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Armenian \(Eastern\)$", "Eastern Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Eastern Armenian$", "Eastern Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Armenian \(Western\)$", "Western Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Western Armenian$", "Western Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Armenian \(Old\)$", "Classical-Middle Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Old Armenian$", "Classical-Middle Armenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Azerbaijani$", "North Azerbaijani", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Bikol Central$", "Coastal-Naga Bikol", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Bosnian$", "Bosnian Standard", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Brunei Malay$", "Brunei", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Croatian$", "Croatian Standard", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Gaelic$", "Scottish Gaelic", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Greek$", "Modern Greek", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Greek \(Ancient\)$", "Ionic-Attic Ancient Greek", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Ancient Greek$", "Ionic-Attic Ancient Greek", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Greenlandic$", "Kalaallisut", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Hakka$", "Hakka Chinese", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Hebrew$", "Modern Hebrew", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Jewish Babylonian Aramaic$", "Jewish Babylonian Aramaic (ca. 200-1200 CE)", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Indonesian$", "Standard Indonesian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Khmer$", "Central Khmer", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Kyrgyz$", "Kirghiz", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Luxembourgish$", "Luxemburgish", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Malagasy$", "Plateau Malagasy", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Malay$", "Standard Malay", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Mandarin$", "Mandarin Chinese", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Manipuri \(Meitei-Lon\)$", "Manipuri", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Minnan$", "Min Nan Chinese", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Mongolian$", "Halh Mongolian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Sumi\)$", "Sumi Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Tangkhul\)$", "North-Central Tangkhul Naga", str(x)) for x in df['language']] # A hypoglot
    df['language'] = [re.sub("^Naga \(Rengma\)$", "Northern Rengma Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Lotha\)$", "Lotha Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Konyak\)$", "Konyak Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Chakhesang-Chokri\)$", "Chokri Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Ao\)$", "Ao Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Naga \(Angami\)$", "Angami Naga", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Norman$", "Anglo-Norman", str(x)) for x in df['language']]
    df['language'] = [re.sub("^North Frisian$", "Northern Frisian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^North Sami$", "North Saami", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Old Church Slavonic$", "Church Slavic", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Old East Slavic$", "Old Russian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Old Javanese$", "Kawi", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Oriya$", "Odia", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Oromo$", "West Central Oromo", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Ossetian$", "Modern Ossetic", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Persian$", "Western Farsi", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Punjabi$", "Western Panjabi", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Panjabi$", "Western Panjabi", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Serbian$", "Serbian Standard", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Serbo-Croatian$", "Serbian-Croatian-Bosnian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Sinhalese$", "Sinhala", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Slovene$", "Slovenian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Uyghur$", "Uighur", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Uzbek$", "Northern Uzbek", str(x)) for x in df['language']]
    df['language'] = [re.sub("^West Frisian$", "Western Frisian", str(x)) for x in df['language']]
    df['language'] = [re.sub("^Yiddish$", "Eastern Yiddish", str(x)) for x in df['language']]

    return df

# Standardize names of languages
glottologize()

Unnamed: 0,skip,language,term,transliteration,group
0,,Afrikaans,kaneel,,canela
1,,Northern Tosk Albanian,kanellë,,canela
2,,Amharic,ቀረፋ,ḳäräfa,qirfa
3,,Standard Arabic,قِرْفَة,qirfa,qirfa
4,,Eastern Armenian,կինամոն,kinamon,kinnamon
...,...,...,...,...,...
161,,Venetian,canela,,canela
162,,Vietnamese,quế,,gui
163,,Volapük,kirfat,,qirfa
164,,Welsh,synamon,,kinnamon


In [367]:
# Merge input and languages
df_merged = pd.merge(df_words, df_languages, on=['language'])
print("Merged:", df_merged.shape)

# Show rows that appear in the df_words but not in the df_merged
df_missing = df_words[~df_words.language.isin(df_merged.language)]
print(df_missing)

# Reassign
df = df_merged.copy()

Merged: (164, 14)
    skip   language    term transliteration     group
35   NaN  Esperanto  cinamo             NaN  kinnamon
163  NaN    Volapük  kirfat             NaN     qirfa


In [368]:
# Sort the dataframe according to the group column
# merged = merged.sort_values(by=['group'])

# #sort by categories, pepper ######## AUTOMATE ########
# df['group'] = pd.Categorical(df['group'], ["pippali", "pigment", "marica", "hujiao", "other"]) # add categorical order here
# df.sort_values("group", inplace = True) # sort according to the categories

# Sort by categories, cinnamon ######## AUTOMATE ########
df['group'] = pd.Categorical(df['group'], ["canela", "kinnamon", "korica", "qirfa", "darchin", "gui", "other"]) # add categorical order here
df.sort_values("group", inplace = True) # sort according to the categories
df

Unnamed: 0,skip,language,term,transliteration,group,id,iso,level,family,macroarea,country,lat,lon,source
0,,Afrikaans,kaneel,,canela,afri1274,afr,language,Indo-European,Africa,BW MZ NA ZA ZM ZW,-22.000000,30.000000,glottolog
35,,Estonian,kaneel,,canela,esto1258,ekk,language,Uralic,Eurasia,EE LV RU,58.550000,25.820000,glottolog
37,,Faroese,kanel,,canela,faro1244,fao,language,Indo-European,Eurasia,DK FO,62.073200,-6.884970,glottolog
38,,Finnish,kaneli,,canela,finn1318,fin,language,Uralic,Eurasia,EE FI NO RU SE,64.762800,25.557700,glottolog
39,,French,cannelle,,canela,stan1290,fra,language,Indo-European,Eurasia,AD BE CA CH FR GF IT LU MC PM TF US,48.000000,2.000000,glottolog
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,,Sanskrit,चोच,coca,other,sans1269,san,language,Indo-European,Eurasia,IN,20.000000,77.000000,glottolog
130,,Shan,ၵႅင်းႁွမ်,shn,other,shan1277,shn,language,taik1256,Eurasia,CN MM TH,21.599500,98.034200,glottolog
131,,Sinhala,කුරුඳු,kuruⁿdu,other,sinh1246,sin,language,Indo-European,Eurasia,LK,8.000000,81.000000,glottolog
103,,Chokri Naga,"chipfweketo, thime",,other,chok1243,nri,language,Sino-Tibetan,Eurasia,IN,25.563600,94.288700,glottolog


In [369]:
# Show rows in the dataframe with missing latitude and longitude
df_missing = df[df['lat'].isna()]
df_missing

Unnamed: 0,skip,language,term,transliteration,group,id,iso,level,family,macroarea,country,lat,lon,source
83,,Low German,Kaneel,,canela,lowg1239,,family,Indo-European,,,,,glottolog
111,,Anglo-Norman,cannelle,,canela,angl1258,xno,dialect,Indo-European,Eurasia,,,,glottolog
120,,Pashto,دالچینی,dolchini,darchin,pash1269,,family,Indo-European,,,,,glottolog
117,,Ottoman Turkish,دارچین,darçın,darchin,otto1234,ota,dialect,Turkic,Eurasia,,,,glottolog
151,,Twi,anoatre dua,,other,twii1234,twi,dialect,Atlantic-Congo,Africa,,,,glottolog
36,,Fante,anoater dua,,other,fant1241,fat,dialect,Atlantic-Congo,Africa,,,,glottolog


In [370]:
# Add coordinates by hand
df.loc[df.language == 'Anglo-Norman', ['lat', 'lon']] = 51, -1
df.loc[df.language == 'Arabic', ['lat', 'lon']] = 27.96, 43.85 # Coordinates of Standard Arabic, OK
df.loc[df.language == 'Armenian', ['lat', 'lon']] = 49.68, 6.15 # Coordinates of hyperglot, Moselle Franconian OK
df.loc[df.language == 'Aramaic', ['lat', 'lon']] = 33.91, 42.19 # Coordinates of Imperial Aramaic (700-300 BCE)
df.loc[df.language == 'Brunei Malay', ['lat', 'lon']] = 4.53, 114.72 #Coordinates of Brunei, OK?
df.loc[df.language == 'Dravidian', ['lat', 'lon']] = 11, 78 # Coordinates of Old Tamil
df.loc[df.language == 'Fante', ['lat', 'lon']] = 6.35, -1.33 # Coordinates of hyperglot Akan, OK
df.loc[df.language == 'Germanic', ['lat', 'lon']] = 48.25, 10.60 #Middle High German
df.loc[df.language == 'Iranian', ['lat', 'lon']] = 39.70, 66.98 # Coordinates of Sogdian, cognate bors
df.loc[df.language == 'Late Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
df.loc[df.language == 'Luxemburgish', ['lat', 'lon']] = 49.68, 6.15 #C oordinates of hyperglot, Moselle Franconian OK
df.loc[df.language == 'Medieval Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
df.loc[df.language == 'Mongolian', ['lat', 'lon']] = 48.32, 106.29 # Coordinates of hypoglot, Halh Mongolian (most spoken dialect) OK
df.loc[df.language == 'New Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
df.loc[df.language == 'Ottoman Turkish', ['lat', 'lon']] = 39.87, 32.87 # Coordinates of Turkish
df.loc[df.language == 'Romance', ['lat', 'lon']] = 44, 12
df.loc[df.language == 'Semitic', ['lat', 'lon']] = 29, 40
df.loc[df.language == 'Slavic', ['lat', 'lon']] = 43.72, 22.84 # Church Slavic???
df.loc[df.language == 'Turkic', ['lat', 'lon']] = 55.49, 47.16 # Coordinates of Chuvash, cognate of bors.
df.loc[df.language == 'Twi', ['lat', 'lon']] = 6.35, -1.33 # Coordinates of hyperglot Akan, OK
df.loc[df.language == 'Volapük', ['lat', 'lon']] = 49, 8.24 # Coordinates of Karlsrühe, Baden OK
df.loc[df.language == 'West Germanic', ['lat', 'lon']] = 48, 9

# Modify names
df.loc[df.language == 'Old English (ca. 450-1100)', ['language']] = 'Old English'
df.loc[df.language == 'Old French (842-ca. 1400)', ['language']] = 'Old French'
df.loc[df.language == 'Jewish Babylonian Aramaic (ca. 200-1200 CE)', ['language']] = 'Jewish Babylonian Aramaic'
# df.loc[df.language == 'Pahlavi', ['language']] = 'Middle Persian'
# Late Middle Indo-Aryan in the new instead of Middle Indo Aryan

# Save
df.to_csv(path + f'{key}.csv')
df

Unnamed: 0,skip,language,term,transliteration,group,id,iso,level,family,macroarea,country,lat,lon,source
0,,Afrikaans,kaneel,,canela,afri1274,afr,language,Indo-European,Africa,BW MZ NA ZA ZM ZW,-22.000000,30.000000,glottolog
35,,Estonian,kaneel,,canela,esto1258,ekk,language,Uralic,Eurasia,EE LV RU,58.550000,25.820000,glottolog
37,,Faroese,kanel,,canela,faro1244,fao,language,Indo-European,Eurasia,DK FO,62.073200,-6.884970,glottolog
38,,Finnish,kaneli,,canela,finn1318,fin,language,Uralic,Eurasia,EE FI NO RU SE,64.762800,25.557700,glottolog
39,,French,cannelle,,canela,stan1290,fra,language,Indo-European,Eurasia,AD BE CA CH FR GF IT LU MC PM TF US,48.000000,2.000000,glottolog
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,,Sanskrit,चोच,coca,other,sans1269,san,language,Indo-European,Eurasia,IN,20.000000,77.000000,glottolog
130,,Shan,ၵႅင်းႁွမ်,shn,other,shan1277,shn,language,taik1256,Eurasia,CN MM TH,21.599500,98.034200,glottolog
131,,Sinhala,කුරුඳු,kuruⁿdu,other,sinh1246,sin,language,Indo-European,Eurasia,LK,8.000000,81.000000,glottolog
103,,Chokri Naga,"chipfweketo, thime",,other,chok1243,nri,language,Sino-Tibetan,Eurasia,IN,25.563600,94.288700,glottolog


## Plotly

In [371]:
# Prism colors in hex code (without gray)
prism = ['#5f4690', '#1d6996', '#38a6a5', '#0f8554', '#73af48', '#edae08', '#e17909', '#cc503e', '#94346e', '#6f4070']

# Transparency from black
transparent = "rgba(0,0,0,0)"
three_quarters_transparent = 'rgba(0,0,0,0.75)'
half_transparent = 'rgba(0,0,0,0.5)'
quarter_transparent = 'rgba(0,0,0,0.25)'
tenth_transparent = 'rgba(0,0,0,0.1)'

# Visual variables for map (dark mode)
font_size = 12
font_color = "#dddddd"
font_family = "Noto Sans"
marker_symbol= 'circle'
marker_size = 12
max_marker_size = 32
edge_color = transparent
edge_size = 1
opacity = 0.75
line_width = 4
water = "#202020"
grid_color = "#282828"
land = "#303030"
lines = "#383838"
copyright_color = "#404040"
background_color = transparent
legend_background_color = quarter_transparent
color_scheme = prism

# # Visual variables for map (light mode)
# font_size = 12
# font_color = "#000000"
# font_family = "Noto Serif"
# marker_symbol= 'circle'
# marker_size = 12
# max_marker_size = 32
# edge_color = transparent
# edge_size = 1
# opacity = 0.75
# line_width = 4
# water = "#ffffff"
# grid_color = "#f7f7f7"
# land = "#ffffff"
# lines = "#777777"
# copyright_color = "#f7f7f7"
# background_color = transparent
# legend_background_color = quarter_transparent
# color_scheme = prism

# Orthographic globe layout
ortho_traces = dict(
    textposition = 'top right', # middle left, bottom center, etc.
    textfont = dict(size=font_size, color=font_color, family=font_family),
    hovertemplate=
        "<b>%{customdata[0]}</b><br><br>" +
        "Species: <i>%{customdata[1]}</i><br>" +
        "Family: <i>%{customdata[2]}</i><br>" +
        "Region of origin: %{customdata[3]}<br>" +
        "Arabic: %{customdata[4]} <i>%{customdata[5]}</i><br>" +
        "Chinese: %{customdata[6]} <i>%{customdata[7]}</i><br>" +
        # "Spreadability: %{customdata[7]:.2f}<br>" +
        "<extra></extra>",
    marker = dict(
        symbol = marker_symbol,
        size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size
        )
    )
)

ortho_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', # orthographic, natural earth
        projection_scale = 1,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

# "Document size" for pdfs
document_size = dict(width = 600, height=600)

# Copyright
cr = dict(
    name="copyright",
    text="© Gábor Parti, 2024",
    font=dict(color=copyright_color, size=8, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [cr]) # to call

In [372]:
# Load
df = pd.read_csv(path + f'{key}.csv', header =[0], delimiter=',', encoding="utf-8")
df

Unnamed: 0.1,Unnamed: 0,skip,language,term,transliteration,group,id,iso,level,family,macroarea,country,lat,lon,source
0,0,,Afrikaans,kaneel,,canela,afri1274,afr,language,Indo-European,Africa,BW MZ NA ZA ZM ZW,-22.000000,30.000000,glottolog
1,35,,Estonian,kaneel,,canela,esto1258,ekk,language,Uralic,Eurasia,EE LV RU,58.550000,25.820000,glottolog
2,37,,Faroese,kanel,,canela,faro1244,fao,language,Indo-European,Eurasia,DK FO,62.073200,-6.884970,glottolog
3,38,,Finnish,kaneli,,canela,finn1318,fin,language,Uralic,Eurasia,EE FI NO RU SE,64.762800,25.557700,glottolog
4,39,,French,cannelle,,canela,stan1290,fra,language,Indo-European,Eurasia,AD BE CA CH FR GF IT LU MC PM TF US,48.000000,2.000000,glottolog
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,127,,Sanskrit,चोच,coca,other,sans1269,san,language,Indo-European,Eurasia,IN,20.000000,77.000000,glottolog
160,130,,Shan,ၵႅင်းႁွမ်,shn,other,shan1277,shn,language,taik1256,Eurasia,CN MM TH,21.599500,98.034200,glottolog
161,131,,Sinhala,කුරුඳු,kuruⁿdu,other,sinh1246,sin,language,Indo-European,Eurasia,LK,8.000000,81.000000,glottolog
162,103,,Chokri Naga,"chipfweketo, thime",,other,chok1243,nri,language,Sino-Tibetan,Eurasia,IN,25.563600,94.288700,glottolog


In [373]:
# Repel data points if they overlap (check for language duplicates  and remove them)    

In [374]:
# Change NaN in transliteration to ''
df['transliteration'] = df['transliteration'].fillna('')

# Create an item colum that contains the term and the transliteration with the latter in italics
df['item'] = df['term'] + ' <i>' + df['transliteration'] + '</i>'

# Create text for annotation label
df['text'] = df['term'] + '<br>' + df['transliteration'].astype(str) + '<br>Language: ' + df['language'] + '<br>Family: ' + df['family']

# df['text'] = [re.sub(r"<br>nan<br>", "<br>", str(x)) for x in df['text']]
# df['term'] = [re.sub(r"\u200e", "", str(x)) for x in df['term']] #removes right to left mark
# df['term'] = [re.sub(r" *$", "", str(x)) for x in df['term']] #!

In [375]:
# Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='group',
    color_discrete_sequence=prism,
    opacity = opacity,
    hover_name='item',
    hover_data={'term':False, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
    labels={"group": "category"}
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                #   hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
# fig.update_layout(document_size)
# fig.update_layout(title_text="Distribution of words for " + key + " in a few languages")
fig.update_layout(annotations = [cr])

fig.show()

# Write
filename = "distribution_" + key
fig.write_image(path + filename + ".pdf", engine="kaleido")
fig.write_image(path + filename + ".png", scale=3)
fig.write_html(path + filename + ".html")
fig.write_json(path + filename + ".json", validate=True, pretty=True)

In [376]:
# ################################################################################
# #### HTML
# fig = px.scatter_geo(df,
#     lat='lat', 
#     lon='lon',
#     text='item',
#     color='group',
#     color_discrete_sequence=[p1,p2,p3,p4,p5,p6,p11],
#     opacity = opacity,
#     hover_name='item',
#     hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
#     # labels={"group": "category"}
#     )

# fig.update_traces(mode = "markers",
#                   textposition='middle right',
#                   textfont={"color": font_color, "size": font_size, "family": font_family},
#                   marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
#                 #   hovertemplate=None
#                   )

# fig.update_layout(ortho_layout)
# fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
# fig.update_layout(title_text="Distribution of words for " + key + " in a few languages")
# fig.update_layout(annotations = [cr])
# fig.add_layout_image(logo)

# fig.show()

# # write
# filename = "distribution_" + key
# fig.write_html(path_out_html + filename + ".html")
# fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

# End