## Wiktionary extractor

In [132]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# response = requests.get("https://en.wiktionary.org/wiki/cinnamon")
# soup = BeautifulSoup(response.text, 'html.parser')
# soup

In [133]:
# Variables
key = 'cinnamon'  # Wiktionary page title; also used in the output file names
# Output directory, kept as a string with a trailing separator because later
# cells build file names with `path + ...`. Joining with an empty last
# component appends the platform's own separator instead of a hard-coded
# backslash, so the notebook also works outside Windows.
path = os.path.join('data', 'wiktionary', '')

In [134]:
# Define the URL
url = f"https://en.wiktionary.org/wiki/{key}"

# Get the HTML content of the page. The timeout keeps the cell from hanging on
# a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error page silently produce an empty table below.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Get the translations
# ids of the <div> elements holding the translation lists; these two are
# hard-coded for the senses of the cinnamon entry.
ids = ['Translations-spice', "Translations-''Cinnamomum_verum''"] ###

# Create a list of tuples containing the sense, language, and translation
translations = []
for sense_id in ids:  # not named `id`, which would shadow the builtin
    translations_div = soup.find('div', {'id': sense_id})
    if translations_div is None:
        # This sense is not present on the page
        continue
    for li in translations_div.find_all('li'):
        lang_and_trans = li.get_text().split(':')
        # Keep only items of the form "Language: translation(s)"; items
        # containing no colon or more than one colon are skipped.
        if len(lang_and_trans) == 2:
            lang, trans = (part.strip() for part in lang_and_trans)
            translations.append((sense_id, lang, trans))

# Create a DataFrame from the list of tuples
df = pd.DataFrame(translations, columns=['sense', 'language', 'term'])

# Strip the "Translations-" prefix so that only the sense name remains
df['sense'] = [re.sub("Translations-", "", str(x)) for x in df['sense']]

# # Split the rows with multiple translations into separate rows
# df['term'] = df['term'].str.split(', ')
# df = df.explode('term')

# A comma (plus any whitespace after it) that is not followed by a ")" before
# another parenthesis appears, i.e. a comma outside a parenthesised group.
_COMMA_OUTSIDE_PARENS = re.compile(r',\s*(?![^()]*\))')


def split_not_in_parentheses(s):
    """Split *s* on commas, ignoring commas that sit inside parentheses.

    Whitespace directly after a separating comma is consumed with it.
    Returns the list of pieces; a string without such a comma comes back
    as a one-element list.
    """
    return _COMMA_OUTSIDE_PARENS.split(s)

# Turn every 'term' cell into a list of individual translations, then give
# each list element a row of its own (the other columns are repeated)
df = df.assign(term=df['term'].map(split_not_in_parentheses)).explode('term')

# Save the DataFrame to a CSV file
# df.to_csv(path + 'raw_translations_for_check.csv', index=False)

# Keep the first occurrence of each (language, term) pair and renumber the rows
df = df.drop_duplicates(subset=['language', 'term'], keep='first')
df = df.reset_index(drop=True)

# Senses in order of first appearance
list_of_senses = df['sense'].unique().tolist()

# Filter duplicates among senses and get rid of B name if A is found. # Change subset to be more strict e.g. subset=['language', 'term']
# NOTE(review): with subset=['language', 'term'] the drop_duplicates call above
# has already removed every repeated pair, so nothing is flagged here; the
# filter only has an effect if its subset differs from the one used above.
if len(list_of_senses) > 1:
    repeated = df.duplicated(subset=['language', 'term'], keep=False)
    # True only for repeated rows belonging to the second sense; rows that are
    # not repeated get NaN through index alignment
    df['duplicate'] = df.loc[repeated, 'sense'] == list_of_senses[1]
    df = df.drop(index=df.index[df['duplicate'].eq(True)])
    df = df.drop(columns=['duplicate'])
    df = df.reset_index(drop=True)

# Cleaning
# Whitespace tidy-up: collapse runs of whitespace, drop the space in front of a
# comma, and remove a leading/trailing whitespace character.
_WHITESPACE_RULES = [
    (r'\s+', " "),
    (r' ,', ","),
    (r'^\s', ""),
    (r'\s$', ""),
]

# Short markers that are stripped when they follow a translation
_MARKERS = ['m', 'f', 'n', 'c', 'pl']

# Ordered (pattern, replacement) pairs. Each rule is applied to the result of
# the previous one, so the order is significant.
_TERM_RULES = (
    [('\xa0', " ")]  # non-breaking space -> plain space
    # Markers between words, directly before a comma, and at the end of the term
    + [(rf' {marker} ', " ") for marker in _MARKERS]
    + [(rf' {marker},', ",") for marker in _MARKERS]
    + [(rf' {marker}$', "") for marker in _MARKERS]
    + [
        # Parenthesised codes and editorial notes
        (r'\(bcl\)', ""),
        (r'\(nds\)', ""),
        (r'\(scn\)', ""),
        (r'\(ast\)', ""),
        (r'\(Föhr-Amrum\)', ""),
        (r'\s?\(\w\w\)', ""),  # any remaining two-character code in parentheses
        (r'\(please verify\)', ""),
    ]
    + _WHITESPACE_RULES
    + [
        # Other
        (r"\(taraškievica\)", ""),
        (r"class 9/10", ""),
        # Change ( and ) to * and *
        (r'\(', "*"),
        (r'\)', "*"),
        # Remove ⁧ and ⁩ from term
        (r'⁧', ""),
        (r'⁩', ""),
    ]
    # The removals made after the first tidy-up can leave fresh trailing or
    # doubled whitespace in the term, so the tidy-up is repeated at the end.
    + _WHITESPACE_RULES
)


def _clean_term(value):
    """Return *value* as a string with every cleaning rule applied in order."""
    text = str(value)
    for pattern, replacement in _TERM_RULES:
        text = re.sub(pattern, replacement, text)
    return text


df['term'] = [_clean_term(x) for x in df['term']]

# drop NA
# Remove the placeholder rows. The .copy() makes the result an independent
# frame, so the column assignments below do not operate on a slice of the
# previous frame (which triggers pandas' SettingWithCopyWarning).
df = df[df.term != "please add this translation if you can"].copy()

# If there is a star in the term column, split the term into the 'word' and
# 'transliteration' columns: 'word' is the text before the first star,
# 'transliteration' the text between the first and second star (None when the
# term contains no star)
term_parts = df['term'].apply(lambda x: x.split('*'))
df['word'] = term_parts.apply(lambda parts: parts[0].strip())
df['transliteration'] = term_parts.apply(
    lambda parts: parts[1].strip() if len(parts) > 1 else None
)

# Reorder by alphabetizing the language column
df = df.sort_values('language').reset_index(drop=True)

# Create source
df['source'] = 'Wiktionary'
# Empty columns, present so they appear in the saved files
df['group'] = ''
df['skip'] = ''

# reorder
df = df[['skip', 'sense', 'language', 'term', 'group', 'word', 'transliteration', 'source']]

# Save the DataFrame to a CSV file and to an Excel workbook (without the index)
df.to_csv(path + f'{key}_gen.csv', index=False)
df.to_excel(path + f'{key}_gen.xlsx', sheet_name='wiktionary', index=False)

# Print
df.head(60)

Unnamed: 0,skip,sense,language,term,group,word,transliteration,source
0,,spice,Afrikaans,kaneel,,kaneel,,Wiktionary
1,,''Cinnamomum_verum'',Afrikaans,kaneelboom,,kaneelboom,,Wiktionary
2,,spice,Albanian,kanellë,,kanellë,,Wiktionary
3,,spice,Amharic,ቀረፋ *ḳäräfa*,,ቀረፋ,ḳäräfa,Wiktionary
4,,''Cinnamomum_verum'',Armenian,դարչին *darčʿin*,,դարչին,darčʿin,Wiktionary
5,,''Cinnamomum_verum'',Assamese,দালচেনি *dalseni*,,দালচেনি,dalseni,Wiktionary
6,,spice,Azerbaijani,darçın,,darçın,,Wiktionary
7,,''Cinnamomum_verum'',Basque,kanela,,kanela,,Wiktionary
8,,spice,Belarusian,кары́ца *karýca*,,кары́ца,karýca,Wiktionary
9,,spice,Belarusian,цынамо́н *cynamón*,,цынамо́н,cynamón,Wiktionary


# End

In [129]:
# Change languages to Glottolog name
# Wiktionary language name -> Glottolog name. Names are matched exactly, so
# entries containing parentheses such as "Armenian (Eastern)" are matched
# literally (as regex patterns the parentheses would act as groups and the
# literal name would never match).
_GLOTTOLOG_NAMES = {
    "Arabic": "Standard Arabic",
    "Armenian": "Eastern Armenian",
    "Armenian (Eastern)": "Eastern Armenian",
    "Armenian (Western)": "Western Armenian",
    "Bikol Central": "Coastal-Naga Bikol",
    "Bosnian": "Bosnian Standard",
    "Croatian": "Croatian Standard",
    "Gaelic": "Scottish Gaelic",
    "Greek": "Modern Greek",
    "Greenlandic": "Kalaallisut",
    "Hebrew": "Modern Hebrew",
    "Kyrgyz": "Kirghiz",
    "Luxembourgish": "Luxemburgish",
    "Mandarin": "Mandarin Chinese",
    "Manipuri (Meitei-Lon)": "Manipuri",
    "Naga (Sumi)": "Sumi Naga",
    "Naga (Tangkhul)": "North-Central Tangkhul Naga",  # A hypoglot
    "Naga (Rengma)": "Northern Rengma Naga",
    "Naga (Lotha)": "Lotha Naga",
    "Naga (Konyak)": "Konyak Naga",
    "Naga (Chakhesang-Chokri)": "Chokri Naga",
    "Naga (Ao)": "Ao Naga",
    "Naga (Angami)": "Angami Naga",
    "Norman": "Anglo-Norman",
    "North Frisian": "Northern Frisian",
    "North Sami": "North Saami",
    "Old Armenian": "Classical-Middle Armenian",
    "Old Church Slavonic": "Church Slavic",
    "Old East Slavic": "Old Russian",
    "Old Javanese": "Kawi",
    "Ossetian": "Modern Ossetic",
    "Punjabi": "Eastern Panjabi",
    "Serbian": "Serbian Standard",
    "Serbo-Croatian": "Serbian-Croatian-Bosnian",
    "West Frisian": "Western Frisian",
}


def glottologize(frame=None):
    """Rename the entries of the 'language' column to their Glottolog names.

    Args:
        frame: DataFrame with a 'language' column. When omitted, the
            module-level ``df`` is used, as in the original no-argument call.

    Returns:
        The same DataFrame object, with 'language' rewritten in place. Every
        value is converted to ``str``; names without an entry in the table
        are otherwise left unchanged.
    """
    if frame is None:
        frame = df
    frame['language'] = [
        _GLOTTOLOG_NAMES.get(str(x), str(x)) for x in frame['language']
    ]
    return frame

# Rename the languages of the module-level df; as the last expression of the
# cell, the returned frame is also displayed.
glottologize()

Unnamed: 0,sense,language,term,group,word,transliteration,source
0,spice,Afrikaans,kaneel,,kaneel,,Wiktionary
1,''Cinnamomum_verum'',Afrikaans,kaneelboom,,kaneelboom,,Wiktionary
2,spice,Albanian,kanellë,,kanellë,,Wiktionary
3,spice,Amharic,ቀረፋ *ḳäräfa*,,ቀረፋ,ḳäräfa,Wiktionary
4,''Cinnamomum_verum'',Eastern Armenian,դարչին *darčʿin*,,դարչին,darčʿin,Wiktionary
...,...,...,...,...,...,...,...
157,spice,Volapük,kirfat,,kirfat,,Wiktionary
158,''Cinnamomum_verum'',Volapük,kirfatep,,kirfatep,,Wiktionary
159,spice,Welsh,synamon,,synamon,,Wiktionary
160,spice,Yiddish,צימערינג *tsimering*,,צימערינג,tsimering,Wiktionary


## Manual steps

Now the manual work: use the generated file `{key}_gen.xlsx` (e.g. `cinnamon_gen.xlsx`) to fix, amend, append, group, and organize the names, adding other sources, and create a `{key}.xlsx` master list.

Steps:
 1. Manually check transliterations, especially Dhivehi.
 2. Analyze and group words/names.
 3. Mark uncertain ones for skipping with 'yes'.

Recommended sources:
* Katzer (needs serious checking)
* WOLD
* Max Planck databases (CLIC3, etc.)
* others... 

After that, more preprocessing, cleaning, and merging with language data and coordinates.

In [130]:
# Load the 'wiktionary' sheet of the generated workbook
df = pd.read_excel(f'{path}{key}_gen.xlsx', sheet_name='wiktionary')

# Restrict the frame to the columns used from here on; a column missing from
# the sheet raises a KeyError.
# NOTE(review): 'item' is not among the columns written by the generation cell
# - presumably it is added during the manual steps; confirm.
selectlist = ['skip', 'language', 'term', 'transliteration', 'item', 'group']
df = df.loc[:, selectlist]

# Discard the rows marked with skip == 'yes', reporting the row count before
# and after the filter
print("Before skipping: ", len(df))
df = df.loc[df['skip'] != "yes"]
print("After skipping: ", len(df))

# # drop columns manually
# df.drop(columns=['skip', 'literal', 'explanation', 'IPA', 'source zotero', 'notes', 'type', 'katzer', 'katzer tr', 'checked', 'reference', 'link'], inplace=True)

Unnamed: 0,sense,language,term,group,word,transliteration,source
0,spice,Afrikaans,kaneel,,kaneel,,Wiktionary
1,''Cinnamomum_verum'',Afrikaans,kaneelboom,,kaneelboom,,Wiktionary
2,spice,Albanian,kanellë,,kanellë,,Wiktionary
3,spice,Amharic,ቀረፋ *ḳäräfa*,,ቀረፋ,ḳäräfa,Wiktionary
4,''Cinnamomum_verum'',Armenian,դարչին *darčʿin*,,դարչին,darčʿin,Wiktionary
...,...,...,...,...,...,...,...
157,spice,Volapük,kirfat,,kirfat,,Wiktionary
158,''Cinnamomum_verum'',Volapük,kirfatep,,kirfatep,,Wiktionary
159,spice,Welsh,synamon,,synamon,,Wiktionary
160,spice,Yiddish,צימערינג *tsimering*,,צימערינג,tsimering,Wiktionary


In [135]:
# Load the Glottolog languoid table, using its first column as the index.
# The path is assembled with os.path.join so it does not depend on Windows
# backslash separators.
languages = pd.read_csv(
    os.path.join('data', 'glottolog', 'languoid.csv'),
    header=[0],
    delimiter=',',
    encoding="utf-8",
    index_col=[0],
)
languages


Unnamed: 0_level_0,family_id,parent_id,name,bookkeeping,level,latitude,longitude,iso639P3code,description,markup_description,child_family_count,child_language_count,child_dialect_count,country_ids
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3adt1234,afro1255,nort3292,3Ad-Tekles,False,dialect,,,,,,0,0,0,
aala1237,aust1307,ramo1244,Aalawa,False,dialect,,,,,,0,0,0,
aant1238,nucl1709,nort2920,Aantantara,False,dialect,,,,,,0,0,0,
aari1238,sout2845,ahkk1235,Aari-Gayil,False,family,,,aiz,,,0,2,0,
aari1239,sout2845,aari1238,Aari,False,language,5.95034,36.5721,aiw,,,0,0,0,ET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuti1239,tupi1275,guaj1255,Guajajára of Zutiua,False,dialect,,,,,,0,0,0,
zuwa1238,koia1260,omie1241,Zuwadza,False,dialect,,,,,,0,0,0,
zwal1238,atla1278,shal1242,Zwall,False,dialect,,,,,,0,0,0,
zyph1238,sino1245,nucl1757,Zyphe,False,language,22.52400,93.2640,zyp,,,0,0,2,IN MM


In [None]:
# merge input and languages
# The languoid table shown by the previous cell has no 'language' column; its
# language names are in 'name'. The input's 'language' column is therefore
# joined against 'name' (an inner join, so unmatched input rows are dropped).
# NOTE(review): df_input is not defined in the visible cells - confirm where
# it is set before running this cell.
df = pd.merge(df_input, languages, left_on='language', right_on='name')
print("Merged:", df.shape)

# FIX BELOW HERE

In [None]:

# # merge input and languages
# df = pd.merge(df_input, languages, on=['language'])
# print("Merged:", df.shape)

# #drop duplicates
# df.drop_duplicates(subset=['language', 'term'], keep='first', inplace=True, ignore_index=True)
# print("Dropping duplicates:", df.shape)

# multilingual = df
# multilingual

# #check missing ones
# temp = pd.merge(df_input, multilingual, how='outer', suffixes=('','_y'), indicator=True)
# missing = temp[temp['_merge']=='left_only'][df_input.columns]
# print("The following terms and languages have failed to load:")
# print(missing)

# # df = df.dropna() #OPERATIVE ONLY
# # df = df.fillna('x')

# #sort by categories, cinnamon ######## AUTOMATE ########
# df['group'] = pd.Categorical(df['group'], ["canela", "kinnamon", "korica", "qirfa", "darchin", "gui", "other"]) # add categorical order here
# df.sort_values("group", inplace = True) # sort according to the categories

# # #sort by categories, pepper ######## AUTOMATE ########
# # df['group'] = pd.Categorical(df['group'], ["pippali", "pigment", "marica", "hujiao", "other"]) # add categorical order here
# # df.sort_values("group", inplace = True) # sort according to the categories

# # create text for annotation label
# df['text'] = df['term'] + '<br>' + df['transliteration'].astype(str) + '<br>Language: ' + df['language'] + '<br>Family: ' + df['family']
# df['text'] = [re.sub(r"<br>nan<br>", "<br>", str(x)) for x in df['text']]

# df['term'] = [re.sub(r"\u200e", "", str(x)) for x in df['term']] #removes right to left mark
# df['term'] = [re.sub(r" *$", "", str(x)) for x in df['term']] #!

# # reindex?

# # save
# df.to_csv(path_in_wiktionary + "multilingual/" + key +'.csv')
# df