# Spice Core

This interactive Python notebook helps to process and analyze the database on spices on three levels: the spices, their names, and their etymologies. I use this code to create content for (a) my Ph.D. thesis at PolyU (a LaTeX project), (b) my Spices and Spice Terminology website (a Hugo project), and (c) any other related projects (papers, datasets to publish, etymology tools, future book). The pipeline outputs different files and formats for different purposes, e.g. `tex` and `pdf` files for the thesis, `md` and `json` files for the website. This notebook is intended to be a the main working source code of all related projects, used for development and testing.


# Setup

In [26]:
# # Installations
# !pip install geopy
# !pip install plotly
# !pip install kaleido
# !pip install openpyxl
# !pip install py-markdown-table
# !pip install mdutils
# !pip install pdf2image


In [27]:
# Import dependencies
import pandas as pd
import regex as re
import numpy as np
import glob
from collections import defaultdict
import subprocess
from openpyxl import Workbook
import csv

import plotly
import plotly.io as pio
from plotly.io import write_image, write_json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import kaleido

# from mdutils.mdutils import MdUtils
# from mdutils import Html

# from IPython.display import display

# if using Google Colab:
# from google.colab import files
# %load_ext google.colab.data_table

print("Dependencies have been loaded with no problems.")

Dependencies have been loaded with no problems.


## Functions

In [28]:
################################################################################
# Roman numerals
def roman(num: int) -> str:

    chlist = "VXLCDM"
    rev = [int(ch) for ch in reversed(str(num))]
    chlist = ["I"] + [chlist[i % len(chlist)] + "\u0304" * (i // len(chlist))
                    for i in range(0, len(rev) * 2)]

    def period(p: int, ten: str, five: str, one: str) -> str:
        if p == 9:
            return one + ten
        elif p >= 5:
            return five + one * (p - 5)
        elif p == 4:
            return one + five
        else:
            return one * p

    return "".join(reversed([period(rev[i], chlist[i * 2 + 2], chlist[i * 2 + 1], chlist[i * 2])
                            for i in range(0, len(rev))]))

def century(year):
    return (year) // 100 + 1 

print(roman(17))

################################################################################
# List files in a folder (with path)
def list_files(dir):                                                                                                  
    r = []                                                                                                            
    subdirs = [x[0] for x in os.walk(dir)]                                                                            
    for subdir in subdirs:                                                                                            
        files = os.walk(subdir).__next__()[2]                                                                             
        if (len(files) > 0):                                                                                          
            for file in files:                                                                                        
                r.append(os.path.join(subdir, file))                                                                         
    return r

################################################################################
# Move files between folders
import os, shutil, pathlib, fnmatch

def move_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.move(os.path.join(src, f), os.path.join(dst, f))

def copy_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.copy(os.path.join(src, f), os.path.join(dst, f))

################################################################################
# Convert PDFs
from pdf2image import convert_from_path

def convert_pdf_to_png(file):
    name = str(file)
    name = re.sub(".*(?=/)", "", name)
    name = re.sub("\..*", "", name)
    pages = convert_from_path(file, 0)
    for page in pages:
        page.save(path + name + ".png", 'PNG')

################################################################################
# # Geopy
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API (Requires internet connection)
# geolocator = Nominatim(user_agent="MyApp")

# def coordinates(place):
#     location = geolocator.geocode(place)
#     lat, lon = location.latitude, location.longitude
#     coord = [lat, lon]
#     return coord

# print(coordinates("Hong Kong"))

################################################################################

# (?!) - negative lookahead
# (?=) - positive lookahead
# (?<=) - positive lookbehind
# (?<!) - negative lookbehind

# (?>) - atomic group


XVII


In [29]:
# # Example 1
# location = geolocator.geocode("Budapest")

# print("The latitude of the location is: ", location.latitude)
# print("The longitude of the location is: ", location.longitude)

In [30]:
# Example 2
# from tkinter import *
# from geopy.geocoders import Nominatim

# # Create an instance of tkinter frame
# win = Tk()

# # Define geometry of the window
# win.geometry("700x350")

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="MyApp")

# # Latitude & Longitude input
# coordinates = "17.3850 , 78.4867"

# location = geolocator.reverse(coordinates)

# address = location.raw['address']

# # Traverse the data
# city = address.get('city', '')
# state = address.get('state', '')
# country = address.get('country', '')

# # Create a Label widget
# label1=Label(text="Given Latitude and Longitude: " + coordinates, font=("Calibri", 24, "bold"))
# label1.pack(pady=20)

# label2=Label(text="The city is: " + city, font=("Calibri", 24, "bold"))
# label2.pack(pady=20)

# label3=Label(text="The state is: " + state, font=("Calibri", 24, "bold"))
# label3.pack(pady=20)

# label4=Label(text="The country is: " + country, font=("Calibri", 24, "bold"))
# label4.pack(pady=20)

# win.mainloop()

## Variables

In [31]:
# Color scheme

# # print(px.colors.qualitative.Prism) #to see the color codes
# # https://plotly.com/python/discrete-color/ # See the colormaps here, or construct a sequence like:
# # color_discrete_sequence=["red", "green", "blue", "goldenrod", "magenta"]
# # Using Sequential Scales as Discrete Sequences: color_discrete_sequence= px.colors.sequential.Plasma_r,

prism = px.colors.qualitative.Prism
print(px.colors.qualitative.Prism)
# antique = px.colors.qualitative.Antique
# bold = px.colors.qualitative.Bold
# pastel = px.colors.qualitative.Pastel
# safe = px.colors.qualitative.Safe
# vivid = px.colors.qualitative.Vivid

# #Prism colors:
ppurple = '#5f4690' #'rgb(95, 70, 144)'
pblue = '#1d6996' #'rgb(29, 105, 150)'
pturquiose = '#38a6a5' #'rgb(56, 166, 165)'
pgreen = '#0f8554' #'rgb(15, 133, 84)'
plime = '#73af48' #'rgb(115, 175, 72)'
pyellow = '#edad08' #'rgb(237, 173, 8)'
porange = '#e17c05' #'rgb(225, 124, 5)'
pred = '#cc503e' #'rgb(204, 80, 62)'
pmagenta = '#94346e' #'rgb(148, 52, 110)'
pfuchsia = '#6f4070' #'rgb(111, 64, 112)'
pgray = '#808080' #'rgb(128,128,128)', originally: #'rgb(102, 102, 102)'
pblack = '#000000'

p1=ppurple
p2=pblue
p3=pturquiose
p4=pgreen
p5=plime
p6=pyellow
p7=porange
p8=pred 
p9=pmagenta
p10=pfuchsia
p11=pgray
p12=pblack

prism_extended = [p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12]

# Other colors
PolyU='#8f1329'  	# (143,19,41) 
PolyUcomp = '#138f79'
MidnightBlue='#006795'
# https://en.wikibooks.org/wiki/LaTeX/Colors

# Maps ################

# Variables (light) ########
transparent = 'rgba(255,255,255,0)'
half_transparent = 'rgba(255,255,255,0.5)'
quarter_transparent = 'rgba(255,255,255,0.25)'
tenth_transparent = 'rgba(255,255,255,0.1)'

marker_symbol= 'circle'
marker_size = 14
edge_size = 1
edge_color = 'white'
opacity = 0.75
line_width = 4
font_size = 14
font_family = "Serif"
font_color = "black"
# water = '#ebedef'
# grid_color = '#d6dbdf'
# land = '#aeb6bf'
# lines = '#85929e'
# copyright_color = '#5d6d7e'
water = 'white'
grid_color = '#EDEDED'
land = 'gainsboro'
lines = 'gainsboro'
copyright_color = 'lightgray'
background_color = transparent
legend_background_color = tenth_transparent

# # Variables (dark) ########
# transparent = 'rgba(0,0,0,0)'
# half_transparent = 'rgba(0,0,0,0.5)'
# quarter_transparent = 'rgba(0,0,0,0.25)'
# tenth_transparent = 'rgba(0,0,0,0.1)'

# marker_symbol= 'circle'
# marker_size = 14
# edge_size = 1
# edge_color = 'black'
# opacity = 0.75
# line_width = 4
# font_size = 14
# font_family = 'Serif'
# font_color = 'black'
# water = '#212f3c'
# grid_color = '#283747'
# land = ' #2e4053'
# lines = '#34495e'
# copyright_color = '#5d6d7e'
# background_color = transparent
# legend_background_color = tenth_transparent

['rgb(95, 70, 144)', 'rgb(29, 105, 150)', 'rgb(56, 166, 165)', 'rgb(15, 133, 84)', 'rgb(115, 175, 72)', 'rgb(237, 173, 8)', 'rgb(225, 124, 5)', 'rgb(204, 80, 62)', 'rgb(148, 52, 110)', 'rgb(111, 64, 112)', 'rgb(102, 102, 102)']


## Plotly builder

Premade layouts, predefined templates, and reusable code snippets

In [32]:
################################################################################
# Plotly templates

# built-in templates
# for template in ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]:
#     fig = px.scatter(df, x="year", y="id", color="class",
#                     #  log_x=True, size_max=60,
#                      template=template)

pio.templates["odd"] = go.layout.Template(layout=go.Layout(colorway=[p1,p3,p5,p7,p9]))
pio.templates["even"] = go.layout.Template(layout=go.Layout(colorway=[p2,p4,p6,p8,p10]))
pio.templates["prism"] = go.layout.Template(layout=go.Layout(colorway=prism))
pio.templates["top5"] = go.layout.Template(layout=go.Layout(colorway=[p1,p2,p3,p4,p5,p11,p6,p7,p8,p9,p10]))
pio.templates["trilingual"] = go.layout.Template(layout=go.Layout(colorway=[p2,p4,p6]))
pio.templates["yesmaybeno"] = go.layout.Template(layout=go.Layout(colorway=[p2,p11,p8]))
pio.templates["yesnomaybe"] = go.layout.Template(layout=go.Layout(colorway=[p2,p6,p11]))

# pio.templates.default = 'prism'

################################################################################

# # Orthographic globe layout
ortho_layout = dict(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, #50 is large or 110 small
        scope='world',
        projection_type = 'orthographic',
        projection_scale = 1,
        projection_rotation = {'lat': 15, 'lon': 30, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines, 
        showframe=True, framewidth = 1, framecolor = lines, 
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land, 
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=False, subunitwidth = 1, subunitcolor = lines, 
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=half_transparent,  
                font=dict(color=font_color, size=font_size, family=font_family), 
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white", 
                    font_size=font_size, 
                    font_family=font_family),
    )

# title_font_family=font_family,

# "Document size" for pdfs in thesis
document_size = dict(width = 600, height=600)

# Draft
# draft_template = go.layout.Template()
# draft_template.layout.annotations = [
#     dict(
#         name="draft watermark",
#         text="DRAFT",
#         textangle=-30,
#         opacity=0.1,
#         font=dict(color="black", size=120),
#         xref="paper",
#         yref="paper",
#         x=0.5,
#         y=0.5,
#         showarrow=False,)]
    
# fig.update_layout(template=draft_template)

# Copyright
cr = go.layout.Template()
cr.layout.annotations = [
    dict(
        name="copyright",
        text="© Gábor Parti, 2022",
        font=dict(color=copyright_color, size=8, family="font_family"),
        opacity=0.9,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0,
        # xanchor="right", 
        # yanchor="bottom", 
        # align="center",
        showarrow=False,)]

# fig.update_layout(template=cr) # to call

# Adding images
logo = dict(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", 
    yanchor="bottom", 
)

# fig.add_layout_image(logo) # to call

################################################################################

from plotly.validators.scatter.marker import SymbolValidator

raw_symbols = SymbolValidator().values
namestems = []
namevariants = []
symbols = []
for i in range(0,len(raw_symbols),3):
    name = raw_symbols[i+2]
    symbols.append(raw_symbols[i])
    namestems.append(name.replace("-open", "").replace("-dot", ""))
    namevariants.append(name[len(namestems[-1]):])

fig = go.Figure(go.Scatter(mode="markers", x=namevariants, y=namestems, marker_symbol=symbols,
                           marker_line_color="midnightblue", marker_color="lightskyblue",
                           marker_line_width=2, marker_size=15,
                           hovertemplate="name: %{y}%{x}<br>number: %{marker.symbol}<extra></extra>"))
fig.update_layout(title="Mouse over symbols for name & number!",
                  xaxis_range=[-1,4], yaxis_range=[len(set(namestems)),-1],
                  margin=dict(b=0,r=0), xaxis_side="top", height=1400, width=400)
# plotly.offline.plot(fig, filename='C:/plotlyplots/lifeExp.html')
fig.show()

## Paths

In [33]:
# Path
path_in = "data/"
path_out_html = "output/html/"
path_out_json = "output/json/"
path_out_md = "output/md/"
path_out_pdf = "output/pdf/"
path_out_png = "output/png/"
path_out_tex = "output/tex/"

destination_html = "website/static/plotly/"
destination_json = "../partigabor.github.io/static/plotly/"
destination_md = "website/content/book/spices"
destination_pdf = "thesis/imgs/plots/"
destination_png = "../partigabor.github.io/static/images/"
destination_tex = "thesis/envs/"

# Data

## Create spice data

In [28]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

# Select ones to include
df_spices = df_spices.loc[(df_spices['include'] == "in")]

# If for symposium, use this
# df_spices = df_spices.loc[df_spices['sym'] == 'yes'] # include ones to include

# List the list of ids
list_of_spices = df_spices['id'].tolist()
list_of_spices.sort()
print("List of spices:", list_of_spices, "\n", len(list_of_spices), "spices in total.")

List of spices: ['Sichuan pepper', 'allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'long pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star anise', 'turmeric', 'vanilla'] 
 24 spices in total.


In [29]:
# df = df_spices.copy()

# for index, row in df.iterrows():
#     m = re.search(r"([\u4e00-\u9fff]+)", str(row['item']))
#     if m is not None:
#         df.loc[index,'zh'] = m.group(1)
#         df.loc[index,'pinyin'] = pinyin.get(str(m.group(1)))
#         jp = jyutping.get(str(m.group(1)))
#         jp = ['? ' if c is None else c for c in jp] #None
#         jp = [item for sublist in jp for item in sublist] #flatlist
#         df.loc[index,'jyutping'] = ''.join(jp)  

In [30]:
# import pinyin, jyutping

# chinese = ['多香果', '茴芹', '阿魏', '葛縷子', '荳蔻/豆蔻', '肉桂', '辣椒', '錫蘭肉桂', '丁香', '芫荽', '孜然', '蒔蘿', '茴香', '胡蘆巴', '薑', '蓽撥', '肉豆蔻皮', '肉豆蔻', '胡椒', '藏紅花', '花椒', '八角', '薑黃', '香草']
# transcribed = []
# for word in chinese:
#     jp = jyutping.get(word)
#     transcribed.append(jp)

# transcribed

### Spice box for LaTeX

In [31]:
# Choose a spice as key
key = "allspice"

################################################################################

# The following code will create a spice box for the key, to be used in LaTeX. 
print("Started the generation of '" + key + "' as spicebox...")

df_local = df_spices.loc[df_spices['id'] == key]
# df_local.fillna('', inplace=True)

slash = r" \\"
nl = "\n"
dot = r". "

headname = df_local['id'].iloc[0].capitalize()
headname = re.sub("_", " ", headname)

id = df_local['id'].iloc[0]
head = r"\textsc{" + headname + r"} \hfill "
powo = r"\href{" + str(df_local['POWO'].iloc[0]) + r"}{POWO} \\" 
# tpl = r" ⸱ \href{" + df_local['TPL'].iloc[0] + r"}{TPL} \\"
header = head + powo + nl

en = r"\textbf{English:} \textit{" + df_local['English'].iloc[0] +  r"}"
if pd.notna(df_local['En alt'].iloc[0]):
  en += r"; \textit{" + str(df_local['En alt'].iloc[0]) + r"}"
en += dot + nl

ar = r"\textbf{Arabic:} {\arabicfont{" + df_local['Arabic'].iloc[0] + r"}} \textit{" + df_local['Ar transliteration'].iloc[0] + r"}"
if pd.notna(df_local['Ar literal'].iloc[0]):
  ar += r" [" + str(df_local['Ar literal'].iloc[0]) + r"]"
if pd.notna(df_local['Ar alt'].iloc[0]):
  ar += r"; " + str(df_local['Ar alt'].iloc[0])
ar += dot + nl

zh = r"\textbf{Chinese:} {\tradchinesefont{" + df_local['Chinese'].iloc[0] + r"}} \textit{" + df_local['pinyin'].iloc[0] + r"}"
if pd.notna(df_local['Ch literal'].iloc[0]):
  zh += r" [" + str(df_local['Ch literal'].iloc[0]) + r"]"
if pd.notna(df_local['Ch alt'].iloc[0]):
  zh += r"; " + str(df_local['Ch alt'].iloc[0])
zh += dot + nl

hu = r"\textbf{Hungarian:} \textit{" + df_local['Hungarian'].iloc[0] +  r"}"
if pd.notna(df_local['Hu literal'].iloc[0]):
  hu += r" [" + str(df_local['Hu literal'].iloc[0]) + r"]"
if pd.notna(df_local['Hu alt'].iloc[0]):
  hu += r"; " + str(df_local['Hu alt'].iloc[0])
hu += dot

names = en + ar + zh + hu + slash + nl

rule = r"\noindent{\color{black}\rule[0.5ex]{\linewidth}{.5pt}}" + nl
tab_begin = "\\begin{tabular}{@{}p{0.25\\linewidth}@{}p{0.75\\linewidth}@{}}\n"
tab_end = "\end{tabular}\n"

plant = r"Plant species: & \taxonn{" + str(df_local['species'].iloc[0]) + r"}{" + str(df_local['species by'].iloc[0]) + "}"
plant_syn = r" (syn. \taxonn{" + str(df_local['species syn'].iloc[0]) + "}{" + str(df_local['species syn by'].iloc[0]) + "})"
plant_alt = r"; \textit{" + str(df_local['species alt'].iloc[0]) + "}"
if pd.notna(df_local['species syn'].iloc[0]):
  plant = plant + plant_syn
else:
  plant = plant
if pd.notna(df_local['species alt'].iloc[0]):
  plant = plant + plant_alt
else:
  plant = plant

plant += slash + nl

fam = r"Family: & \textit{" + str(df_local['family'].iloc[0]) + r"}" + slash + nl
region = r"Region of origin: & " + str(df_local['region of origin'].iloc[0]) + slash + nl
cult = r"Cultivated in: & " + str(df_local['cultivation'].iloc[0]) + slash + nl
part = r"Part used: & " + str(df_local['part used'].iloc[0]) + slash + nl
color = r"Color: & " + str(df_local['color'].iloc[0]) + slash + nl
env_begin = r"\begin{spice}" + "\label{spice:" + id + "}" + nl
env_end = r"\end{spice}"

box = env_begin + header + names + rule + tab_begin + plant + fam + part + region + cult + color + tab_end + env_end

# Save the spicebox as a standalone tex file
filename = re.sub(" ", "_", id)
filename = filename.lower()
file = open(path_out_tex + "{}.tex".format("spicebox_" + filename), "w", encoding='utf-8')
file.write(box)
file.close()
print("Spicebox '" + str(id) + "' as a tex file was created.")

box

Started the generation of 'allspice' as spicebox...
Spicebox 'allspice' as a tex file was created.


'\\begin{spice}\\label{spice:allspice}\n\\textsc{Allspice} \\hfill \\href{https://powo.science.kew.org/taxon/196799-2}{POWO} \\\\\n\\textbf{English:} \\textit{allspice}; \\textit{pimento; Jamaica pepper}. \n\\textbf{Arabic:} {\\arabicfont{فلفل إفرنجي}} \\textit{fulful ifranjī} [Frankish pepper]. \n\\textbf{Chinese:} {\\tradchinesefont{多香果}} \\textit{duōxiāngguǒ} [many-spice-fruit]. \n\\textbf{Hungarian:} \\textit{szegfűbors} [clove-pepper]; \\textit{jamaicaibors} [Jamaican-pepper]; \\textit{amomummag} [amomum-seed].  \\\\\n\\noindent{\\color{black}\\rule[0.5ex]{\\linewidth}{.5pt}}\n\\begin{tabular}{@{}p{0.25\\linewidth}@{}p{0.75\\linewidth}@{}}\nPlant species: & \\taxonn{Pimenta dioica}{(L.) Merr.} (syn. \\taxonn{Pimenta officinalis}{Lindl.}) \\\\\nFamily: & \\textit{Myrtaceae} \\\\\nPart used: & unripe fruit; leaf \\\\\nRegion of origin: & S. Mexico to C. America; Caribbean \\\\\nCultivated in: & Jamaica; Mexico; Honduras \\\\\nColor: & dark brown \\\\\n\\end{tabular}\n\\end{spice}'

In [32]:
def spicebox(key):

  # The following function will create a spice box for the key, to be used in LaTeX.
  print("Started the generation of '" + key + "' as spicebox...")

  df_local = df_spices.loc[df_spices['id'] == key]
  # df_local.fillna('', inplace=True)

  slash = r" \\"
  nl = "\n"
  dot = r". "

  headname = df_local['id'].iloc[0].capitalize()
  headname = re.sub("_", " ", headname)

  id = df_local['id'].iloc[0]
  head = r"\textsc{" + headname + r"} \hfill "
  powo = r"\href{" + str(df_local['POWO'].iloc[0]) + r"}{POWO} \\" 
  # tpl = r" ⸱ \href{" + df_local['TPL'].iloc[0] + r"}{TPL} \\"
  header = head + powo + nl

  en = r"\textbf{English:} \textit{" + df_local['English'].iloc[0] +  r"}"
  if pd.notna(df_local['En alt'].iloc[0]):
    en += r"; \textit{" + str(df_local['En alt'].iloc[0]) + r"}"
  en += dot + nl

  ar = r"\textbf{Arabic:} {\arabicfont{" + df_local['Arabic'].iloc[0] + r"}} \textit{" + df_local['Ar transliteration'].iloc[0] + r"}"
  if pd.notna(df_local['Ar literal'].iloc[0]):
    ar += r" [" + str(df_local['Ar literal'].iloc[0]) + r"]"
  if pd.notna(df_local['Ar alt'].iloc[0]):
    ar += r"; " + str(df_local['Ar alt'].iloc[0])
  ar += dot + nl

  zh = r"\textbf{Chinese:} {\tradchinesefont{" + df_local['Chinese'].iloc[0] + r"}} \textit{" + df_local['pinyin'].iloc[0] + r"}"
  if pd.notna(df_local['Ch literal'].iloc[0]):
    zh += r" [" + str(df_local['Ch literal'].iloc[0]) + r"]"
  if pd.notna(df_local['Ch alt'].iloc[0]):
    zh += r"; " + str(df_local['Ch alt'].iloc[0])
  zh += dot + nl

  hu = r"\textbf{Hungarian:} \textit{" + df_local['Hungarian'].iloc[0] +  r"}"
  if pd.notna(df_local['Hu literal'].iloc[0]):
    hu += r" [" + str(df_local['Hu literal'].iloc[0]) + r"]"
  if pd.notna(df_local['Hu alt'].iloc[0]):
    hu += r"; " + str(df_local['Hu alt'].iloc[0])
  hu += dot

  names = en + ar + zh + hu + slash + nl

  rule = r"\noindent{\color{black}\rule[0.5ex]{\linewidth}{.5pt}}" + nl
  tab_begin = "\\begin{tabular}{@{}p{0.25\\linewidth}@{}p{0.75\\linewidth}@{}}\n"
  tab_end = "\end{tabular}\n"

  plant = r"Plant species: & \taxonn{" + str(df_local['species'].iloc[0]) + r"}{" + str(df_local['species by'].iloc[0]) + "}"
  plant_syn = r" (syn. \taxonn{" + str(df_local['species syn'].iloc[0]) + "}{" + str(df_local['species syn by'].iloc[0]) + "})"
  plant_alt = r"; \textit{" + str(df_local['species alt'].iloc[0]) + "}"
  if pd.notna(df_local['species syn'].iloc[0]):
    plant = plant + plant_syn
  else:
    plant = plant
  if pd.notna(df_local['species alt'].iloc[0]):
    plant = plant + plant_alt
  else:
    plant = plant

  plant += slash + nl

  fam = r"Family: & \textit{" + str(df_local['family'].iloc[0]) + r"}" + slash + nl
  region = r"Region of origin: & " + str(df_local['region of origin'].iloc[0]) + slash + nl
  cult = r"Cultivated in: & " + str(df_local['cultivation'].iloc[0]) + slash + nl
  part = r"part used: & " + str(df_local['part used'].iloc[0]) + slash + nl
  color = r"Color: & " + str(df_local['color'].iloc[0]) + slash + nl
  env_begin = r"\begin{spice}" + "\label{spice:" + id + "}" + nl
  env_end = r"\end{spice}"

  box = env_begin + header + names + rule + tab_begin + plant + fam + part + region + cult + color + tab_end + env_end

  # Save the spicebox as a standalone tex file
  filename = re.sub(" ", "_", id)
  filename = filename.lower()
  file = open(path_out_tex + "{}.tex".format("spicebox_" + filename), "w", encoding='utf-8')
  file.write(box)
  file.close()
  print("Spicebox '" + str(id) + "' as a tex file was created.")

  return box

In [33]:
spicebox('allspice')

Started the generation of 'allspice' as spicebox...
Spicebox 'allspice' as a tex file was created.


'\\begin{spice}\\label{spice:allspice}\n\\textsc{Allspice} \\hfill \\href{https://powo.science.kew.org/taxon/196799-2}{POWO} \\\\\n\\textbf{English:} \\textit{allspice}; \\textit{pimento; Jamaica pepper}. \n\\textbf{Arabic:} {\\arabicfont{فلفل إفرنجي}} \\textit{fulful ifranjī} [Frankish pepper]. \n\\textbf{Chinese:} {\\tradchinesefont{多香果}} \\textit{duōxiāngguǒ} [many-spice-fruit]. \n\\textbf{Hungarian:} \\textit{szegfűbors} [clove-pepper]; \\textit{jamaicaibors} [Jamaican-pepper]; \\textit{amomummag} [amomum-seed].  \\\\\n\\noindent{\\color{black}\\rule[0.5ex]{\\linewidth}{.5pt}}\n\\begin{tabular}{@{}p{0.25\\linewidth}@{}p{0.75\\linewidth}@{}}\nPlant species: & \\taxonn{Pimenta dioica}{(L.) Merr.} (syn. \\taxonn{Pimenta officinalis}{Lindl.}) \\\\\nFamily: & \\textit{Myrtaceae} \\\\\npart used: & unripe fruit; leaf \\\\\nRegion of origin: & S. Mexico to C. America; Caribbean \\\\\nCultivated in: & Jamaica; Mexico; Honduras \\\\\nColor: & dark brown \\\\\n\\end{tabular}\n\\end{spice}'

In [34]:
# Loop all the dataset
def spiceboxes():
  for key in list_of_spices:
    spicebox(key)
  print('Done')

spiceboxes()

Started the generation of 'Sichuan pepper' as spicebox...
Spicebox 'Sichuan pepper' as a tex file was created.
Started the generation of 'allspice' as spicebox...
Spicebox 'allspice' as a tex file was created.
Started the generation of 'anise' as spicebox...
Spicebox 'anise' as a tex file was created.
Started the generation of 'asafoetida' as spicebox...
Spicebox 'asafoetida' as a tex file was created.
Started the generation of 'caraway' as spicebox...
Spicebox 'caraway' as a tex file was created.
Started the generation of 'cardamom' as spicebox...
Spicebox 'cardamom' as a tex file was created.
Started the generation of 'cassia' as spicebox...
Spicebox 'cassia' as a tex file was created.
Started the generation of 'chile' as spicebox...
Spicebox 'chile' as a tex file was created.
Started the generation of 'cinnamon' as spicebox...
Spicebox 'cinnamon' as a tex file was created.
Started the generation of 'clove' as spicebox...
Spicebox 'clove' as a tex file was created.
Started the genera

## Create name data

In [34]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")

# Write the dataframe object into csv file
read_file.to_csv(path_in+"names.csv", index = None, header=True)

# Load in dataset of names
df_names=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

# fillna
df_names.fillna('', inplace=True)

# Turn reference from Zotero to human
for index, row in df_names.iterrows():
    m = re.search(r"(\w+)_(\w*-?\w+)_(\d+)", str(row['dictionary']))
    if m is not None:
        author = re.sub("_", " ", m.group(1))
        df_names.loc[index,'source human'] = author.title() + ", " + m.group(3)
    else:
        df_names.loc[index,'source human'] = str(row['dictionary'])

for index, row in df_names.iterrows():
    m = re.search(r"(\w+)_(\w*-?\w+)_(\d+)", str(row['source zotero']))
    if row['source human'] == '':
        if m is not None:
            author = re.sub("_", " ", m.group(1))
            df_names.loc[index,'source human'] = author.title() + ", " + m.group(3)
        else:
            df_names.loc[index,'source human'] = str(row['source zotero'])
    else:
        continue

# Write the dataframe object into csv file
df_names.to_csv(path_in+"names.csv", index = None, header=True)

# Save names
wb = Workbook()
ws = wb.active
with open(path_in + "names.csv", 'r', encoding="utf-8") as f:
    for row in csv.reader(f):
        ws.append(row)
wb.save(path_in + 'names.xlsx')

### Tables with names

In [35]:
# A list of keys to loop all dataset:
# keylist = ['allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fenugreek', 'fennel', 'ginger', 'long pepper', 'nutmeg', 'pepper', 'saffron', 'Sichuan pepper', 'star anise', 'turmeric', 'vanilla']
keylist = list_of_spices

key = "allspice"
# key2 = "cassia"

In [37]:
# 3 separate tables with sources
names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
# names = df_names.loc[df_names['conventionalized'] == 'yes'] # exclude those not in a dictionary

names.fillna('', inplace=True)

# Changes
names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
names.sort_values("status", inplace = True) # sort according to the categories
names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing

#Lowercase
names['dictionary'] = names['dictionary'].str.lower()
names['source zotero'] = names['source zotero'].str.lower()

# Only keep first dictionary item
names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True)

# If source not '', add round latex cite command
names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" 
names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}"

# names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''
names.loc[names['source zotero'] == '','source zotero'] = names['dictionary'] # Fill column with other if ''

# Datasets
names_en = names.loc[names['language'] == 'English']
names_en = names_en.loc[(names_en['id']==key)]
if 'key2' in locals():
  names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
sorted = names_en['status'].astype(str).argsort()
pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
names_en.reset_index(inplace=True, drop=True)

names_ar = names.loc[names['language'] == 'Arabic']
names_ar = names_ar.loc[(names_ar['id']==key)]
if 'key2' in locals():
  names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
sorted = names_ar['status'].astype(str).argsort()
pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
names_ar.reset_index(inplace=True, drop=True)

names_zh = names.loc[names['language'] == 'Chinese']
names_zh = names_zh.loc[(names_zh['id']==key)]
if 'key2' in locals():
  names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
sorted = names_zh['status'].astype(str).argsort()
pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
names_zh.reset_index(inplace=True, drop=True)

if 'key2' in locals():
  keys = key + " and " + key2
else:
  keys = key

spp_column_width = "0.15"
src_column_width = "0.15"

#English
table_en = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_en += "{@{}l" # no.
table_en += ">{\itshape \small}l" # species
# table_en += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_en += "L" # name
# table_en += ">{\\raggedleft\\arraybackslash \small}p{" + src_column_width + "\\textwidth}"
table_en += ">{\small}l" # source
# table_en += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_en += "@{}}\n\\toprule\n" # ----------------------------------------------
table_en += "\\textbf{\#}" # no.
table_en += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_en += " & \multicolumn{1}{l}{\\textbf{Name}}" # name
# table_en += " & \\textbf{Status}"  # status
table_en += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_en += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_en.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    line += "\t& " + str(row['species']) # species
    line += "\t& " + str(row['term']) # name
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"

  table_en = table_en + line + "\n"
caption = "\caption{Various names for " + keys + " in English.}\n"
label = "\label{table:names_" + key + "_en}\n"
table_en += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"



#Arabic
table_ar = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_ar += "{@{}l" # no.
table_ar += ">{\itshape \small}l" # species
# table_ar += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_ar += "r" # script
table_ar += ">{\itshape}l" # name
table_ar += "L" # gloss
table_ar += ">{\small}l" # source
# table_ar += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_ar += "@{}}\n\\toprule\n" # ----------------------------------------------
table_ar += "\\textbf{\#}" # no.
table_ar += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_ar += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
table_ar += " & \multicolumn{1}{l}{\\textbf{Tr.}}" # name
table_ar += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
table_ar += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_ar += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_ar.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{" + str(row['script']) + "}" # script
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    line += "\t& " + str(row['species']) # species
    line += "\t& " + str(row['script']) # script
    line += "\t& " + str(row['term']) # name
    line += "\t& " + str(row['literal']) # lit
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"

  table_ar = table_ar + line + "\n"
caption = "\caption{Various names for " + keys + " in Arabic.}\n"
label = "\label{table:names_" + key + "_ar}\n"
table_ar += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

#Chinese
table_zh = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_zh += "{@{}l" # no.
table_zh += ">{\itshape \small}l" # species
# table_zh += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_zh += "l" # script
table_zh += ">{\itshape}l" # name
table_zh += "L" # gloss
table_zh += ">{\small}l" # source
# table_zh += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_zh += "@{}}\n\\toprule\n" # ----------------------------------------------
table_zh += "\\textbf{\#}" # no.
table_zh += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_zh += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
table_zh += " & \multicolumn{1}{l}{\\textbf{Tr.}}" # name
table_zh += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
table_zh += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_zh += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_zh.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{\\tradchinesefont{" + str(row['script']) + "}}" # script
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    line += "\t& " + str(row['species']) # species
    line += "\t& \\tradchinesefont{" + str(row['script']) + "}" # script
    line += "\t& " + str(row['term']) # name
    line += "\t& " + str(row['literal']) # lit
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"
  table_zh = table_zh + line + "\n"
  
keys = re.sub('_', '', keys)
caption = "\caption{Various names for " + keys + " in Chinese.}\n"
label = "\label{table:names_" + key + "_zh}\n"
table_zh += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

# All
tables = table_en + table_ar + table_zh

# Write
filename = re.sub(" ", "_", key)
filename = filename.lower()

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_en"), "w", encoding='utf-8')
f.write(table_en)
f.close()
print("Table for names of '" + str(key) + "' in English as a tex file was created.")

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_ar"), "w", encoding='utf-8')
f.write(table_ar)
f.close()
print("Table for names of '" + str(key) + "' in Arabic as a tex file was created.")

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_zh"), "w", encoding='utf-8')
f.write(table_zh)
f.close()
print("Table for names of '" + str(key) + "' in Chinese as a tex file was created.")

Table for names of 'allspice' in English as a tex file was created.
Table for names of 'allspice' in Arabic as a tex file was created.
Table for names of 'allspice' in Chinese as a tex file was created.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

In [38]:
# 3 separate tables without species 
names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
# names = df_names.loc[df_names['conventionalized'] == 'yes'] # exclude those not in a dictionary

names.fillna('', inplace=True)

# Changes
names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
names.sort_values("status", inplace = True) # sort according to the categories
names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing

#Lowercase
names['dictionary'] = names['dictionary'].str.lower()
names['source zotero'] = names['source zotero'].str.lower()

# Only keep first dictionary item
names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True)

# If source not '', add round latex cite command
names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" 
names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}"

# names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''
names.loc[names['source zotero'] == '','source zotero'] = names['dictionary'] # Fill column with other if ''

# Datasets
names_en = names.loc[names['language'] == 'English']
names_en = names_en.loc[(names_en['id']==key)]
if 'key2' in locals():
  names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
sorted = names_en['status'].astype(str).argsort()
pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
names_en.reset_index(inplace=True, drop=True)

names_ar = names.loc[names['language'] == 'Arabic']
names_ar = names_ar.loc[(names_ar['id']==key)]
if 'key2' in locals():
  names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
sorted = names_ar['status'].astype(str).argsort()
pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
names_ar.reset_index(inplace=True, drop=True)

names_zh = names.loc[names['language'] == 'Chinese']
names_zh = names_zh.loc[(names_zh['id']==key)]
if 'key2' in locals():
  names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
sorted = names_zh['status'].astype(str).argsort()
pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
names_zh.reset_index(inplace=True, drop=True)

if 'key2' in locals():
  keys = key + " and " + key2
else:
  keys = key

spp_column_width = "0.15"
src_column_width = "0.15"

#English
table_en = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_en += "{@{}l" # no.
# table_en += ">{\itshape \small}l" # species
# table_en += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_en += "L" # name
# table_en += ">{\\raggedleft\\arraybackslash \small}p{" + src_column_width + "\\textwidth}"
table_en += ">{\small}l" # source
# table_en += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_en += "@{}}\n\\toprule\n" # ----------------------------------------------
table_en += "\\textbf{\#}" # no.
# table_en += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_en += " & \multicolumn{1}{l}{\\textbf{Name}}" # name
# table_en += " & \\textbf{Status}"  # status
table_en += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_en += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_en.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    # line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    # line += "\t& " + str(row['species']) # species
    line += "\t& " + str(row['term']) # name
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"
  table_en = table_en + line + "\n"

keys = re.sub('_', '', keys)
caption = "\caption{Various names for " + keys + " in English.}\n"
key = re.sub(" ", "_", key)
label = "\label{table:names_" + key + "_en}\n"
table_en += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"


#Arabic
table_ar = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_ar += "{@{}l" # no.
# table_ar += ">{\itshape \small}l" # species
# table_ar += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_ar += "r" # script
table_ar += ">{\itshape}l" # name
table_ar += "L" # gloss
table_ar += ">{\small}l" # source
# table_ar += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_ar += "@{}}\n\\toprule\n" # ----------------------------------------------
table_ar += "\\textbf{\#}" # no.
# table_ar += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_ar += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
table_ar += " & \multicolumn{1}{l}{\\textbf{Transliteration}}" # name
table_ar += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
table_ar += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_ar += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_ar.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    # line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{" + str(row['script']) + "}" # script
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    # line += "\t& " + str(row['species']) # species
    line += "\t& " + str(row['script']) # script
    line += "\t& " + str(row['term']) # name
    line += "\t& " + str(row['literal']) # lit
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"

  table_ar = table_ar + line + "\n"
caption = "\caption{Various names for " + keys + " in Arabic.}\n"
label = "\label{table:names_" + key + "_ar}\n"
table_ar += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

#Chinese
table_zh = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
# ------------------------------------------------------------------------------
table_zh += "{@{}l" # no.
# table_zh += ">{\itshape \small}l" # species
# table_zh += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
table_zh += "l" # script
table_zh += ">{\itshape}l" # name
table_zh += "L" # gloss
table_zh += ">{\small}l" # source
# table_zh += ">{\small}p{" + src_column_width + "\\textwidth}" # source
table_zh += "@{}}\n\\toprule\n" # ----------------------------------------------
table_zh += "\\textbf{\#}" # no.
# table_zh += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
table_zh += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
table_zh += " & \multicolumn{1}{l}{\\textbf{Transliteration}}" # name
table_zh += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
table_zh += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
table_zh += " \\\\\n\midrule\n" #-----------------------------------------------

for index, row in names_zh.iterrows():
  if row['status'] == 'default':
    line = "\\textbf{" + str(index+1) + "}" # no.
    # line += "\t& \\textbf{" + str(row['species']) + "}" # species
    line += "\t& \\textbf{\\tradchinesefont{" + str(row['script']) + "}}" # script
    line += "\t& \\textbf{" + str(row['term']) + "}" # name
    line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
    # line += "\t& \\textbf{" + str(row['status']) + "}" # status
    line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
  else:
    line = str(index+1)
    # line += "\t& " + str(row['species']) # species
    line += "\t& \\tradchinesefont{" + str(row['script']) + "}" # script
    line += "\t& " + str(row['term']) # name
    line += "\t& " + str(row['literal']) # lit
    # line += "\t& " + str(row['status']) # status
    line += "\t& " + str(row['source zotero']) # reference
  line += " \\\\"
  table_zh = table_zh + line + "\n"
  
keys = re.sub('_', '', keys)
caption = "\caption{Various names for " + keys + " in Chinese.}\n"
label = "\label{table:names_" + key + "_zh}\n"
table_zh += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

# All
tables = table_en + table_ar + table_zh


# Write
filename = re.sub(" ", "_", key)
filename = filename.lower()

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_en"), "w", encoding='utf-8')
f.write(table_en)
f.close()
print("Table for names of '" + str(key) + "' in English as a tex file was created.")

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_ar"), "w", encoding='utf-8')
f.write(table_ar)
f.close()
print("Table for names of '" + str(key) + "' in Arabic as a tex file was created.")

f = open(path_out_tex + "{}.tex".format("names_" + filename + "_zh"), "w", encoding='utf-8')
f.write(table_zh)
f.close()
print("Table for names of '" + str(key) + "' in Chinese as a tex file was created.")

Table for names of 'allspice' in English as a tex file was created.
Table for names of 'allspice' in Arabic as a tex file was created.
Table for names of 'allspice' in Chinese as a tex file was created.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

In [39]:
def nametables(key):
  names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
  # names = df_names.loc[df_names['conventionalized'] == 'yes'] # exclude those not in a dictionary

  names.fillna('', inplace=True)

  # Changes
  names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
  names.sort_values("status", inplace = True) # sort according to the categories
  names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing

  #Lowercase
  names['dictionary'] = names['dictionary'].str.lower()
  names['source zotero'] = names['source zotero'].str.lower()

  # Only keep first dictionary item
  names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True)

  # If source not '', add round latex cite command
  names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" 
  names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}"

  # names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
  names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''
  names.loc[names['source zotero'] == '','source zotero'] = names['dictionary'] # Fill column with other if ''

  # Datasets
  names_en = names.loc[names['language'] == 'English']
  names_en = names_en.loc[(names_en['id']==key)]
  if 'key2' in locals():
    names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
  sorted = names_en['status'].astype(str).argsort()
  pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
  names_en.reset_index(inplace=True, drop=True)

  names_ar = names.loc[names['language'] == 'Arabic']
  names_ar = names_ar.loc[(names_ar['id']==key)]
  if 'key2' in locals():
    names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
  sorted = names_ar['status'].astype(str).argsort()
  pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
  names_ar.reset_index(inplace=True, drop=True)

  names_zh = names.loc[names['language'] == 'Chinese']
  names_zh = names_zh.loc[(names_zh['id']==key)]
  if 'key2' in locals():
    names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
  sorted = names_zh['status'].astype(str).argsort()
  pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
  names_zh.reset_index(inplace=True, drop=True)

  if 'key2' in locals():
    keys = key + " and " + key2
  else:
    keys = key

  spp_column_width = "0.15"
  src_column_width = "0.15"

  #English
  table_en = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
  # ------------------------------------------------------------------------------
  table_en += "{@{}l" # no.
  table_en += ">{\itshape \small}l" # species
  # table_en += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
  table_en += "L" # name
  # table_en += ">{\\raggedleft\\arraybackslash \small}p{" + src_column_width + "\\textwidth}"
  table_en += ">{\small}l" # source
  # table_en += ">{\small}p{" + src_column_width + "\\textwidth}" # source
  table_en += "@{}}\n\\toprule\n" # ----------------------------------------------
  table_en += "\\textbf{\#}" # no.
  table_en += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
  table_en += " & \multicolumn{1}{l}{\\textbf{Name}}" # name
  # table_en += " & \\textbf{Status}"  # status
  table_en += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
  table_en += " \\\\\n\midrule\n" #-----------------------------------------------

  for index, row in names_en.iterrows():
    if row['status'] == 'default':
      line = "\\textbf{" + str(index+1) + "}" # no.
      line += "\t& \\textbf{" + str(row['species']) + "}" # species
      line += "\t& \\textbf{" + str(row['term']) + "}" # name
      # line += "\t& \\textbf{" + str(row['status']) + "}" # status
      line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
    else:
      line = str(index+1)
      line += "\t& " + str(row['species']) # species
      line += "\t& " + str(row['term']) # name
      # line += "\t& " + str(row['status']) # status
      line += "\t& " + str(row['source zotero']) # reference
    line += " \\\\"
    table_en = table_en + line + "\n"
  
  keys = re.sub('_', '', keys)
  caption = "\caption{Various names for " + keys + " in English.}\n"
  key = re.sub(" ", "_", key)
  label = "\label{table:names_" + key + "_en}\n"
  table_en += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"



  #Arabic
  table_ar = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
  # ------------------------------------------------------------------------------
  table_ar += "{@{}l" # no.
  table_ar += ">{\itshape \small}l" # species
  # table_ar += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
  table_ar += "r" # script
  table_ar += ">{\itshape}l" # name
  table_ar += "L" # gloss
  table_ar += ">{\small}l" # source
  # table_ar += ">{\small}p{" + src_column_width + "\\textwidth}" # source
  table_ar += "@{}}\n\\toprule\n" # ----------------------------------------------
  table_ar += "\\textbf{\#}" # no.
  table_ar += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
  table_ar += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
  table_ar += " & \multicolumn{1}{l}{\\textbf{Tr.}}" # name
  table_ar += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
  table_ar += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
  table_ar += " \\\\\n\midrule\n" #-----------------------------------------------

  for index, row in names_ar.iterrows():
    if row['status'] == 'default':
      line = "\\textbf{" + str(index+1) + "}" # no.
      line += "\t& \\textbf{" + str(row['species']) + "}" # species
      line += "\t& \\textbf{" + str(row['script']) + "}" # script
      line += "\t& \\textbf{" + str(row['term']) + "}" # name
      line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
      # line += "\t& \\textbf{" + str(row['status']) + "}" # status
      line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
    else:
      line = str(index+1)
      line += "\t& " + str(row['species']) # species
      line += "\t& " + str(row['script']) # script
      line += "\t& " + str(row['term']) # name
      line += "\t& " + str(row['literal']) # lit
      # line += "\t& " + str(row['status']) # status
      line += "\t& " + str(row['source zotero']) # reference
    line += " \\\\"

    table_ar = table_ar + line + "\n"
  caption = "\caption{Various names for " + keys + " in Arabic.}\n"
  label = "\label{table:names_" + key + "_ar}\n"
  table_ar += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

  #Chinese
  table_zh = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
  # ------------------------------------------------------------------------------
  table_zh += "{@{}l" # no.
  table_zh += ">{\itshape \small}l" # species
  # table_zh += ">{\itshape \small}p{" + spp_column_width + "\\textwidth}" # species
  table_zh += "l" # script
  table_zh += ">{\itshape}l" # name
  table_zh += "L" # gloss
  table_zh += ">{\small}l" # source
  # table_zh += ">{\small}p{" + src_column_width + "\\textwidth}" # source
  table_zh += "@{}}\n\\toprule\n" # ----------------------------------------------
  table_zh += "\\textbf{\#}" # no.
  table_zh += " & \multicolumn{1}{l}{\\textbf{Species}}" # species
  table_zh += " & \multicolumn{1}{l}{\\textbf{Name}}" # script
  table_zh += " & \multicolumn{1}{l}{\\textbf{Tr.}}" # name
  table_zh += " & \multicolumn{1}{l}{\\textbf{Gloss}}" # gloss
  table_zh += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
  table_zh += " \\\\\n\midrule\n" #-----------------------------------------------

  for index, row in names_zh.iterrows():
    if row['status'] == 'default':
      line = "\\textbf{" + str(index+1) + "}" # no.
      line += "\t& \\textbf{" + str(row['species']) + "}" # species
      line += "\t& \\textbf{\\tradchinesefont{" + str(row['script']) + "}}" # script
      line += "\t& \\textbf{" + str(row['term']) + "}" # name
      line += "\t& \\textbf{" + str(row['literal']) + "}" # literal
      # line += "\t& \\textbf{" + str(row['status']) + "}" # status
      line += "\t& \\textbf{" + str(row['source zotero']) + "}" # reference
    else:
      line = str(index+1)
      line += "\t& " + str(row['species']) # species
      line += "\t& \\tradchinesefont{" + str(row['script']) + "}" # script
      line += "\t& " + str(row['term']) # name
      line += "\t& " + str(row['literal']) # lit
      # line += "\t& " + str(row['status']) # status
      line += "\t& " + str(row['source zotero']) # reference
    line += " \\\\"

    table_zh = table_zh + line + "\n"
  caption = "\caption{Various names for " + keys + " in Chinese.}\n"
  label = "\label{table:names_" + key + "_zh}\n"
  table_zh += "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

  # All
  tables = table_en + table_ar + table_zh

  # Write
  filename = re.sub(" ", "_", key)
  filename = filename.lower()

  f = open(path_out_tex + "{}.tex".format("names_" + filename + "_en"), "w", encoding='utf-8')
  f.write(table_en)
  f.close()
  print("Table for names of '" + str(key) + "' in English as a tex file was created.")

  f = open(path_out_tex + "{}.tex".format("names_" + filename + "_ar"), "w", encoding='utf-8')
  f.write(table_ar)
  f.close()
  print("Table for names of '" + str(key) + "' in Arabic as a tex file was created.")

  f = open(path_out_tex + "{}.tex".format("names_" + filename + "_zh"), "w", encoding='utf-8')
  f.write(table_zh)
  f.close()
  print("Table for names of '" + str(key) + "' in Chinese as a tex file was created.")

  return

In [40]:
nametables('allspice')

Table for names of 'allspice' in English as a tex file was created.
Table for names of 'allspice' in Arabic as a tex file was created.
Table for names of 'allspice' in Chinese as a tex file was created.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

### Comparative table

In [41]:
# One table comparing conventionalized words in the three languages

names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
names = names.loc[names['conventionalized'] == 'yes'] # exclude those not in a dictionary

# remove obsolete

names.fillna('', inplace=True)

# # Changes
names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
names.sort_values("status", inplace = True) # sort according to the categories
names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing


# Only keep first dictionary item
names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True) 

names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" #if source not '', add round
names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}" #if source not '', add round
# names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''

# Datasets
names_en = names.loc[names['language'] == 'English']
names_en = names_en.loc[(names_en['id']==key)]
if 'key2' in locals():
  names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
sorted = names_en['status'].astype(str).argsort()
pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
names_en.reset_index(inplace=True, drop=True)

names_ar = names.loc[names['language'] == 'Arabic']
names_ar = names_ar.loc[(names_ar['id']==key)]
if 'key2' in locals():
  names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
sorted = names_ar['status'].astype(str).argsort()
pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
names_ar.reset_index(inplace=True, drop=True)

names_zh = names.loc[names['language'] == 'Chinese']
names_zh = names_zh.loc[(names_zh['id']==key)]
if 'key2' in locals():
  names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
sorted = names_zh['status'].astype(str).argsort()
pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
names_zh.reset_index(inplace=True, drop=True)

if 'key2' in locals():
  keys = key + " and " + key2
else:
  keys = key

begin = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
begin += "{@{}l" # no.
begin += "l" # language
begin += ">{\itshape}l" # term
begin += "L" # gloss
begin += "l" # status
# begin += ">{\\raggedleft\\arraybackslash \small}p{0.25\\textwidth}"
begin += ">{\small}l" # source
begin += "@{}}\n\\toprule\n" # ----------------------------------------------
begin += "\\textbf{\#}" # no.
begin += " & \\textbf{Language}"  # language
begin += " & \multicolumn{1}{l}{\\textbf{Term}}" # term
begin += " & \\textbf{Gloss}"  # gloss
begin += " & \\textbf{Status}"  # status
begin += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
begin += " \\\\\n\midrule\n" #-----------------------------------------------

empty = "\midrule\n & & & & & \\\\\n\midrule\n"
empty = "\midrule\n"
keys = re.sub('_', '', keys)
caption = "\caption{Conventionalized names for " + keys +  " in English, Arabic, and Chinese, found in dictionaries.}\n"
label = "\label{table:names_" + str(key)+ "}\n"
end = "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

lines = ""
lines_en = ""
for index, row in names_en.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['status'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_en += line

lines_ar = ""
for index, row in names_ar.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['status'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_ar += line

lines_zh = ""
for index, row in names_zh.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['status'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_zh += line

lines = lines_en + empty + lines_ar + empty + lines_zh
table = begin + lines + end

# Write
filename = re.sub(" ", "_", key)
filename = filename.lower()
f = open(path_out_tex + "{}.tex".format("names_" + filename), "w", encoding='utf-8')
f.write(table)
f.close()
print("Comparative table for names of '" + str(key) + "' as a tex file was created.")
# print(table)

Comparative table for names of 'allspice' as a tex file was created.


In [42]:
# One table comparing conventionalized words in the three languages, with borrowed status

names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
names = names.loc[names['conventionalized'] == 'yes'] # exclude those not in a dictionary

# remove obsolete

names.fillna('', inplace=True)

# # Changes
names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
names.sort_values("status", inplace = True) # sort according to the categories
names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing


# Only keep first dictionary item
names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True) 

names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" #if source not '', add round
names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}" #if source not '', add round
# names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''

# Datasets
names_en = names.loc[names['language'] == 'English']
names_en = names_en.loc[(names_en['id']==key)]
if 'key2' in locals():
  names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
sorted = names_en['status'].astype(str).argsort()
pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
names_en.reset_index(inplace=True, drop=True)

names_ar = names.loc[names['language'] == 'Arabic']
names_ar = names_ar.loc[(names_ar['id']==key)]
if 'key2' in locals():
  names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
sorted = names_ar['status'].astype(str).argsort()
pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
names_ar.reset_index(inplace=True, drop=True)

names_zh = names.loc[names['language'] == 'Chinese']
names_zh = names_zh.loc[(names_zh['id']==key)]
if 'key2' in locals():
  names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
sorted = names_zh['status'].astype(str).argsort()
pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
names_zh.reset_index(inplace=True, drop=True)

if 'key2' in locals():
  keys = key + " and " + key2
else:
  keys = key

begin = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
begin += "{@{}l" # no.
begin += "l" # language
begin += ">{\itshape}l" # term
begin += "L" # gloss
begin += "l" # status
# begin += ">{\\raggedleft\\arraybackslash \small}p{0.25\\textwidth}"
begin += ">{\small}l" # source
begin += "@{}}\n\\toprule\n" # ----------------------------------------------
begin += "\\textbf{\#}" # no.
begin += " & \\textbf{Language}"  # language
begin += " & \multicolumn{1}{l}{\\textbf{Term}}" # term
begin += " & \\textbf{Gloss}"  # gloss
begin += " & \\textbf{Loan}"  # status
begin += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
begin += " \\\\\n\midrule\n" #-----------------------------------------------

empty = "\midrule\n & & & & & \\\\\n\midrule\n"
empty = "\midrule\n"
keys = re.sub('_', '', keys)
caption = "\caption{Conventionalized names for " + keys +  " in English, Arabic, and Chinese, found in dictionaries.}\n"
key = re.sub(" ", "_", key)
label = "\label{table:names_" + str(key)+ "}\n"
end = "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

lines = ""
lines_en = ""
for index, row in names_en.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['borrowing'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_en += line

lines_ar = ""
for index, row in names_ar.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['borrowing'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_ar += line

lines_zh = ""
for index, row in names_zh.iterrows():
  if row["borrowing"] == 'yes':
    arrow = "" #"\\rightarrow "
  else:
    arrow = ""
  line = str(index+1)
  line += "\t& " + str(row['language'])
  line += "\t& " + arrow + str(row['term'])
  line += "\t& " + str(row['literal'])
  line += "\t& " + str(row['borrowing'])
  line += "\t& " + str(row['dictionary'])
  line += " \\\\\n"
  lines_zh += line

lines = lines_en + empty + lines_ar + empty + lines_zh
table = begin + lines + end

# Write
filename = re.sub(" ", "_", key)
filename = filename.lower()
f = open(path_out_tex + "{}.tex".format("names_" + filename), "w", encoding='utf-8')
f.write(table)
f.close()
print("Comparative table for names of '" + str(key) + "' as a tex file was created.")
# print(table)

Comparative table for names of 'allspice' as a tex file was created.


In [43]:
def comptable(key):

  # One table comparing conventionalized words in the three languages, with borrowed status



  names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
  names = names.loc[names['conventionalized'] == 'yes'] # exclude those not in a dictionary

  # remove obsolete

  names.fillna('', inplace=True)

  # # Changes
  names['status'] = pd.Categorical(names['status'], ["default", "alias", "historic", "obsolete", "related"]) # add categorical order here
  names.sort_values("status", inplace = True) # sort according to the categories
  names.sort_values(['species', 'term'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing


  # Only keep first dictionary item
  names['dictionary'] = names['dictionary'].str.replace(r';.*', '', regex=True) 

  names.loc[names['dictionary'] != '','dictionary'] = "\\textcite{" + names['dictionary'].str.lower() + "}" #if source not '', add round
  names.loc[names['source zotero'] != '','source zotero'] = "\\textcite{" + names['source zotero'] + "}" #if source not '', add round
  # names.loc[names['dictionary'].isnull(),'dictionary'] = names['source zotero'] # Fill column with other if NaN
  names.loc[names['dictionary'] == '','dictionary'] = names['source zotero'] # Fill column with other if ''

  # Datasets
  names_en = names.loc[names['language'] == 'English']
  names_en = names_en.loc[(names_en['id']==key)]
  if 'key2' in locals():
    names_en = names_en.loc[(names_en['id']==key) | (names_en['id']==key2)] # for 2 keys
  sorted = names_en['status'].astype(str).argsort()
  pd.DataFrame(names_en.values[sorted], names_en.index[sorted], names_en.columns)
  names_en.reset_index(inplace=True, drop=True)

  names_ar = names.loc[names['language'] == 'Arabic']
  names_ar = names_ar.loc[(names_ar['id']==key)]
  if 'key2' in locals():
    names_ar = names_ar.loc[(names_ar['id']==key) | (names_ar['id']==key2)]  # for 2 keys
  sorted = names_ar['status'].astype(str).argsort()
  pd.DataFrame(names_ar.values[sorted], names_ar.index[sorted], names_ar.columns)
  names_ar.reset_index(inplace=True, drop=True)

  names_zh = names.loc[names['language'] == 'Chinese']
  names_zh = names_zh.loc[(names_zh['id']==key)]
  if 'key2' in locals():
    names_zh = names_zh.loc[(names_zh['id']==key) | (names_zh['id']==key2)] # for 2 keys
  sorted = names_zh['status'].astype(str).argsort()
  pd.DataFrame(names_zh.values[sorted], names_zh.index[sorted], names_zh.columns)
  names_zh.reset_index(inplace=True, drop=True)

  if 'key2' in locals():
    keys = key + " and " + key2
  else:
    keys = key

  begin = "\\begin{table}[!ht]\n\centering\n\\begin{tabularx}{\\textwidth}"
  begin += "{@{}l" # no.
  begin += "l" # language
  begin += ">{\itshape}l" # term
  begin += "L" # gloss
  begin += "l" # status
  # begin += ">{\\raggedleft\\arraybackslash \small}p{0.25\\textwidth}"
  begin += ">{\small}l" # source
  begin += "@{}}\n\\toprule\n" # ----------------------------------------------
  begin += "\\textbf{\#}" # no.
  begin += " & \\textbf{Language}"  # language
  begin += " & \multicolumn{1}{l}{\\textbf{Term}}" # term
  begin += " & \\textbf{Gloss}"  # gloss
  begin += " & \\textbf{Loan}"  # status
  begin += " & \multicolumn{1}{l}{\\textbf{Source}}" # source
  begin += " \\\\\n\midrule\n" #-----------------------------------------------

  empty = "\midrule\n & & & & & \\\\\n\midrule\n"
  empty = "\midrule\n"
  keys = re.sub('_', '', keys)
  caption = "\caption{Conventionalized names for " + keys +  " in English, Arabic, and Chinese, found in dictionaries.}\n"
  key = re.sub(" ", "_", key)
  label = "\label{table:names_" + str(key)+ "}\n"
  end = "\\bottomrule\n\end{tabularx}\n" + caption + label + "\end{table}\n\n"

  lines = ""
  lines_en = ""
  for index, row in names_en.iterrows():
    if row["borrowing"] == 'yes':
      arrow = "" #"\\rightarrow "
    else:
      arrow = ""
    line = str(index+1)
    line += "\t& " + str(row['language'])
    line += "\t& " + arrow + str(row['term'])
    line += "\t& " + str(row['literal'])
    line += "\t& " + str(row['borrowing'])
    line += "\t& " + str(row['dictionary'])
    line += " \\\\\n"
    lines_en += line

  lines_ar = ""
  for index, row in names_ar.iterrows():
    if row["borrowing"] == 'yes':
      arrow = "" #"\\rightarrow "
    else:
      arrow = ""
    line = str(index+1)
    line += "\t& " + str(row['language'])
    line += "\t& " + arrow + str(row['term'])
    line += "\t& " + str(row['literal'])
    line += "\t& " + str(row['borrowing'])
    line += "\t& " + str(row['dictionary'])
    line += " \\\\\n"
    lines_ar += line

  lines_zh = ""
  for index, row in names_zh.iterrows():
    if row["borrowing"] == 'yes':
      arrow = "" #"\\rightarrow "
    else:
      arrow = ""
    line = str(index+1)
    line += "\t& " + str(row['language'])
    line += "\t& " + arrow + str(row['term'])
    line += "\t& " + str(row['literal'])
    line += "\t& " + str(row['borrowing'])
    line += "\t& " + str(row['dictionary'])
    line += " \\\\\n"
    lines_zh += line

  lines = lines_en + empty + lines_ar + empty + lines_zh
  table = begin + lines + end

  # Write
  filename = re.sub(" ", "_", key)
  filename = filename.lower()
  f = open(path_out_tex + "{}.tex".format("names_" + filename), "w", encoding='utf-8')
  f.write(table)
  f.close()
  print("Comparative table for names of '" + str(key) + "' as a tex file was created.")
  # print(table)
  return

In [44]:
comptable('allspice')

Comparative table for names of 'allspice' as a tex file was created.


### Automate

In [45]:
# Loop all the dataset
def tables():
  for key in keylist:
    nametables(key)
  print('Done')

  for key in keylist:
    comptable(key)
  print('Done')

tables()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

Table for names of 'Sichuan_pepper' in English as a tex file was created.
Table for names of 'Sichuan_pepper' in Arabic as a tex file was created.
Table for names of 'Sichuan_pepper' in Chinese as a tex file was created.
Table for names of 'allspice' in English as a tex file was created.
Table for names of 'allspice' in Arabic as a tex file was created.
Table for names of 'allspice' in Chinese as a tex file was created.
Table for names of 'anise' in English as a tex file was created.
Table for names of 'anise' in Arabic as a tex file was created.
Table for names of 'anise' in Chinese as a tex file was created.
Table for names of 'asafoetida' in English as a tex file was created.
Table for names of 'asafoetida' in Arabic as a tex file was created.
Table for names of 'asafoetida' in Chinese as a tex file was created.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

Table for names of 'caraway' in English as a tex file was created.
Table for names of 'caraway' in Arabic as a tex file was created.
Table for names of 'caraway' in Chinese as a tex file was created.
Table for names of 'cardamom' in English as a tex file was created.
Table for names of 'cardamom' in Arabic as a tex file was created.
Table for names of 'cardamom' in Chinese as a tex file was created.
Table for names of 'cassia' in English as a tex file was created.
Table for names of 'cassia' in Arabic as a tex file was created.
Table for names of 'cassia' in Chinese as a tex file was created.
Table for names of 'chile' in English as a tex file was created.
Table for names of 'chile' in Arabic as a tex file was created.
Table for names of 'chile' in Chinese as a tex file was created.
Table for names of 'cinnamon' in English as a tex file was created.
Table for names of 'cinnamon' in Arabic as a tex file was created.
Table for names of 'cinnamon' in Chinese as a tex file was created.
Tab



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

Table for names of 'long_pepper' in English as a tex file was created.
Table for names of 'long_pepper' in Arabic as a tex file was created.
Table for names of 'long_pepper' in Chinese as a tex file was created.
Table for names of 'mace' in English as a tex file was created.
Table for names of 'mace' in Arabic as a tex file was created.
Table for names of 'mace' in Chinese as a tex file was created.
Table for names of 'nutmeg' in English as a tex file was created.
Table for names of 'nutmeg' in Arabic as a tex file was created.
Table for names of 'nutmeg' in Chinese as a tex file was created.
Table for names of 'pepper' in English as a tex file was created.
Table for names of 'pepper' in Arabic as a tex file was created.
Table for names of 'pepper' in Chinese as a tex file was created.
Table for names of 'saffron' in English as a tex file was created.
Table for names of 'saffron' in Arabic as a tex file was created.
Table for names of 'saffron' in Chinese as a tex file was created.
Tab

## Create etymological data

In [52]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)


# Load in dataset
df_etymologies=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df_etymologies, df_etymologies[df_etymologies.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  df_list.append(df)

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(ids)
print(length, "words in total")

#Create a defaultdict of spice-word etymologies
etymologies=defaultdict(list)
for i in range(length):
  etymologies[ids[i]]=df_list[i]

# Testing
# print(etymologies['saffron'])
# print(type(etymologies['saffron']))
# etymologies['saffron']

['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']
84 words in total


### Etymology box for LaTeX

In [53]:
key = "allspice"

################################################################################

# The following code will create a etymology box environment for the key, to be used in LaTeX
print("Started the generation of '" + key + "' as etymbox...")

df_local = etymologies[key]
# df_local.fillna('', inplace=True)

# # Skipping those marked
df_local = df_local[df_local['boxskip'] != 'yes']
df_local.reset_index(inplace=True, drop=True)

# # Replace empty cells with NaNs
# df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

# Create content and sources step by step
content = ""
source = ""
sources = ""
source_list = []
nl = "\n"

for index, row in df_local.iterrows():
  step = "< "
  if pd.notna(row['complex']): # complex relationships
    row['complex'] = re.sub("and from", "+", row['complex'])
    step += row['complex'] + " "
  if pd.notna(row['language']): # language
    step += row['language'] + " "
  if pd.notna(row['script']): # script
    script = "{" + row['script'] + "} "
    if row['language'] == 'Chinese':
      script = "\\tc{" + row['script'] + "} "
    step += script
  if pd.notna(row['term']): # term
    step += "\\textit{" + row['term'] + "} "
  if pd.notna(row['IPA']): # IPA
    step += row['IPA'] + " "
  if pd.notna(row['meaning']): # meaning
    step += "`" + row['meaning'] + "' "
  if pd.notna(row['literal']): # literal meaning
    step += "[" + row['literal'] + "] "
  step = re.sub(' $', '', step)
  step += ", "
  if pd.notna(row['explanation']): # explanation
    step += row['explanation'] + " "
  if pd.notna(row['remark']): # remark
    step += "(" + row['remark'] + ") "
  step = re.sub(',? ?$', '', step)

  if pd.notna(row['date']): # dates
    step += ", "
    row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
    row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
    if re.match('^-\d\d?$', row['date']): # if is a century BC
      row['date'] = re.sub("-", "", row['date'])
      date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
      # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
    elif re.match('^\d\d?$', row['date']): # if is a century AD
      date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
      # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
    else:
      date = row['date'] + " " # if it's a year
    step += date
    step = re.sub(',? ?$', '', step)
  if pd.notna(row['cognates']): # cognates
    step += "; cf. cognates " + row['cognates'] + " "
  if pd.notna(row['derivates']): # cognates
    if pd.notna(row['cognates']):
      step = re.sub(' $', '', step)
      step += "; " + row['derivates'] + " "
    else:
      step = re.sub(' $', '', step)
      step += "; cf. " + row['derivates'] + " "
  step = re.sub(',? ?$', '', step)
  # step = re.sub('cf\..*?(cf\.)', '', step)

# Final touches
  if row['doubt'] == 'yes':
    step = re.sub(r'<', '<\\\\textss{?}', step) # ???
  if row['complex'] == '+':
    step = re.sub('<', '', step)
  if row['complex'] == 'or from':
    step = re.sub('<', '', step)
  content += step + nl

# Sources
  source=""
  if pd.notna(row['source zotero']):
    source = row['source zotero']
    print("1",source)
    if '{' in source:
      source = "s" + row['source zotero'].lower()
      print(source)
    else:
      source = "{" + row['source zotero'].lower() + "}"
      print(source)
    if pd.notna(row['source page']):
      source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
      print("4",source)
      if row['source page'].isalpha() == True:
        source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
        print("5",source)
    source = "\\textcite" + source
  print(source)
  source_list.append(source)

# clear duplicates from sources:
print("SL1: ", source_list)
# source_set = sorted(set(source_list), key=source_list.index)
source_set = set(source_list)
print("SS2: ", source_set)
source_list2 = list(source_set)
print("S3: ", source_list2)
sources_unduplicated = '; '.join(source_list2)
print("S4: ", sources_unduplicated)
# test for duplicates
newlist = [] # empty list to hold unique elements from the list
duplist = [] # empty list to hold the duplicate elements from the list
for i in source_list:
    if i not in newlist:
        newlist.append(i)
    else:
        duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
# The next step is to print the duplicate entries, and the unique entries
print("List of duplicates", duplist)
print("Unique Item List", newlist) 
if len(duplist) > 0:
  # print("UNDUPL")
  sources = sources_unduplicated
else:
  # print("ORI")
  sources =  '; '.join(source_list)
# print("S5: ", sources)

sources =  '; '.join(source_list)

# Cleaning
sources = re.sub("; $", "", sources)
sources = re.sub("^; ", "", sources)
sources = re.sub("(; )+", "; ", sources)
sources = "\\footnote{" + sources + "}\n"

content = re.sub("\n$", "", content)
content = re.sub(r"^< ", "", content) # delete the first <
content = re.sub(r"\n,", ",", content)
content = re.sub(r" nan ", " ", content)
content = re.sub("(<\.\n?)+$", "", content)

content += sources

env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['id'].iloc[0] + "}" + nl 
env_end = r"\end{etymology}"

box = env_begin + content + env_end
box = re.sub(r"\u200e", "", box) #removes right to left mark

# Save the spicebox as a standalone tex file

filename = re.sub(" ", "_", key)
filename = filename.lower()
f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')
f.write(box)
f.close()
print("Etymology-box '" + str(key) + "' as a tex file was created.")
box

Started the generation of 'allspice' as etymbox...
1 oed
{oed}
4 [allspice]{oed}
5 [s.v. allspice]{oed}
\textcite[s.v. allspice]{oed}
SL1:  ['\\textcite[s.v. allspice]{oed}']
SS2:  {'\\textcite[s.v. allspice]{oed}'}
S3:  ['\\textcite[s.v. allspice]{oed}']
S4:  \textcite[s.v. allspice]{oed}
List of duplicates []
Unique Item List ['\\textcite[s.v. allspice]{oed}']
Etymology-box 'allspice' as a tex file was created.


'\\begin{etymology}\\label{ety:allspice}\nEnglish \\textit{allspice}, from \\textit{all} + \\textit{spice}; after the flavour profile that resembles the combined aroma of cloves, nutmeg, cinnamon, and black pepper, 1621\\footnote{\\textcite[s.v. allspice]{oed}}\n\\end{etymology}'

In [54]:
def etymbox(key):
  
  # The following code will create a etymology box environment for the key, to be used in LaTeX
  print("Started the generation of '" + key + "' as etymbox...")

  df_local = etymologies[key]
  # df_local.fillna('', inplace=True)

  # # Skipping those marked
  df_local = df_local[df_local['boxskip'] != 'yes']
  df_local.reset_index(inplace=True, drop=True)

  # # Replace empty cells with NaNs
  # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
  # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

  # Create content and sources step by step
  content = ""
  source = ""
  sources = ""
  source_list = []
  nl = "\n"

  for index, row in df_local.iterrows():
    step = "< "
    if pd.notna(row['complex']): # complex relationships
      row['complex'] = re.sub("and from", "+", row['complex'])
      step += row['complex'] + " "
    if pd.notna(row['language']): # language
      step += row['language'] + " "
    if pd.notna(row['script']): # script
      script = "{" + row['script'] + "} "
      if row['language'] == 'Chinese':
        script = "\\tc{" + row['script'] + "} "
      step += script
    if pd.notna(row['term']): # term
      step += "\\textit{" + row['term'] + "} "
    if pd.notna(row['IPA']): # IPA
      step += row['IPA'] + " "
    if pd.notna(row['meaning']): # meaning
      step += "`" + row['meaning'] + "' "
    if pd.notna(row['literal']): # literal meaning
      step += "[" + row['literal'] + "] "
    step = re.sub(' $', '', step)
    step += ", "
    if pd.notna(row['explanation']): # explanation
      step += row['explanation'] + " "
    if pd.notna(row['remark']): # remark
      step += "(" + row['remark'] + ") "
    step = re.sub(',? ?$', '', step)

    if pd.notna(row['date']): # dates
      step += ", "
      row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
      row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
      if re.match('^-\d\d?$', row['date']): # if is a century BC
        row['date'] = re.sub("-", "", row['date'])
        date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
        # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
      elif re.match('^\d\d?$', row['date']): # if is a century AD
        date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
        # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
      else:
        date = row['date'] + " " # if it's a year
      step += date
      step = re.sub(',? ?$', '', step)
    if pd.notna(row['cognates']): # cognates
      step += "; cf. cognates " + row['cognates'] + " "
    if pd.notna(row['derivates']): # cognates
      if pd.notna(row['cognates']):
        step = re.sub(' $', '', step)
        step += "; " + row['derivates'] + " "
      else:
        step = re.sub(' $', '', step)
        step += "; cf. " + row['derivates'] + " "
    step = re.sub(',? ?$', '', step)
    # step = re.sub('cf\..*?(cf\.)', '', step)

  # Final touches
    if row['doubt'] == 'yes':
      step = re.sub(r'<', '<\\\\textss{?}', step) # ???
    if row['complex'] == '+':
      step = re.sub('<', '', step)
    if row['complex'] == 'or from':
      step = re.sub('<', '', step)
    content += step + nl

  # Sources
    source=""
    if pd.notna(row['source zotero']):
      source = row['source zotero']
      # print(source)
      if '{' in source:
        source = "s" + row['source zotero'].lower()
        # print(source)
      else:
        source = "{" + row['source zotero'].lower() + "}"
        # print(source)
      if pd.notna(row['source page']):
        source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
        # print(source)
        if row['source page'].isalpha() == True:
          source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
          # print(source)
      source = "\\textcite" + source
    # print(source)
    source_list.append(source)

  # clear duplicates from sources:
  # print("SL1: ", source_list)
  # source_set = sorted(set(source_list), key=source_list.index)
  source_set = set(source_list)
  # print("SS2: ", source_set)
  source_list2 = list(source_set)
  # print("S3: ", source_list2)
  sources_unduplicated = '; '.join(source_list2)
  # print("S4: ", sources_unduplicated)
  # test for duplicates
  newlist = [] # empty list to hold unique elements from the list
  duplist = [] # empty list to hold the duplicate elements from the list
  for i in source_list:
      if i not in newlist:
          newlist.append(i)
      else:
          duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
  # The next step is to print the duplicate entries, and the unique entries
  # print("List of duplicates", duplist)
  # print("Unique Item List", newlist) 
  if len(duplist) > 0:
    # print("UNDUPL")
    sources = sources_unduplicated
  else:
    # print("ORI")
    sources =  '; '.join(source_list)
  # print("S5: ", sources)

  sources =  '; '.join(source_list)


  # Cleaning
  sources = re.sub("; $", "", sources)
  sources = re.sub("^; ", "", sources)
  sources = re.sub("(; )+", "; ", sources)
  sources = "\\footnote{" + sources + "}\n"

  content = re.sub("\n$", "", content)
  content = re.sub(r"^< ", "", content) # delete the first <
  content = re.sub(r"\n,", ",", content)
  content = re.sub(r" nan ", " ", content)
  content = re.sub("(<\.\n?)+$", "", content)

  content += sources

  env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['id'].iloc[0] + "}" + nl 
  env_end = r"\end{etymology}"

  box = env_begin + content + env_end
  box = re.sub(r"\u200e", "", box) #removes right to left mark

  # Save the spicebox as a standalone tex file
  filename = re.sub(" ", "_", key)
  filename = filename.lower()
  f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')  
  f.write(box)
  f.close()
  print("Etymology-box '" + str(key) + "' as a tex file was created.")

  return box

In [55]:
etymbox("allspice")

Started the generation of 'allspice' as etymbox...
Etymology-box 'allspice' as a tex file was created.


'\\begin{etymology}\\label{ety:allspice}\nEnglish \\textit{allspice}, from \\textit{all} + \\textit{spice}; after the flavour profile that resembles the combined aroma of cloves, nutmeg, cinnamon, and black pepper, 1621\\footnote{\\textcite[s.v. allspice]{oed}}\n\\end{etymology}'

In [56]:
# Loop all the dataset
def etymboxes():
  for key in ids:
    etymbox(key)
  print('Done')

etymboxes()

Started the generation of 'tester' as etymbox...
Etymology-box 'tester' as a tex file was created.
Started the generation of 'allspice' as etymbox...
Etymology-box 'allspice' as a tex file was created.
Started the generation of 'fulful ifranji' as etymbox...
Etymology-box 'fulful ifranji' as a tex file was created.
Started the generation of 'duoxiangguo' as etymbox...
Etymology-box 'duoxiangguo' as a tex file was created.
Started the generation of 'pimento' as etymbox...
Etymology-box 'pimento' as a tex file was created.
Started the generation of 'anise' as etymbox...
Etymology-box 'anise' as a tex file was created.
Started the generation of 'anisun' as etymbox...
Etymology-box 'anisun' as a tex file was created.
Started the generation of 'huiqin' as etymbox...
Etymology-box 'huiqin' as a tex file was created.
Started the generation of 'asafoetida' as etymbox...
Etymology-box 'asafoetida' as a tex file was created.
Started the generation of 'hing' as etymbox...
Etymology-box 'hing' as 

### Etymology tree for LaTeX

In [None]:
# # Parboxed, linear tree
# key = "hal"



# # ------------------------------------------------------------------------------
# df_local = etymologies[key]
# df_local = df_local[df_local['treeskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# c1=0.5
# c2=0.2
# c3=0.2

# meaning_size = "\\small" # "\\footnotesize"
# language_size = "" # "\\small"

# # Terms
# tree = ""
# lines = ""
# par_begin = "" #"\\parbox{" + str(c1) + "\\textwidth}{\centering "
# par_begin = "\\parbox{" + str(c1) + "\\textwidth}{\centering "
# par_end = "" #"}"
# par_end = "}"
# for index, row in df_local.iterrows():
#   if pd.notnull(row['term']):
#     if pd.notnull(row['script']):
#       line = "{" + row['script'] + "} \\textit{" + str(row['term']) + "} \\\ " + row['language']
#     else:
#       line = "\\textit{" + str(row['term']) + "} \\\ " + str(row['language'])
#   else:
#     line = row['language']
#   if pd.notnull(row['meaning']):
#                 line += " \\\ {" + meaning_size + "`" + row['meaning'] + "'}"
#   if pd.notnull(row['explanation']):
#                 line += " {" + meaning_size + "(" + row['explanation'] + ")}"
#   if row['loanword'] == "yes":
#     line = "\\hspace{-1.25em} \\textcolor{\\accentcolor}{\\rightarrow} " + line
#   line = par_begin + line + par_end
#   if index == 0:
#     line = line + ", no edge, baseline, for tree={l sep=20,s sep=2mm}"
#   if row['doubt'] == "yes":
#     line = line + ", edge=dashed"
#   # if row['influencer'] == "yes":
#   #   line = line + ", edge={PolyU}"
#   #   line = line + ", edge label={node[midway,right,font=\scriptsize]{borrowing}}"
#   line = "\t"*index + "[" + line
#   line = line  + ", name=" + str(row['language'])
#   line = line  + ", tier=" + str(index)
#   line = re.sub(r" nan ", " ", line)
#   lines = lines + line + "\n"
# lines += "]"*(index+1) + "\n"

# # Dates
# par_begin = "" #"\\parbox{" + str(c3) + "\\textwidth}{\centering "
# par_begin = "\\parbox{" + str(c2) + "\\textwidth}{\centering "
# par_end = "" #"}"
# par_end = "}"
# dates = ""
# for index, row in df_local.iterrows():
#   if pd.notna(row['date']): # dates
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       # date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       # date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#   else:
#     date = ""
#   date = par_begin + date + par_end + ", no edge, tier=" + str(index)
#   date = "\t"*index + "[" + date
#   date = re.sub('c(?=\d)', 'ca. ', date)
#   dates += date + "\n"
# dates += "]"*(index+1) + "\n"

# # References
# par_begin = "" #"\\parbox{" + str(c3) + "\\textwidth}{\centering "
# par_begin = "\\parbox{" + str(c3) + "\\textwidth}{\centering "
# par_end = "" #"}"
# par_end = "}"
# references = ""
# for index, row in df_local.iterrows():
#   if r'_' in str(row['reference']):
#     row['reference'] = "\\textcite{" + row['reference'] + "}"
#   tier = ", no edge, tier=" + str(index)
#   nl = "; \\\\"
#   size = ""
#   ll = ""
#   reference = ""
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.notna(row['ref link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + nl + size + link2 + ref2 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + nl + size + ref2 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.isna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + ref1 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
# references += "]"*(index+1) + "\n"

# # Sources
# sources = ""
# for index, row in df_local.iterrows():
#   if pd.notna(row['source zotero']):
#     source = "\\textcite{" + row['source zotero'].lower() + "}"
#     # if row['source zotero'].str.contains(pat = '; '): # what if a cell has 2 sources?
#     if pd.notna(row['source page']):
#       source = "\\textcite[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       if row['source page'].isalpha() == True:
#         source = "\\textcite[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#   source_list.append(source)

# # Clear duplicates from sources:
# source_set = sorted(set(source_list), key=source_list.index)
# source_list2 = list(source_set)
# sources_unduplicated = '; '.join(source_list2)
# # Test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i)
# if len(duplist) > 0:
#   sources = sources_unduplicated
# else:
#   sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\n\n\\bigskip\n\\noindent{\\footnotesize\n\\textbf{Sources:} " + sources + ".}\n"

# # intro = "\\begin{forest}\nfor tree={align=center,calign=first,edge={line width=1pt}}\n[, l sep=0\n"
# intro = "\\begin{forest}\nfor tree={base=top,align=center,calign=center}\n[, l sep=0\n"
# outro = "]\n\end{forest}"
# tree = intro + lines + dates + references + outro

# tree = tree.replace("[nan", "[")
# tree = tree.replace("{\centering nan}", "{\centering }")
# tree = tree.replace("#", "\#")
# tree = tree.replace("%", "\%")
# tree = tree.replace("&", "\&")

# label = "\label{fol:" + df_local['id'].iloc[0] + "}\n"

# tree = "\\begin{folio}" + "{" + df_local['id'].iloc[0].capitalize() + "}" + label + tree + sources + "\n\\end{folio}"
# tree

In [None]:
# # Not parboxed, linear tree
# key = "hal"



# # ------------------------------------------------------------------------------
# df_local = etymologies[key]
# df_local = df_local[df_local['treeskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# c1=0.5
# c2=0.1
# c3=0.1

# term_size = "\\large "
# meaning_size = "\\small " # "\\footnotesize"
# language_size = "\\normalsize "

# # Terms
# tree = ""
# lines = ""
# par_begin = "" #"\\parbox{" + str(c1) + "\\textwidth}{\centering "
# # par_begin = "\\parbox{" + str(c1) + "\\textwidth}{\centering "
# par_end = "" #"}"
# # par_end = "}"
# for index, row in df_local.iterrows():
#   if pd.notnull(row['term']):
#     if pd.notnull(row['script']):
#       line = "{" + term_size + row['script'] + "} {" + term_size + "\\textit{" + str(row['term']) + "}} \\\ {" + language_size + str(row['language']) + "}"
#     else:
#       line = "{" + term_size + "\\textit{" + str(row['term']) + "}} \\\ {" + language_size + str(row['language']) + "}"
#   else:
#     line = "{" + language_size + str(row['language']) + "}"
#   if pd.notnull(row['meaning']):
#                 line += " \\\ {" + meaning_size + "`" + row['meaning'] + "'}"
#   if pd.notnull(row['explanation']):
#                 line += " {" + meaning_size + "(" + row['explanation'] + ")}"
#   if row['loanword'] == "yes":
#     line = "\\hspace{-1.25em} \\textcolor{\\accentcolor}{\\rightarrow} " + line
#   line = par_begin + line + par_end
#   if index == 0:
#     line = line + ", no edge, baseline,"
#   if row['doubt'] == "yes":
#     line = line + ", edge=dashed"
#   # if row['influencer'] == "yes":
#   #   line = line + ", edge={PolyU}"
#   #   line = line + ", edge label={node[midway,right,font=\scriptsize]{borrowing}}"
#   line = "\t"*index + "[" + line
#   line = line  + ", name=" + str(row['language'])
#   line = line  + ", tier=" + str(index)
#   line = re.sub(r" nan ", " ", line)
#   lines = lines + line + "\n"
# lines += "]"*(index+1) + "\n"

# # Dates
# par_begin = "" #"\\parbox{" + str(c3) + "\\textwidth}{\centering "
# # par_begin = "\\parbox{" + str(c2) + "\\textwidth}{\\raggedright "
# par_end = "" #"}"
# # par_end = "}"
# dates = ""
# for index, row in df_local.iterrows():
#   if pd.notna(row['date']): # dates
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       # date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       # date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#   else:
#     date = ""
#   date = par_begin + date + par_end + ", no edge, tier=" + str(index)
#   date = "\t"*index + "[" + date
#   date = re.sub('c(?=\d)', 'ca. ', date)
#   dates += date + "\n"
# dates += "]"*(index+1) + "\n"

# # References
# par_begin = "" #"\\parbox{" + str(c3) + "\\textwidth}{\centering "
# # par_begin = "\\parbox{" + str(c3) + "\\textwidth}{\\raggedleft "
# par_end = "" #"}"
# # par_end = "}"
# references = ""
# for index, row in df_local.iterrows():
#   if r'_' in str(row['reference']):
#     row['reference'] = "\\textcite{" + row['reference'] + "}"
#   tier = ", no edge, tier=" + str(index)
#   nl = "; \\\\"
#   size = ""
#   ll = ""
#   reference = ""
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.notna(row['ref link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + nl + size + link2 + ref2 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + nl + size + ref2 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.notna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + link1 + ref1 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
#   if pd.isna(row['ref link']) and pd.notna(row['reference']) and pd.isna(row['ref link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['ref link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['ref link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = par_begin + size + ref1 + ll + par_end + tier
#     reference = "\t"*index + "[" + reference
#     references += reference + "\n"
# references += "]"*(index+1) + "\n"

# # Sources
# source_list = []
# for index, row in df_local.iterrows():
#   if pd.notna(row['source zotero']):
#     source = "\\textcite{" + row['source zotero'].lower() + "}"
#     # if row['source zotero'].str.contains(pat = '; '): # what if a cell has 2 sources?
#     if pd.notna(row['source page']):
#       source = "\\textcite[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       if row['source page'].isalpha() == True:
#         source = "\\textcite[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#   source_list.append(source)

# # Clear duplicates from sources:
# source_set = sorted(set(source_list), key=source_list.index)
# source_list2 = list(source_set)
# sources_unduplicated = '; '.join(source_list2)
# # Test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i)
# if len(duplist) > 0:
#   sources = sources_unduplicated
# else:
#   sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\n\n\\bigskip\n\\raggedright\n\\noindent{\\footnotesize\n\\textbf{Sources:} " + sources + ".}\n"

# # intro = "\\begin{forest}\nfor tree={align=center,calign=first,edge={line width=1pt}}\n[, l sep=0\n"
# intro = "\\begin{forest}\nfor tree={base=top, calign=center, align=center, s sep=10mm, l sep=10mm}\n[, l sep=0\n"
# outro = "]\n\end{forest}"
# tree = intro + lines + dates + references + outro

# tree = tree.replace("[nan", "[")
# tree = tree.replace("{\centering nan}", "{\centering }")
# tree = tree.replace("#", "\#")
# tree = tree.replace("%", "\%")
# tree = tree.replace("&", "\&")

# label = "\label{fol:" + df_local['id'].iloc[0] + "}\n"

# tree = "\\begin{folio}" + "{" + df_local['id'].iloc[0].capitalize() + "}" + label + tree + sources + "\n\\end{folio}"
# tree

In [None]:
# # Improved, parboxed, for forked one that has 'partly'
# key = "anise"

# df_local = etymology[key]
# # df_local = df_local[df_local['skip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# # Move rows to end of the df by condition
# forked = 'fork' in df_local['complex'].values
# print(forked)
# if forked == True:
#   p = df_local.index[df_local['complex'] == 'fork'].tolist()
#   idx = df_local.index.tolist()
#   length = len(idx)
#   fork = p[0]
#   idx.pop(fork)
#   df_local = df_local.reindex(idx + [p[0]])
#   df_local.reset_index(inplace=True, drop=True)
#   idx = df_local.index.tolist()

# tree = ""
# lines = ""
# par_begin = "\\parbox{0.3\\textwidth}{\centering "
# par_end = "}"
# for index, row in df_local.iterrows():
#   line = "\\normalsize \\textit{" + row['term'] + "} \\\ \\small{" + row['language'] + "}"
#   if pd.notnull(row['meaning']):
#                 line += " \\\ \\footnotesize{`" + row['meaning'] + "'}"
#   if pd.notnull(row['explanation']):
#                 line += " \\footnotesize{(" + row['explanation'] + ")}"
#   if row['loanword'] == "yes":
#     line = "\\hspace{-1.25em} \\textcolor{OliveGreen}{\\rightarrow} " + line
#   line = "[" + par_begin + line + par_end
#   if index == idx[0]:
#     line = line + ", no edge, baseline"
#   if row['doubt'] == "yes":
#     line = line + ", edge=dashed"
#   # if row['influencer'] == "yes":
#   #   line = line + ", edge={PolyU}"
#   #   line = line + ", edge label={node[midway,right,font=\scriptsize]{influenced by}}"

#   if forked == True:
#     if index == idx[-2]: #second to last
#       line += ", tier=" + str(index)
#       line += "]"*(length-fork-1)
#     elif index == idx[-1]: #last (the branch)
#       line += ", tier=" + str(fork)
#       line += "]"*(fork+1)
#     else:
#       line += ", tier=" + str(index)
#     lines += line + "\n"
#   else:
#     line += ", tier=" + str(index)
#     lines += line + "\n"
#     lines += "]"*(index+1)

# # Dates
# par_begin = "\\parbox{0.1\\textwidth}{\centering "
# par_end = "}"
# dates = ""
# for index, row in df_local.iterrows():
#   date = "[" + par_begin + str(row['date']) + par_end + ", no edge"

#   if forked == True:
#     if index == idx[-2]: #second to last
#       date += ", tier=" + str(index)
#       date += "]"*(length-fork-1)
#     elif index == idx[-1]: #last (the branch)
#       date += ", tier=" + str(fork)
#       date += "]"*(fork+1)
#     else:
#       date += ", tier=" + str(index)
#     dates += date + "\n"
#   else:
#     date += ", tier=" + str(index)
#     dates += date + "\n"
#     dates += "]"*(index+1)

# # References
# par_begin = "\\parbox{0.1\\textwidth}{\centering "
# par_end = "}"
# references = ""
# for index, row in df_local.iterrows():
#   edge = ", no edge"
#   begin = "["
#   nl = "; \\\\"
#   size = "\\small {"
#   ll = "}"
#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.notna(row['link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + par_begin + size + link1 + ref1 + ll + nl + size + link2 + ref2 + ll + par_end + edge

#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + par_begin + size + link1 + ref1 + ll + nl + size + ref2 + ll + par_end + edge

#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + par_begin + size + link1 + ref1 + ll + par_end + edge

#   if pd.isna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + par_begin + size + ref1 + ll + par_end + edge

#   if forked == True:
#     if index == idx[-2]: #second to last
#       reference += ", tier=" + str(index)
#       reference += "]"*(length-fork-1)
#     elif index == idx[-1]: #last (the branch)
#       reference += ", tier=" + str(fork)
#       reference += "]"*(fork+1)
#     else:
#       reference += ", tier=" + str(index)
#     references += reference + "\n"
#   else:
#     reference += ", tier=" + str(index)
#     references += reference + "\n"
#     references += "]"*(index+1)



# # intro = "\\begin{forest}\nfor tree={align=center,calign=first,edge={line width=1pt}}\n[, l sep=0\n"
# intro = "\\begin{forest}\nfor tree={align=center,calign=first}\n[, l sep=0\n"
# outro = "]\n\end{forest}"
# tree = intro + lines + dates + references + outro

# tree = tree.replace("[nan", "[")
# tree = tree.replace("{\centering nan}", "{\centering }")
# tree = tree.replace("#", "\#")
# tree = tree.replace("%", "\%")
# tree = tree.replace("&", "\&")
# label = "\label{fol:" + df_local['id'].iloc[0] + "}\n"

# sources = ""
# for index, row in df_local.iterrows():
#   if pd.notna(row['source zotero']):
#     sources += str(row['source zotero']) + "; " 
#     source_list = sources.split('; ')
#     source_set = set(source_list)
#     source = '; '.join(source_set)

# source = re.sub('; $', '.', source)
# source = "\n\n\\bigskip\n\\raggedright\nSources: " + source

# tree = "\\begin{folio}" + "{" + df_local['id'].iloc[0].capitalize() + "}" + label + tree + source + "\n\\end{folio}"

# tree

In [None]:
# # The following code will create a folio environment for the key, to be used in LaTeX, simple

# key = "pimento"

# df_local = etymology[key]
# df_local = df_local[df_local['skip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# tree = ""
# lines = ""
# for index, row in df_local.iterrows():   
#   line = row['term'] + "\\\\\small{" + row['language'] + "}"
#   if pd.notnull(row['meaning']):
#                 line += " \\\ \\footnotesize{`" + row['meaning'] + "'}"
#   if pd.notnull(row['explanation']):
#                 line += " \\footnotesize{(" + row['explanation'] + ")}"
#   if row['loanword'] == "yes":
#     line = "\\hspace{-1.25em} \\textcolor{OliveGreen}{\\rightarrow} " + line
#   if index == 0:
#     line = line + ", no edge, baseline"
#   if row['doubt'] == "yes":
#     line = line + ", edge=dotted"
#   # if row['loaner'] == "yes":
#   #   line = line + ", edge={PolyU}"
#     # line = line + ", edge label={node[midway,right,font=\scriptsize]{borrowing}}"
#   line = "[" + line
#   line = line  + ", tier=" + str(index)

#   lines = lines + line + "\n"
# lines += "]"*(index+1) + "\n"
# # print(lines)

# dates = ""
# for index, row in df_local.iterrows():
#   date = "[" + str(row['date']) + ", no edge, tier=" + str(index)
#   dates = dates + date + "\n"
# dates += "]"*(index+1) + "\n"
# # print(dates)

# references = ""
# for index, row in df_local.iterrows():
#   tier = ", no edge, tier=" + str(index)
#   begin = "["
#   nl = ";\\\\"
#   sma = "\\small{"
#   ll = "}"
#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.notna(row['link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + sma + link1 + ref1 + ll + nl + sma + link2 + ref2 + ll+ tier
#     references = references + reference + "\n"
#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.notna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + sma + link1 + ref1 + ll + nl + sma + ref2 + ll + tier
#     references = references + reference + "\n"
#   if pd.notna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + sma + link1 + ref1 + ll + tier
#     references = references + reference + "\n"
#   if pd.isna(row['link']) and pd.notna(row['reference']) and pd.isna(row['link2']) and pd.isna(row['reference2']):
#     link1 = "\href{" + str(row['link']) + "}"
#     ref1 = "{" + str(row['reference']) + "}"
#     link2 = "\href{" + str(row['link2']) + "}"
#     ref2 = "{" + str(row['reference2']) + "}"
#     reference = begin + sma + ref1 + ll + tier
#     references = references + reference + "\n"
# references += "]"*(index+1) + "\n"
# # print(references)

# intro = "\\begin{forest}\nfor tree={grow=270,align=center,calign=first,edge={line width=1pt}}\n[, l sep=0\n"
# outro = "]\n\end{forest}"
# tree = intro + lines + dates + references + outro

# tree = tree.replace("[nan", "[")
# tree = tree.replace("#", "\#")
# tree = tree.replace("%", "\%")
# tree = tree.replace("&", "\&")
# tree = "\\begin{folio}{" + df_local['id'].iloc[0].capitalize() + "}\n" + tree + "\n\\end{folio}"

# tree

In [None]:
# "" + " | " + row['script']
# edge label={node[midway,below,font=\scriptsize]{influenced}}
# edge={line width=1pt}
# edge={PolyU} 

# [,no edge]

## Create language data

In [None]:
# Path
path = "data/languages/"

### WALS

In [None]:
# load in dataset (old)
df=pd.read_csv(path + 'wals_languages_ori.csv', delimiter=',', encoding="utf-8")
df

# extract only desired columns
selectlist = ['iso_code', 'glottocode', 'Name', 'latitude', 'longitude', 'genus', 'family', 'macroarea', 'countrycodes', '138A Tea']
df = df[selectlist]
# rename columns
df.columns = ['iso', 'glcode', 'name', 'lat', 'lon', 'genus', 'family', 'macroarea', 'country', 'tea']

# add level
df['level'] = "language"

# from
df['from'] = "wals1"

# sort by alphabetical order
df=df.sort_values('name')
df.reset_index(inplace=True, drop=True)

# write data out
wals1=df
wals1

Unnamed: 0,iso,glcode,name,lat,lon,genus,family,macroarea,country,tea,level,from
0,knw,kung1261,!Xun (Ekoka),-19.666667,18.000000,Ju-Kung,Kxa,Africa,NA AO,,language,wals1
1,nmn,xooo1239,!Xóõ,-24.000000,21.500000,Tu,Tu,Africa,BW,2 Words derived from Min Nan Chinese te,language,wals1
2,alu,area1240,'Are'are,-9.250000,161.166667,Oceanic,Austronesian,Papunesia,SB,,language,wals1
3,hnh,anii1246,//Ani,-18.916667,21.916667,Khoe-Kwadi,Khoe-Kwadi,Africa,BW,,language,wals1
4,xam,xamm1241,/Xam,-31.000000,21.000000,Tu,Tu,Africa,ZA,,language,wals1
...,...,...,...,...,...,...,...,...,...,...,...,...
2674,zul,zulu1248,Zulu (Southern),-31.000000,30.000000,Bantoid,Niger-Congo,Africa,ZA,,language,wals1
2675,zun,zuni1245,Zuni,35.083333,-108.833333,Zuni,Zuni,North America,US,,language,wals1
2676,jya,jiar1239,rGyalrong (Caodeng),31.666667,101.750000,rGyalrong,Sino-Tibetan,Eurasia,CN,,language,wals1
2677,eme,emer1243,Émérillon,3.166667,-52.416667,Tupi-Guaraní,Tupian,South America,GF,,language,wals1


In [None]:
# load in dataset (new)
df=pd.read_csv(path+'wals_languages.csv', delimiter=',', encoding="utf-8")

# extract only desired columns
selectlist = ['iso_codes', 'name', 'latitude', 'longitude', 'macroarea']
df = df[selectlist]
# rename columns
df.columns = ['iso', 'name', 'lat', 'lon', 'macroarea']

#add level
df['level'] = "language"

# from
df['from'] = "wals2"

# sort by alphabetical order
df=df.sort_values('name')
df.reset_index(inplace=True, drop=True)

# write data out
wals2=df
wals2

Unnamed: 0,iso,name,lat,lon,macroarea,level,from
0,alu,'Are'are,-9.250000,161.166667,Papunesia,language,wals2
1,hnh,//Ani,-18.916667,21.916667,Africa,language,wals2
2,huc,=|Hoan,-25.500000,25.000000,Africa,language,wals2
3,apq,A-Pucikwar,12.166667,92.833333,Eurasia,language,wals2
4,aiw,Aari,6.000000,36.583333,Africa,language,wals2
...,...,...,...,...,...,...,...
1995,tnc,Retuarã,-0.500000,-70.666667,South America,language,wals2
1996,ran,Riantana,-7.500000,138.500000,Papunesia,language,wals2
1997,jya,rGyalrong (Caodeng),31.666667,101.750000,Eurasia,language,wals2
1998,eme,Émérillon,3.166667,-52.416667,South America,language,wals2


In [None]:
# concatenate wals1 and wals2
frames = [wals1, wals2]
df = pd.concat(frames)

# sort by alphabetical order
df=df.sort_values(['name', 'family'])
df.reset_index(inplace=True, drop=True)

# drop duplicates
df = df.drop_duplicates(subset = ['name'], keep = 'first').reset_index(drop = True)

# write
wals=df
wals.to_csv(path+'wals.csv')
wals

Unnamed: 0,iso,glcode,name,lat,lon,genus,family,macroarea,country,tea,level,from
0,knw,kung1261,!Xun (Ekoka),-19.666667,18.000000,Ju-Kung,Kxa,Africa,NA AO,,language,wals1
1,nmn,xooo1239,!Xóõ,-24.000000,21.500000,Tu,Tu,Africa,BW,2 Words derived from Min Nan Chinese te,language,wals1
2,alu,area1240,'Are'are,-9.250000,161.166667,Oceanic,Austronesian,Papunesia,SB,,language,wals1
3,hnh,anii1246,//Ani,-18.916667,21.916667,Khoe-Kwadi,Khoe-Kwadi,Africa,BW,,language,wals1
4,xam,xamm1241,/Xam,-31.000000,21.000000,Tu,Tu,Africa,ZA,,language,wals1
...,...,...,...,...,...,...,...,...,...,...,...,...
2707,zul,zulu1248,Zulu (Southern),-31.000000,30.000000,Bantoid,Niger-Congo,Africa,ZA,,language,wals1
2708,zun,zuni1245,Zuni,35.083333,-108.833333,Zuni,Zuni,North America,US,,language,wals1
2709,jya,jiar1239,rGyalrong (Caodeng),31.666667,101.750000,rGyalrong,Sino-Tibetan,Eurasia,CN,,language,wals1
2710,eme,emer1243,Émérillon,3.166667,-52.416667,Tupi-Guaraní,Tupian,South America,GF,,language,wals1


In [None]:
# # search
# keyword = "Tea"
# wals[wals.eq(keyword).any(1)]

### Glottolog

In [None]:
# load in dataset 1
df=pd.read_csv(path+'languages_and_dialects_geo.csv', delimiter=',', encoding="utf-8")

# rename columns
df.columns = ['glcode', 'name', 'iso', 'level', 'macroarea', 'lat', 'lon']

# from
df['from'] = "glot"

#write
glot1=df
glot1

Unnamed: 0,glcode,name,iso,level,macroarea,lat,lon,from
0,3adt1234,3Ad-Tekles,,dialect,Africa,,,glot
1,aala1237,Aalawa,,dialect,Papunesia,,,glot
2,aant1238,Aantantara,,dialect,Papunesia,,,glot
3,aari1239,Aari,aiw,language,Africa,5.95034,36.5721,glot
4,aari1240,Aariya,aay,language,Eurasia,,,glot
...,...,...,...,...,...,...,...,...
21324,zuwa1238,Zuwadza,,dialect,Papunesia,,,glot
21325,zwal1238,Zwall,,dialect,Africa,,,glot
21326,zyph1238,Zyphe,zyp,language,Eurasia,22.52400,93.2640,glot
21327,zyud1238,Zyuzdin,,dialect,Eurasia,,,glot


In [None]:
# load in dataset 2
df=pd.read_csv(path+'languoid.csv', delimiter=',', encoding="utf-8")
df

# extract only desired columns
selectlist = ['name', 'iso639P3code', 'id', 'level', 'family_id', 'country_ids', 'latitude', 'longitude']
df = df[selectlist]

# rename columns
df.columns = ['name', 'iso', 'glcode', 'level', 'family', 'country', 'lat', 'lon']

# from
df['from'] = "glot"

#drop signlanguages, bookkeeping, unattested, artificial
df = df[df['family'] != 'sign1238']
df = df[df['family'] != 'book1242']
df = df[df['family'] != 'unat1236']
df = df[df['family'] != 'arti1236']

#manually fix families
df['family'] = [re.sub('sino1245', "Sino-Tibetan", str(x)) for x in df['family']]
df['family'] = [re.sub('drav1251', "Dravidian", str(x)) for x in df['family']]
df['family'] = [re.sub('atla1278', "Atlantic-Congo", str(x)) for x in df['family']]
df['family'] = [re.sub('aust1307', "Austronesian", str(x)) for x in df['family']]
df['family'] = [re.sub('indo1319', "Indo-European", str(x)) for x in df['family']]
df['family'] = [re.sub('afro1255', "Afro-Asiatic", str(x)) for x in df['family']]
df['family'] = [re.sub('aust1305', "Austroasiatic", str(x)) for x in df['family']]
df['family'] = [re.sub('nucl1709', "Nuclear Trans New Guinea", str(x)) for x in df['family']]
df['family'] = [re.sub('pama1250', "Pama-Nyungan", str(x)) for x in df['family']]
df['family'] = [re.sub('pama1250', "Pama-Nyungan", str(x)) for x in df['family']]
df['family'] = [re.sub('tupi1275', "Tupian", str(x)) for x in df['family']]
df['family'] = [re.sub('tuuu1241', "Tuu", str(x)) for x in df['family']]
df['family'] = [re.sub('ural1272', "Uralic", str(x)) for x in df['family']]
df['family'] = [re.sub('turk1311', "Turkic", str(x)) for x in df['family']]
df['family'] = [re.sub('khoe1240', "Khoe–Kwadi", str(x)) for x in df['family']]
df['family'] = [re.sub('lake1255', "Lakes Plain", str(x)) for x in df['family']]
df['family'] = [re.sub('japo1237', "Japonic", str(x)) for x in df['family']]
df['family'] = [re.sub('kxaa1236', "Kxa", str(x)) for x in df['family']]
df['family'] = [re.sub('utoa1244', "Uto-Aztecan", str(x)) for x in df['family']]
df['family'] = [re.sub('mong1349', "Mongolic-Khitan", str(x)) for x in df['family']]


# sort by alphabetical order
df=df.sort_values('name')
df.reset_index(inplace=True, drop=True)

# write data out
glot2=df
glot2

Unnamed: 0,name,iso,glcode,level,family,country,lat,lon,from
0,!Gã!ne,,gane1238,language,Tuu,ZA,-31.3200,28.7500,glot
1,!Ui,,kwii1241,family,Tuu,,,,glot
2,'Abd Al-Kuri,,abda1238,dialect,Afro-Asiatic,YE,12.1959,52.2282,glot
3,'Aden,,aden1242,dialect,Afro-Asiatic,,,,glot
4,'Algaden,,alga1234,dialect,Afro-Asiatic,,,,glot
...,...,...,...,...,...,...,...,...,...
25094,Ōita-ben,,oita1237,dialect,Japonic,,,,glot
25095,Ōmuta-ben,,omut1237,dialect,Japonic,,,,glot
25096,Ōsaka,,osak1237,dialect,Japonic,,,,glot
25097,Šumadija-Vojvodina,,suma1275,dialect,Indo-European,,,,glot


In [None]:
# merge
df = pd.merge(glot1, glot2, on=["name", "iso", "level", "glcode", "lat", "lon", "from"], how='outer')
print(df.shape)

# sort by alphabetical order
df=df.sort_values(['name', 'family'])
df.reset_index(inplace=True, drop=True)

# write
glot=df
glot.to_csv(path+'glot.csv')
glot

(25833, 10)


Unnamed: 0,glcode,name,iso,level,macroarea,lat,lon,from,family,country
0,gane1238,!Gã!ne,,language,Africa,-31.3200,28.7500,glot,Tuu,ZA
1,oung1238,!O!ung,oun,language,Africa,-15.3000,14.3500,glot,,
2,kwii1241,!Ui,,family,,,,glot,Tuu,
3,abda1238,'Abd Al-Kuri,,dialect,Africa,12.1959,52.2282,glot,Afro-Asiatic,YE
4,aden1242,'Aden,,dialect,Eurasia,,,glot,Afro-Asiatic,
...,...,...,...,...,...,...,...,...,...,...
25828,oita1237,Ōita-ben,,dialect,Eurasia,,,glot,Japonic,
25829,omut1237,Ōmuta-ben,,dialect,Eurasia,,,glot,Japonic,
25830,osak1237,Ōsaka,,dialect,Eurasia,,,glot,Japonic,
25831,suma1275,Šumadija-Vojvodina,,dialect,Eurasia,,,glot,Indo-European,


In [None]:
# keyword = "Eastern Panjabi"
# glot[glot.eq(keyword).any(1)]

### Concat

In [None]:
#concatenate wals and glottolog
frames = [wals, glot]
df = pd.concat(frames)
print(df.shape)

# sort by alphabetical order
df=df.sort_values(['name', 'family'])
df.reset_index(inplace=True, drop=True)

#drop duplicates
df.drop_duplicates(subset=['glcode', 'name'], keep='first', inplace=True, ignore_index=True)
print(df.shape)

#drop if no code
# df = df[df['iso'].notna()]
# print(df.shape)

# reorder columns
df = df[['iso', 'glcode', 'name', 'level', 'genus', 'family', 'macroarea', 'country', 'lat', 'lon', 'from']]

#rename
df = df.rename(columns={"name": "language", "genus":"branch"})
df

# write data out
languages = df
languages.to_csv(path+'languages.csv')
languages

(28545, 12)
(27266, 12)


Unnamed: 0,iso,glcode,language,level,branch,family,macroarea,country,lat,lon,from
0,,gane1238,!Gã!ne,language,,Tuu,Africa,ZA,-31.320000,28.75,glot
1,oun,oung1238,!O!ung,language,,,Africa,,-15.300000,14.35,glot
2,,kwii1241,!Ui,family,,Tuu,,,,,glot
3,knw,kung1261,!Xun (Ekoka),language,Ju-Kung,Kxa,Africa,NA AO,-19.666667,18.00,wals1
4,nmn,xooo1239,!Xóõ,language,Tu,Tu,Africa,BW,-24.000000,21.50,wals1
...,...,...,...,...,...,...,...,...,...,...,...
27261,,oita1237,Ōita-ben,dialect,,Japonic,Eurasia,,,,glot
27262,,omut1237,Ōmuta-ben,dialect,,Japonic,Eurasia,,,,glot
27263,,osak1237,Ōsaka,dialect,,Japonic,Eurasia,,,,glot
27264,,suma1275,Šumadija-Vojvodina,dialect,,Indo-European,Eurasia,,,,glot


In [None]:
# #checking
# so = languages[languages['language'] == 'New Latin']
# so

In [None]:
# remove duplicates by glcode, but which one? # filter duplicates and get rid of plant one if found
# df['duplicate'] = df[df.duplicated(subset=['name'], keep=False)]['from']=="wals" # change subset to be more strict e.g. subset=['language', 'term']
# df.drop(df[df['duplicate'] == True].index, inplace=True)
# df.drop(['duplicate'], axis=1, inplace=True)
# df.reset_index(inplace=True, drop=True)
# df

In [None]:
# #checking
# languages=languages.sample(20000)
# languages

### Augment dataset

In [None]:
#add coordinates by hand
languages.loc[languages.language == 'Arabic', ['lat', 'lon']] = 27.96, 43.85 # Coordinates of Standard Arabic, OK
languages.loc[languages.language == 'West Germanic', ['lat', 'lon']] = 48, 9
languages.loc[languages.language == 'Semitic', ['lat', 'lon']] = 29, 40
languages.loc[languages.language == 'Anglo-Norman', ['lat', 'lon']] = 51, -1
languages.loc[languages.language == 'Romance', ['lat', 'lon']] = 44, 12
languages.loc[languages.language == 'Medieval Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
languages.loc[languages.language == 'Late Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
languages.loc[languages.language == 'Germanic', ['lat', 'lon']] = 48.25, 10.60 #Middle High German
languages.loc[languages.language == 'New Latin', ['lat', 'lon']] = 46.2, 1.3 # Western Europe coordinates
languages.loc[languages.language == 'Slavic', ['lat', 'lon']] = 43.72, 22.84 #Church Slavic???
languages.loc[languages.language == 'Brunei Malay', ['lat', 'lon']] = 4.53, 114.72 #Coordinates of Brunei, OK?
languages.loc[languages.language == 'Fante', ['lat', 'lon']] = 6.35, -1.33 #Coordinates of hyperglot Akan, OK
languages.loc[languages.language == 'Twi', ['lat', 'lon']] = 6.35, -1.33 #Coordinates of hyperglot Akan, OK
languages.loc[languages.language == 'Mongolian', ['lat', 'lon']] = 48.32, 106.29 #Coordinates of hypoglot, Halh Mongolian (most spoken dialect) OK
languages.loc[languages.language == 'Volapük', ['lat', 'lon']] = 49, 8.24 #Coordinates of Karlsrühe, Baden OK
languages.loc[languages.language == 'Luxemburgish', ['lat', 'lon']] = 49.68, 6.15 #Coordinates of hyperglot, Moselle Franconian OK
languages.loc[languages.language == 'Armenian', ['lat', 'lon']] = 49.68, 6.15 #Coordinates of hyperglot, Moselle Franconian OK
languages.loc[languages.language == 'Dravidian', ['lat', 'lon']] = 11, 78 #Coordinates of Old Tamil
languages.loc[languages.language == 'Aramaic', ['lat', 'lon']] =33.91, 42.19 # Coordinates of Imperial Aramaic (700-300 BCE)
languages.loc[languages.language == 'Turkic', ['lat', 'lon']] =55.49, 47.16 # Coordinates of Chuvash, cognate of bors.
languages.loc[languages.language == 'Iranian', ['lat', 'lon']] =39.70, 66.98 # Coordinates of Sogdian, cognate bors
# modify names
languages.loc[languages.language == 'Old English (ca. 450-1100)', ['language']] = 'Old English'
languages.loc[languages.language == 'Old French (842-ca. 1400)', ['language']] = 'Old French'
languages.loc[languages.language == 'Jewish Babylonian Aramaic (ca. 200-1200 CE)', ['language']] = 'Jewish Babylonian Aramaic'
# languages.loc[languages.language == 'Pahlavi', ['language']] = 'Middle Persian'
# Late Middle Indo-Aryan in the new instead of Middle Indo Aryan

languages

Unnamed: 0,iso,glcode,language,level,branch,family,macroarea,country,lat,lon,from
0,,gane1238,!Gã!ne,language,,Tuu,Africa,ZA,-31.320000,28.75,glot
1,oun,oung1238,!O!ung,language,,,Africa,,-15.300000,14.35,glot
2,,kwii1241,!Ui,family,,Tuu,,,,,glot
3,knw,kung1261,!Xun (Ekoka),language,Ju-Kung,Kxa,Africa,NA AO,-19.666667,18.00,wals1
4,nmn,xooo1239,!Xóõ,language,Tu,Tu,Africa,BW,-24.000000,21.50,wals1
...,...,...,...,...,...,...,...,...,...,...,...
27261,,oita1237,Ōita-ben,dialect,,Japonic,Eurasia,,,,glot
27262,,omut1237,Ōmuta-ben,dialect,,Japonic,Eurasia,,,,glot
27263,,osak1237,Ōsaka,dialect,,Japonic,Eurasia,,,,glot
27264,,suma1275,Šumadija-Vojvodina,dialect,,Indo-European,Eurasia,,,,glot


In [None]:
# add language
languages.loc[-1] = ["dra", "drav1251", "Proto-Dravidian", "proto-language", np.NaN, "Dravidian", "Asia", "IN SL", 11, 78, "manual"]  # adding a row
languages.index = languages.index + 1  # shifting index
languages.loc[-1] = ["pii", "", "Proto-Iranian", "proto-language", np.NaN, "Indo-European", "Eurasia", np.NaN, 45, 65, "manual"]  # Loc of Andronovo culture
languages.index = languages.index + 1  # shifting index
languages.loc[-1] = ["pst", "sino1245", "Proto-Sino-Tibetan", "proto-language", np.NaN, "Sino-Tibetan", "Asia", np.NaN, 24, 100, "manual"]  # Loc of Andronovo culture
languages.index = languages.index + 1  # shifting index
languages = languages.sort_index()  # sorting by index
languages

Unnamed: 0,iso,glcode,language,level,branch,family,macroarea,country,lat,lon,from
0,pst,sino1245,Proto-Sino-Tibetan,proto-language,,Sino-Tibetan,Asia,,24.00,100.00,manual
1,pii,,Proto-Iranian,proto-language,,Indo-European,Eurasia,,45.00,65.00,manual
2,dra,drav1251,Proto-Dravidian,proto-language,,Dravidian,Asia,IN SL,11.00,78.00,manual
3,,gane1238,!Gã!ne,language,,Tuu,Africa,ZA,-31.32,28.75,glot
4,oun,oung1238,!O!ung,language,,,Africa,,-15.30,14.35,glot
...,...,...,...,...,...,...,...,...,...,...,...
27264,,oita1237,Ōita-ben,dialect,,Japonic,Eurasia,,,,glot
27265,,omut1237,Ōmuta-ben,dialect,,Japonic,Eurasia,,,,glot
27266,,osak1237,Ōsaka,dialect,,Japonic,Eurasia,,,,glot
27267,,suma1275,Šumadija-Vojvodina,dialect,,Indo-European,Eurasia,,,,glot


In [None]:
# keyword = "Proto-Dravidian"
# languages[languages.eq(keyword).any(1)]

# #checking
# so = languages[languages['iso'] == 'dra']
# so


# % ,language,iso,glcode,family,branch,macroarea,country,lat,lon,timespan
# % 1,Sanskrit,san,sans1269,Indo-European,Indic,Eurasia,IN,20.0,77.0,-1500-01-01/-1100-01-01
# % 2,Middle Indo-Aryan,pra,midd1350,Indo-European,Indic,Eurasia,IN,26.61,78.97,0500-01-01/1300-01-01
# % 3,Pahlavi,pal,pahl1241,Indo-European,Iranian,Eurasia,IR,30.06,52.7,0224-01-01/0650-01-01


### Save dataset

In [None]:
languages.to_csv(path + 'languages.csv')

## Wiktionary extractor

### Multilingual extractions

This pipeline takes an input of a txt file, that contains the data manually copied from a Wiktionary translations section. The tool will clean the data using regular expressions, and organize it in a neat csv, ready for plotting or further analysis.

In [304]:
key = "cinnamon"

In [305]:
#open txt and seperate every line
df = open(path_in + "wiktionary/" + key + '_spice.txt', "r", encoding = 'utf8')
lines = df.readlines()
df.close()

# remove /n at the end of each line
for index, line in enumerate(lines):
      lines[index] = line.strip()

#creating a dataframe(consider u want to convert your data to 2 columns)
df = pd.DataFrame(columns=('first', 'second'))
i = 0  
first = "" 
second = ""  
for line in lines:
        #you have to kind of define what are the values in columns,for example second column includes:
        second = re.sub(r'', "", line)
        #this is how you create next line data
        df.loc[i] = [first, second]
        i =i+1


df['second'].replace('', np.nan, inplace=True)
df.dropna(axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df.drop(['first'], axis=1, inplace=True)
df=df[df.second.str.contains(":", regex=True)]
df = pd.DataFrame(df.second.str.split(':', 1).tolist(), columns = ['language','term'])

df['sense'] = 'spice'

#fill empty cells backward (i.e fill the parent language content with value from a variant if the former is empty)
df.term[df['term']==""] = np.NaN
df['term'] =  df.term.str.extract('(.*)').fillna(method='bfill')

# df = df.sort_values('language')

spice = df
print(spice)

           language                                             term  sense
0         Afrikaans                                           kaneel  spice
1          Albanian                                   kanellë (sq) f  spice
2           Amharic                                     ቀረፋ (ḳäräfa)  spice
3            Arabic                               قِرْفَة‎ f (qirfa)  spice
4   Egyptian Arabic                                  قرفة‎ f (ʾerfa)  spice
..              ...                                              ...    ...
88            Uzbek                            dolchin (uz), koritsa  spice
89       Vietnamese                                         quế (vi)  spice
90          Volapük                                           kirfat  spice
91            Welsh                                        synamon m  spice
92          Yiddish   צימרינג‎ m (tsimring), צימערינג‎ m (tsimering)  spice

[93 rows x 3 columns]


In [306]:
path = path_in + "wiktionary/" + key + '_plant.txt'

if os.path.exists(path):

      #open txt and seperate every line
      df = open(path_in + "wiktionary/" + key + '_plant.txt', "r", encoding = 'utf8')
      lines = df.readlines()
      df.close()

      # remove /n at the end of each line
      for index, line in enumerate(lines):
            lines[index] = line.strip()

      #creating a dataframe(consider u want to convert your data to 2 columns)
      df = pd.DataFrame(columns=('first', 'second'))
      i = 0 
      first = "" 
      second = ""  
      for line in lines:
            #you have to kind of define what are the values in columns,for example second column includes:
            second = re.sub(r'', "", line)
            #this is how you create next line data
            df.loc[i] = [first, second]
            i =i+1

      df['second'].replace('', np.nan, inplace=True)
      df.dropna(axis=0, inplace=True)
      df.reset_index(drop=True, inplace=True)
      df.drop(['first'], axis=1, inplace=True)
      df=df[df.second.str.contains(":", regex=True)]
      df = pd.DataFrame(df.second.str.split(':', 1).tolist(), columns = ['language','term'])

      df['sense'] = 'plant'

      #fill empty cells backward (i.e fill the parent language content with value from a variant)
      df.term[df['term']==""] = np.NaN
      df['term'] =  df.term.str.extract('(.*)').fillna(method='bfill')
      # df = df.sort_values('language')

      plant = df
      print(plant)

           language                                   term  sense
0         Afrikaans                             kaneelboom  plant
1            Arabic                     قِرْفَة‎ f (qirfa)  plant
2     Hijazi Arabic                          قرفة‎ (girfa)  plant
3   Moroccan Arabic                          قرفة‎ (qarfa)  plant
4           Aramaic   ܕܪܨܝܢܝ‎ (dārṣīnī), ܨܝܢܕܪܓ‎ (ṣīndreḡ)  plant
..              ...                                    ...    ...
61             Thai                       อบเชย (òp-chəəi)  plant
62          Tibetan                     ཤིང་ཚ (shing tsha)  plant
63          Turkish                           tarçın ağacı  plant
64       Vietnamese                                cây quế  plant
65          Volapük                   kirfatep, kirfatabim  plant

[66 rows x 3 columns]


In [307]:
# concat
frames = [spice, plant]
df = pd.concat(frames)

# sort by categories
df['sense'] = pd.Categorical(df['sense'], ["spice", "plant"]) # add categorical order here
df.sort_values("sense", inplace = True) # sort according to the categories

# sort
df = df.sort_values('language')

# drop duplicates
df = df.drop_duplicates(subset = ['language', 'term'], keep = 'first').reset_index(drop = True)

# reset index
df.reset_index(inplace=True, drop=True)

# filter duplicates and get rid of plant name if spice name found
df['duplicate'] = df[df.duplicated(subset=['language'], keep=False)]['sense']=="plant" # change subset to be more strict e.g. subset=['language', 'term']
df.drop(df[df['duplicate'] == True].index, inplace=True)
df.drop(['duplicate'], axis=1, inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,language,term,sense
0,Afrikaans,kaneel,spice
1,Albanian,kanellë (sq) f,spice
2,Amharic,ቀረፋ (ḳäräfa),spice
3,Ancient,κιννάμωμον n (kinnámōmon),spice
4,Arabic,قِرْفَة‎ f (qirfa),spice
...,...,...,...
104,Uzbek,"dolchin (uz), koritsa",spice
105,Vietnamese,quế (vi),spice
106,Volapük,kirfat,spice
107,Welsh,synamon m,spice


In [308]:
# cleaning
df['term'] = [re.sub(r' m ', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' f ', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' n ', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' c ', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' pl ', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' m,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r' f,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r' n,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r' c,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r' pl,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r' m$', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r' f$', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r' n$', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r' c$', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r' pl$', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(bcl\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(nds\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(scn\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(ast\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(Föhr-Amrum\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\s?\(\w\w\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\(please verify\)', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\s+', " ", str(x)) for x in df['term']]
df['term'] = [re.sub(r' ,', ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r'^\s', "", str(x)) for x in df['term']]
df['term'] = [re.sub(r'\s$', "", str(x)) for x in df['term']]

# transliteration, formatting
df['transliteration'] = df.term.str.findall("(\((\w*\,?\.?\-?\:?\d?\s?)+\))").fillna(method='ffill')
df['transliteration'] = [re.sub(r"[\(\)\[\]]", "", str(x)) for x in df['transliteration']]
df['transliteration'] = [re.sub(r", ''", "", str(x)) for x in df['transliteration']]
df['transliteration'] = [re.sub(r"^'", "", str(x)) for x in df['transliteration']]
df['transliteration'] = [re.sub(r"'$", "", str(x)) for x in df['transliteration']]
df['transliteration'] = [re.sub(r"', '", ", ", str(x)) for x in df['transliteration']]

# clean term of transliteration
df['term'] = [re.sub(r"(\((\w*\,?\.?\-?\:?\d?\s?)+\))", "", str(x)) for x in df['term']] 
df['term'] = [re.sub(r" +,", ",", str(x)) for x in df['term']]
df['term'] = [re.sub(r" *$", "", str(x)) for x in df['term']] #!


# other
df['transliteration'] = [re.sub(r", taraškievica", "", str(x)) for x in df['transliteration']]

# drop NA
df = df[df.term != "please add this translation if you can"]

# create 'item' column where everythin is in
df['item'] = df['transliteration']
df['item'] = df['item'].replace('', pd.NA).fillna(df['term'])
df['source zotero'] = 'Wiktionary'

# reorder
df = df[['language', 'term', 'transliteration', 'item', 'sense']]

# Change languages to glottolog name
df['language'] = [re.sub(r"^Arabic$", "Standard Arabic", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Mandarin$", "Mandarin Chinese", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Old Armenian$", "Classical-Middle Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Croatian$", "Croatian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Serbian$", "Serbian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Bosnian$", "Bosnian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Serbo-Croatian$", "Serbian-Croatian-Bosnian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Gaelic$", "Scottish Gaelic", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Greek$", "Modern Greek", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Punjabi$", "Eastern Panjabi", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Norman$", "Anglo-Norman", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Bikol Central$", "Coastal-Naga Bikol", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Armenian (Eastern)$", "Eastern Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Armenian (Western)$", "Western Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Manipuri (Meitei-Lon)$", "Manipuri", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Sumi)$", "Sumi Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Tangkhul)$", "North-Central Tangkhul Naga", str(x)) for x in df['language']] # A hypoglot
df['language'] = [re.sub(r"^Naga (Rengma)$", "Northern Rengma Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Lotha)$", "Lotha Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Konyak)$", "Konyak Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Chakhesang-Chokri)$", "Chokri Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Ao)$", "Ao Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Angami)$", "Angami Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Kyrgyz$", "Kirghiz", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Hebrew$", "Modern Hebrew", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Northern Sami$", "North Sami", str(x)) for x in df['language']]



#write
df.to_excel(path_in + "wiktionary/" + key + '_generated.xlsx', sheet_name='wiktionary', index=None, encoding="utf-8")
df

Unnamed: 0,language,term,transliteration,item,sense
0,Afrikaans,kaneel,,kaneel,spice
1,Albanian,kanellë,,kanellë,spice
2,Amharic,ቀረፋ,ḳäräfa,ḳäräfa,spice
3,Ancient,κιννάμωμον,kinnámōmon,kinnámōmon,spice
4,Standard Arabic,قِرْفَة‎,qirfa,qirfa,spice
...,...,...,...,...,...
104,Uzbek,"dolchin, koritsa",,"dolchin, koritsa",spice
105,Vietnamese,quế,,quế,spice
106,Volapük,kirfat,,kirfat,spice
107,Welsh,synamon,,synamon,spice


Now the manual work: Use `spice_out.xlsx` to fix, amend, append, group, and organize the names, with the addition of other sources and create a `spice.xlsx` master list.

Recommended sources:
* Katzer (needs serious checking)
* WOLD
* Max Planck databases (CLIC3, etc.)
* others... 

After that, more preprocessing, cleaning, and merging with language data and coordinates.

In [309]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in + "wiktionary/" + key +".xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in + "wiktionary/" + key + ".csv", index = None, header=True)

# Load in dataset
df=pd.read_csv(path_in + "wiktionary/" + key + ".csv", header =[0], delimiter=',', encoding="utf-8")

# extract only desired columns
selectlist = ['skip', 'language', 'term', 'transliteration', 'item', 'group']
df = df[selectlist]

print("Before skipping: ", df.shape)
df=df[df['skip'] != "yes"]
print("After skipping: ", df.shape)

# # drop columns manually
# df.drop(columns=['skip', 'literal', 'explanation', 'IPA', 'source zotero', 'notes', 'type', 'katzer', 'katzer tr', 'checked', 'reference', 'link'], inplace=True)

# Change languages to glottolog name
df['language'] = [re.sub(r"^Arabic$", "Standard Arabic", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Mandarin$", "Mandarin Chinese", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Old Armenian$", "Classical-Middle Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Croatian$", "Croatian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Serbian$", "Serbian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Bosnian$", "Bosnian Standard", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Serbo-Croatian$", "Serbian-Croatian-Bosnian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Gaelic$", "Scottish Gaelic", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Greek$", "Modern Greek", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Punjabi$", "Eastern Panjabi", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Norman$", "Anglo-Norman", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Bikol Central$", "Coastal-Naga Bikol", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Armenian (Eastern)$", "Eastern Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Armenian (Western)$", "Western Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Manipuri (Meitei-Lon)$", "Manipuri", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Sumi)$", "Sumi Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Tangkhul)$", "North-Central Tangkhul Naga", str(x)) for x in df['language']] # A hypoglot
df['language'] = [re.sub(r"^Naga (Rengma)$", "Northern Rengma Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Lotha)$", "Lotha Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Konyak)$", "Konyak Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Chakhesang-Chokri)$", "Chokri Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Ao)$", "Ao Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Naga (Angami)$", "Angami Naga", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Kyrgyz$", "Kirghiz", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Hebrew$", "Modern Hebrew", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^North Sami$", "North Saami", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Greenlandic$", "Kalaallisut", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^West Frisian$", "Western Frisian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Old Javanese$", "Kawi", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Old Church Slavonic$", "Church Slavic", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^North Frisian$", "Northern Frisian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Luxembourgish$", "Luxemburgish", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Armenian$", "Eastern Armenian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Old East Slavic$", "Old Russian", str(x)) for x in df['language']]
df['language'] = [re.sub(r"^Ossetian$", "Modern Ossetic", str(x)) for x in df['language']]

inpt = df

# load in datasets
languages=pd.read_csv(path_in + 'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8", index_col=[0])

# merge input and languages
df = pd.merge(inpt, languages, on=['language'])
print("Merged:", df.shape)

#drop duplicates
df.drop_duplicates(subset=['language', 'term'], keep='first', inplace=True, ignore_index=True)
print("Dropping duplicates:", df.shape)

multilingual = df
multilingual

#check missing ones
temp = pd.merge(inpt, multilingual, how='outer', suffixes=('','_y'), indicator=True)
missing = temp[temp['_merge']=='left_only'][inpt.columns]
print("The following terms and languages have failed to load:")
print(missing)

# df = df.dropna() #OPERATIVE ONLY
# df = df.fillna('x')

#sort by categories, cinnamon ######## AUTOMATE ########
df['group'] = pd.Categorical(df['group'], ["canela", "kinnamon", "korica", "qirfa", "darchin", "gui", "other"]) # add categorical order here
df.sort_values("group", inplace = True) # sort according to the categories

# #sort by categories, pepper ######## AUTOMATE ########
# df['group'] = pd.Categorical(df['group'], ["pippali", "pigment", "marica", "hujiao", "other"]) # add categorical order here
# df.sort_values("group", inplace = True) # sort according to the categories

# create text for annotation label
df['text'] = df['term'] + '<br>' + df['transliteration'].astype(str) + '<br>Language: ' + df['language'] + '<br>Family: ' + df['family']
df['text'] = [re.sub(r"<br>nan<br>", "<br>", str(x)) for x in df['text']]

df['term'] = [re.sub(r"\u200e", "", str(x)) for x in df['term']] #removes right to left mark
df['term'] = [re.sub(r" *$", "", str(x)) for x in df['term']] #!

# reindex?

# save
df.to_csv(path_in + "multilingual/" + key +'.csv')
df

Before skipping:  (162, 6)
After skipping:  (148, 6)
Merged: (155, 16)
Dropping duplicates: (148, 16)
The following terms and languages have failed to load:
Empty DataFrame
Columns: [skip, language, term, transliteration, item, group]
Index: []


Unnamed: 0,skip,language,term,transliteration,item,group,iso,glcode,level,branch,family,macroarea,country,lat,lon,from,text
0,,Afrikaans,kaneel,,kaneel,canela,afr,afri1274,language,Germanic,Indo-European,Africa,ZA,-31.00000,22.00000,wals1,kaneel<br>Language: Afrikaans<br>Family: Indo-...
36,,Estonian,kaneel,,kaneel,canela,est,esto1258,language,Finnic,Uralic,Eurasia,EE,59.00000,26.00000,wals1,kaneel<br>Language: Estonian<br>Family: Uralic
37,,Faroese,kanel,,kanel,canela,fao,faro1244,language,Germanic,Indo-European,Eurasia,DK,62.00000,-7.00000,wals1,kanel<br>Language: Faroese<br>Family: Indo-Eur...
38,,Finnish,kaneli,,kaneli,canela,fin,finn1318,language,Finnic,Uralic,Eurasia,FI,62.00000,25.00000,wals1,kaneli<br>Language: Finnish<br>Family: Uralic
39,,French,cannelle,,cannelle,canela,fra,stan1290,language,Romance,Indo-European,Eurasia,CH FR,48.00000,2.00000,wals1,cannelle<br>Language: French<br>Family: Indo-E...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,,Sumi Naga,"losani, akusa",,"losani, akusa",other,nsm,sumi1235,language,,Sino-Tibetan,Eurasia,IN,25.99960,94.42350,glot,"losani, akusa<br>Language: Sumi Naga<br>Family..."
12,,Brunei Malay,kayu manis,,kayu manis,other,,brun1243,dialect,,Austronesian,Papunesia,,4.53000,114.72000,glot,kayu manis<br>Language: Brunei Malay<br>Family...
146,,Yoruba,eso igi gbigbẹ oloorun,,eso igi gbigbẹ oloorun,other,yor,yoru1245,language,,Atlantic-Congo,Africa,BJ NG,7.15345,3.67225,glot,eso igi gbigbẹ oloorun<br>Language: Yoruba<br>...
71,,Lao,ອົບເຊຍ,obsey,obsey,other,lao,laoo1244,language,Kam-Tai,Tai-Kadai,Eurasia,LA TH,18.00000,103.00000,wals1,ອົບເຊຍ<br>obsey<br>Language: Lao<br>Family: Ta...


# Analysis

## Analysis of spices

In [310]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

df_spices = df_spices.loc[df_spices['include'] == 'in'] # include ones to include

df = df_spices

#### Geographical distribution

In [311]:
# Plots to count things
series = df['macroarea'].value_counts()
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['area', 'total']
# print(df_result)
# df_result.loc[5:, 'language'] = 'other' # Replace everything with 'others' after the fifth!


In [312]:
# Variables
font_size = 40
hole_size = 0.45


fig = px.pie(df_result,
             values='total',
             names='area',
            #  color_discrete_sequence=[p1,p3,p5,p7,p9],
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             color='area',
            #  sort = True,
             hole=hole_size,
            #  title='Ratio of spice names'
             template='odd',
             )

# fig = go.Figure(data=[go.Pie(labels=df_result.language, 
#                               values=df_result.total, 
#                               direction ='clockwise', 
#                               hole=.4, 
#                               marker_colors=[p2,p6,p4], 
#                               sort=True)])

fig.update_traces(textinfo='value',
                  direction ='clockwise',
                  sort=False)

fig.update_layout(
    width = 600, height=700,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend=dict(
    #     bgcolor=half_transparent,
    #     y=0, x=0, xanchor="left", yanchor="bottom", 
    #     # traceorder = 'normal', 
    #     # orientation="h"
    #     ))
    legend=dict(
        bgcolor=half_transparent,
        y=1, x=0, xanchor="left", yanchor="bottom", 
        # traceorder = 'normal', 
        orientation="h"
        ))

fig.update_layout(annotations=[dict(text=str(df.shape[0]), x=0.5, y=0.5, font_size=font_size, showarrow=False)])
                               
# fig.show()
# write and download
filename="macroarea_pie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.show()                  
# write and download
filename="macroarea_pie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [313]:
# Spices by total no. of regions
df_spices.sort_values(["total regions"], inplace = True, ascending=False)

# Variables
font_size = 24

fig = px.bar(df, 
             x="id", y="total regions", 
            #  color="macroarea", 
            #  text="language",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             template="plotly_white",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


colors = ['lightslategray',] * 24
# colors[1] = 'crimson'

fig.update_traces(
                 marker_color=colors
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=300,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="total_regions"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [314]:
# Spices by total no. of regions, grouped by macroarea
df.sort_values(["total regions"], inplace = True, ascending=False)

# Variables
font_size = 24
font_color = "black"
font_family = "Serif"
half_transparent = 'rgba(255,255,255,0.5)'

fig = px.bar(df,
             x="id", y="total regions", 
             color="macroarea", 
            #  text="language",
             template="plotly_white+odd",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


# colors = ['lightslategray',] * 24
# # colors[1] = 'crimson'

# fig.update_traces(
#                  marker_color=colors
#                   )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=350,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="total_regions_by_macroarea"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [315]:
# Spices by total no. of regions, grouped by family
df.sort_values(["total regions"], inplace = True, ascending=False)

# Variables
font_size = 24

fig = px.bar(df, 
             x="id", y="total regions", 
             color="family", 
            #  text="language",
             color_discrete_sequence = prism_extended,
             template="plotly_white",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


# colors = ['lightslategray',] * 24
# # colors[1] = 'crimson'

# fig.update_traces(
#                  marker_color=colors
#                   )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="total_regions_by_family"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

#### Spreadability

In [235]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

df_spices = df_spices.loc[df_spices['include'] == 'in'] # include ones to include

df = df_spices

df.sort_values(["spreadability"], inplace = True, ascending=False)


In [236]:
# Variables
font_size = 24

fig = px.bar(df, 
             x="id", y="spreadability", 
            #  color="family", 
            #  text="language",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             template="plotly_white",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


colors = ['lightslategray',] * 24
# colors[1] = 'crimson'

fig.update_traces(
                 marker_color=colors
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=300,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="spreadability"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [237]:
# Variables
font_size = 24

fig = px.bar(df, 
             x="id", y="spreadability", 
             color="macroarea", 
            #  text="language",
             template="plotly_white+odd",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


# colors = ['lightslategray',] * 24
# colors[1] = 'crimson'

fig.update_traces(
                #  marker_color=colors
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=350,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="spreadability_by_macroarea"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [238]:
# Variables
font_size = 24

fig = px.bar(df, 
             x="id", y="spreadability", 
             color="family", 
            #  text="language",
             color_discrete_sequence = prism_extended,
             template="plotly_white",
             hover_name='id',
             hover_data={'species':True, 'region of origin':True, 'native regions':False, 'no. of native regions':True, 'introduced regions':False, 'no. of introduced regions':True, 'spreadability':True},
            #  labels = dict(lang="Language",total="Times")
             )


# colors = ['lightslategray',] * 24
# colors[1] = 'crimson'

fig.update_traces(
                #  marker_color=colors
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)


fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside", tickangle = 270)
fig.update_yaxes(ticklabelposition="inside")
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 1000, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    # legend=dict(
        # bgcolor=half_transparent,
        # x=0, y=1, xanchor="right", yanchor="top",
        # font = dict(size=font_size-2),
        # title = "Spreadability",
        # # traceorder = 'normal', 
        # orientation="v"
        # ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="spreadability_by_family"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

## Analysis of names

### Statistics

In [239]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#### Languages

In [240]:
names_total = df.shape[0]

# Count things
names_en = df[df.language == 'English'].shape[0]
names_ar = df[df.language == 'Arabic'].shape[0]
names_zh = df[df.language == 'Chinese'].shape[0]

print("Total:\t\t",names_total)
print(df.language.value_counts())
so = "As a result of the data collection set forth in \cref{ch:data}, the spice name dataset now contains "+str(names_total)+" spice names. Of these, "+str(names_en)+" are in English, "+str(names_ar)+" are in Arabic, and "+str(names_zh)+" are in Chinese; \cref{fig:languages_pie} shows this distribution."
so


Total:		 360
English    154
Chinese    120
Arabic      86
Name: language, dtype: int64


'As a result of the data collection set forth in \\cref{ch:data}, the spice name dataset now contains 360 spice names. Of these, 154 are in English, 86 are in Arabic, and 120 are in Chinese; \\cref{fig:languages_pie} shows this distribution.'

In [241]:
# Plots to count things
series = df['language'].value_counts()
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
# print(df_result)
# df_result.loc[5:, 'language'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 40
hole_size = 0.45

fig = px.pie(df_result,
             values='total',
             names='language',
             color_discrete_sequence=[p2,p6,p4],
            #  sort = True,
             hole=hole_size,
            #  title='Ratio of spice names'
             )

# fig = go.Figure(data=[go.Pie(labels=df_result.language, 
#                               values=df_result.total, 
#                               direction ='clockwise', 
#                               hole=.4, 
#                               marker_colors=[p2,p6,p4], 
#                               sort=True)])

fig.update_traces(textinfo='value',
                  direction ='clockwise',
                  sort=False)

fig.update_layout(
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend=dict(
    #     bgcolor=half_transparent,
    #     y=0, x=0, xanchor="left", yanchor="bottom", 
    #     # traceorder = 'normal', 
    #     # orientation="h"
    #     ))
    legend=dict(
        bgcolor=half_transparent,
        y=1, x=0, xanchor="left", yanchor="bottom", 
        # traceorder = 'normal', 
        orientation="h"
        ))

fig.update_layout(annotations=[dict(text=str(names_total), x=0.5, y=0.5, font_size=font_size, showarrow=False)])
                               
# fig.show()
# write and download
filename="languages_pie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.update_layout(
    width = 600, height=600,
    )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

### Items

In [242]:
print(len(df.id.value_counts()))
print(df.id.value_counts())
print(df.id.value_counts().mean())
print(df.id.value_counts().max())
print(df.id.value_counts().min())

# print(len(df.category.value_counts()))
# print(df.category.value_counts())
# print(df.category.value_counts().mean())
# print(df.category.value_counts().max())
# print(df.category.value_counts().min())

26
chile             44
Sichuan pepper    30
cassia            25
false cardamom    21
allspice          19
cumin             15
asafoetida        15
pepper            14
star anise        14
fennel            14
black cardamom    13
turmeric          13
cardamom          12
coriander         12
anise             12
cinnamon          11
caraway           11
ginger            11
saffron           11
long pepper       10
vanilla            7
nutmeg             6
dill               6
clove              5
mace               4
fenugreek          4
Name: id, dtype: int64
13.807692307692308
44
4


#### Top 10 and Bottom 10 Items

In [243]:
# Plots to count things
series = df['id'].value_counts()
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
# print(df_result)
df_result.loc[10:, 'id'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 28
hole_size = 0.45

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

fig = go.Figure(data=[go.Pie(labels=df_result.id, 
                              values=df_result.total, 
                              direction ='clockwise', 
                              hole=hole_size, 
                              marker_colors=px.colors.qualitative.Prism,
                              # marker_colors=[p1,p2,p3,p4,p5,p11],
                              sort=False)])

fig.update_traces(textinfo='value')

fig.update_layout(
    # width = 500, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=32),
        # traceorder = 'normal', 
        orientation="v",
        ))

# fig.show()
# write and download
filename="ids_top_pie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

# fig.update_layout(
#     
#     )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [244]:
# Plots to count things
series = df['id'].value_counts()
series = series.sort_values(ascending=True)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
# print(df_result)
df_result.loc[10:, 'id'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 28
hole_size = 0.45

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

fig = go.Figure(data=[go.Pie(labels=df_result.id, 
                              values=df_result.total, 
                              direction ='clockwise', 
                              hole=hole_size, 
                              marker_colors=px.colors.qualitative.Prism,
                              # marker_colors=[p1,p2,p3,p4,p5,p11],
                              sort=False)])

fig.update_traces(textinfo='value')

fig.update_layout(
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=32)
        # traceorder = 'normal', 
        # orientation="v"
        ))

# fig.show()
# write and download
filename="ids_bottom_pie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

# fig.update_layout(
#     width = 600, height=600,
#     )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [245]:
# Plots to count things
series = df['id'].value_counts()
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
# print(df_result)
df_result.loc[10:, 'id'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 30
font_color = "black"
font_family = "Serif"
hole_size = 0.25
half_transparent = 'rgba(255,255,255,0.5)'

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

fig = go.Figure(data=[go.Pie(labels=df_result.id, 
                              values=df_result.total, 
                              direction ='clockwise', 
                              hole=hole_size, 
                              marker_colors=px.colors.qualitative.Prism,
                              # marker_colors=[p1,p2,p3,p4,p5,p11],
                              sort=False)])

fig.update_traces(textinfo='label')

fig.update_layout(
    width = 600, height=600,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v",
        ))

# fig.show()
# write and download
filename="ids_top_pie_ann"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [246]:
# Plots to count things
series = df['id'].value_counts()
series = series.sort_values(ascending=True)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
# print(df_result)
df_result.loc[10:, 'id'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 32
hole_size = 0.25

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

fig = go.Figure(data=[go.Pie(labels=df_result.id, 
                              values=df_result.total, 
                              direction ='clockwise', 
                              hole=hole_size, 
                              marker_colors=px.colors.qualitative.Prism,
                              # marker_colors=[p1,p2,p3,p4,p5,p11],
                              sort=False)])

fig.update_traces(textinfo='label')

fig.update_layout(
    width = 800, height=800,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v",
        ))

# fig.show()
# write and download
filename="ids_bottom_pie_ann"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [247]:
# labels_en = df_en.id.value_counts().index.tolist()
# values_en = df_en.id.value_counts().tolist()
# labels_ar = df_ar.id.value_counts().index.tolist()
# values_ar = df_ar.id.value_counts().tolist()
# # labels_en = df_en.id.value_counts().index.tolist()
# # values_en = df_en.id.value_counts().tolist()

In [248]:
# series = df_en['id'].value_counts()
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'value']
# df_result.loc[5:, 'id'] = 'other'
# df_resulta

# # df_result.loc[df['id'] == 'other', 'value'].sum()
# df_result=df_result.groupby('id')['value'].sum()
# df_result

In [249]:
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# series = df_en['id'].value_counts()
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'value']
# df_result.loc[5:, 'id'] = 'other'

# labels_en = df_result.id.tolist()
# values_en = df_result.value.tolist()

# series = df_ar['id'].value_counts()
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'value']
# df_result.loc[5:, 'id'] = 'other'

# labels_ar = df_result.id.tolist()
# values_ar = df_result.value.tolist()

# series = df_zh['id'].value_counts()
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'value']
# df_result.loc[5:, 'id'] = 'other'

# labels_zh = df_result.id.tolist()
# values_zh = df_result.value.tolist()

# ################################################################################

# # Variables
# font_size = 20
# font_color = "black"
# font_family = "Serif"
# hole_size = 0.45
# half_transparent = 'rgba(255,255,255,0.5)'

# # Create subplots: use 'domain' type for Pie subplot
# fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.025, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])


# fig.add_trace(go.Pie(labels=labels_en, values=values_en, name="EN",
#                      sort=True,
#                      marker_colors=[p1,p2,p3,p4,p5,p11],
#                      ),
#               1, 1)

# fig.add_trace(go.Pie(labels=labels_ar, values=values_ar, name="AR",
#                      marker_colors=[p1,p6,p7,p3,p8,p11],
#                      ),
#               1, 2)

# fig.add_trace(go.Pie(labels=labels_zh, values=values_zh, name="ZH",
#                      marker_colors=[p1,p2,p9,p4,p3,p11],
#                      ),
#               1, 3)
# # Use `hole` to create a donut-like pie chart
# fig.update_traces(
#     textinfo = "value",
#     direction ='counterclockwise', 
#     hole=hole_size, 
#     hoverinfo="label+percent+name",
#     # marker_colors=px.colors.qualitative.Prism,
#     )

# # fig.update_layout(
# #     title_text="Global Emissions 1990-2011",
# #     # Add annotations in the center of the donut pies.
# #     annotations=[dict(text='GHG', x=0.18, y=0.5, font_size=20, showarrow=False),
# #                  dict(text='CO2', x=0.82, y=0.5, font_size=20, showarrow=False)])

# fig.update_layout(
#     width = 1000, height=300,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size,
#     # legend_traceorder="reversed",
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=0.5, xanchor="right", yanchor="middle",
#         font = dict(size=font_size-4),
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-2),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))

# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])



# fig.show()

# # write and download
# filename="ids_trio"
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# # fig.update_layout(
# #     width = 600, height=600,
# #     )



# fig.write_image(filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

#### Top 5 by language

In [250]:
df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

In [251]:
# # Second try
# # Plots to count things
# series = df_en['id'].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_en = df_result

# series = df_ar['id'].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_ar = df_result

# series = df_zh['id'].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_zh = df_result

# ################################################################################

# # Variables
# font_size = 20
# font_color = "black"
# font_family = "Serif"
# hole_size = 0.45
# half_transparent = 'rgba(255,255,255,0.5)'

# # Create subplots: use 'domain' type for Pie subplot
# fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.025, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

# fig.add_trace(go.Pie(name='English',
#                      labels=df_result_en.id,
#                      values=df_result_en.total,
#                      marker_colors=[p1,p2,p3,p4,p5,p11],
#                      ), 1, 1)

# fig.add_trace(go.Pie(name='Arabic',
#                      labels=df_result_ar.id, 
#                      values=df_result_ar.total,
#                      marker_colors=[p1,p6,p7,p3,p8,p11],  
#                      ), 1, 2)

# fig.add_trace(go.Pie(labels=df_result_zh.id, 
#                      values=df_result_zh.total,
#                      marker_colors=[p1,p2,p9,p4,p3,p11],  
#                      ), 1, 3)

# fig.update_traces(
#     direction ='clockwise',
#     textinfo = "value",
#     hoverinfo="label+percent+name",
#     hole=hole_size, 
#     sort=False,
#     # marker_colors=px.colors.qualitative.Prism,
#     )

# fig.update_layout(
#     width = 1000, height=300,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size,
#     # legend_traceorder="reversed",
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=0.5, xanchor="right", yanchor="middle",
#         font = dict(size=font_size-4),
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-2),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))

# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

# fig.show()

# # write and download
# filename="ids_trio_v"
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# # fig.write_image(filename + ".png", engine="kaleido")
# fig.write_image(filename + ".pdf", engine="kaleido")
# # files.download(filename + ".png")
# files.download(filename + ".pdf")

#### Trilingual

In [252]:
# Plots to count things
series = df_en['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result

################################################################################

# Variables
font_size = 24
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.01, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                    #  marker_colors=[p1,p2,p3,p4,p5,p11],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p1,p6,p7,p3,p8,p11],  
                     ), 1, 2)

fig.add_trace(go.Pie(labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p1,p2,p9,p4,p3,p11],  
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    )

fig.update_layout(
    template = 'top5',
    width = 1000, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0.5, xanchor="right", yanchor="middle",
    #     font = dict(size=font_size-4),
    #     # traceorder = 'normal', 
    #     orientation="v"
    #     ))
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=0, xanchor="left", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="h"
        ))

fig.update_layout(annotations=[dict(text='English', x=0.12, y=0.5, font_size=24, showarrow=False), 
                               dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
                               dict(text='Chinese', x=0.88, y=0.5, font_size=24, showarrow=False)
                               ])

fig.show()

# write and download
filename="ids_tripie"
# fig.write_html(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

#### Mono

In [253]:
# df_en = df.loc[df['language'] == 'English']
# df_ar = df.loc[df['language'] == 'Arabic']
# df_zh = df.loc[df['language'] == 'Chinese']

##### English

In [254]:
# Plots to count things
series = df_en['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_en = df_result

################################################################################

# Variables
font_size = 28
hole_size = 0.45

fig = go.Figure()

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

# fig = go.Figure(data=[go.Pie(labels=df_result.id, 
#                               values=df_result.total, 
#                               # direction ='clockwise', 
#                               hole=hole_size, 
#                               # marker_colors=px.colors.qualitative.Prism,
#                               marker_colors=[p1,p2,p3,p4,p5,p11],
#                               sort=False)])

# fig.update_traces(textinfo = "label",
#                   textposition='outside')

common_props = dict(labels=df_result.id,
                    values=df_result.total,
                    )

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[p1,p2,p3,p4,p5,p11],
    textinfo='value',
    textposition='inside'))

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[transparent,transparent,transparent,transparent,transparent,transparent],
    textinfo='label',
    textposition='outside'))

fig.update_traces(
    # hoverinfo="label+percent+name",
    direction ='clockwise',
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    )

fig.update_layout(
    width = 600, height=500,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="v"
        ))

fig.update_layout(annotations=[dict(text='English', x=0.5, y=0.5, font_size=font_size, showarrow=False)])


# write and download
filename="ids_top_en"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

# fig.update_layout(
#     width = 600, height=600,
#     )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

##### Arabic

In [255]:
# Plots to count things
series = df_ar['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_ar = df_result

################################################################################

# Variables
font_size = 28
hole_size = 0.45

fig = go.Figure()

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

# fig = go.Figure(data=[go.Pie(labels=df_result.id, 
#                               values=df_result.total, 
#                               # direction ='clockwise', 
#                               hole=hole_size, 
#                               # marker_colors=px.colors.qualitative.Prism,
#                               marker_colors=[p1,p2,p3,p4,p5,p11],
#                               sort=False)])

# fig.update_traces(textinfo = "label",
#                   textposition='outside')

common_props = dict(labels=df_result.id,
                    values=df_result.total,
                    )

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[p1,p6,p7,p3,p8,p11], 
    textinfo='value',
    textposition='inside'))

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[transparent,transparent,transparent,transparent,transparent,transparent],
    textinfo='label',
    textposition='outside'))

fig.update_traces(
    # hoverinfo="label+percent+name",
    direction ='clockwise',
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    )

fig.update_layout(
    width = 600, height=500,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="v"
        ))

fig.update_layout(annotations=[dict(text='Arabic', x=0.5, y=0.5, font_size=font_size, showarrow=False)])


# write and download
filename="ids_top_ar"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

# fig.update_layout(
#     width = 600, height=600,
#     )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

##### Chinese

In [256]:
# Plots to count things
series = df_zh['id'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result_en = df_result

################################################################################

# Variables
font_size = 28
hole_size = 0.45

fig = go.Figure()

# fig = px.pie(df_result,
#              values='total',
#              names='id',
#             #  color_discrete_sequence=[p2,p6,p4],
#              color_discrete_sequence=px.colors.qualitative.Prism,
#             #  direction ='clockwise',
#             #  sort = True,
#              hole=hole_size
#             #  title='Ratio of spice names'
#              )

# fig = go.Figure(data=[go.Pie(labels=df_result.id, 
#                               values=df_result.total, 
#                               # direction ='clockwise', 
#                               hole=hole_size, 
#                               # marker_colors=px.colors.qualitative.Prism,
#                               marker_colors=[p1,p2,p3,p4,p5,p11],
#                               sort=False)])

# fig.update_traces(textinfo = "label",
#                   textposition='outside')

common_props = dict(labels=df_result.id,
                    values=df_result.total,
                    )

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[p1,p2,p9,p4,p3,p11],  
    textinfo='value',
    textposition='inside'))

fig.add_trace(go.Pie(
    **common_props,
    marker_colors=[transparent,transparent,transparent,transparent,transparent,transparent],
    textinfo='label',
    textposition='outside'))

fig.update_traces(
    # hoverinfo="label+percent+name",
    direction ='clockwise',
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    )

fig.update_layout(
    width = 600, height=500,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="v"
        ))

fig.update_layout(annotations=[dict(text='Chinese', x=0.5, y=0.5, font_size=font_size, showarrow=False)])


# write and download
filename="ids_top_zh"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

# fig.update_layout(
#     width = 600, height=600,
#     )

fig.show()
fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

## Analysis of name features

In [257]:
#Change key

key = 'analyzability'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
# df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]

#Explode layered items
df[key] = df[key].str.split('; ')
df = df.explode(key)

df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,analyzability,English,Arabic,Chinese
0,analyzable,111,50,99
1,unanalyzable,39,32,20
2,semi-analyzable,3,4,1


In [258]:
# file = 'table.csv'
# df = pd.read_csv(file)
# data=df.to_dict(orient='records')
# markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()

##### Pie

In [259]:
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result

In [260]:
# Variables
font_size = 24
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.09, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                     sort=False,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template = 'plotly_white+top5',
    width = 1000, 
    height=270,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-2),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-4),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size, showarrow=False),
                               dict(text='Chinese', x=0.91, y=0.5, font_size=font_size, showarrow=False)
                               ])

fig.show()

# write and download
filename = key + "_tripie"
filename = re.sub(" ", "_", filename)
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

#### Bar

In [261]:
top = 5

# Bar plots
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'English'
df_result.loc[top:, 'item'] = 'other'
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Arabic'
df_result.loc[top:, 'item'] = 'other'
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Chinese'
df_result.loc[top:, 'item'] = 'other'
df
df_result_zh = df_result

df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
df_result_all = df_result_all.loc[df_result_all['item'] != 'other']
df=df_result_all

In [262]:
# Variables
font_size = 20

fig = px.bar(df, 
             x="lang", y="total", 
             color="item", 
             text="item",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             color_discrete_sequence = prism_extended,
             template="plotly_white",
            #  labels = dict(lang="Language",total="Times")
             )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 800, height=300,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        title = key,
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-2),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename = key + "_bar"
filename = re.sub(" ", "_", filename)
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

### Most frequent name blueprint

In [263]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]


df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

In [264]:
print(df.groupby(['headword type','modifier type']).size().idxmax())
top = df.groupby(['headword type','modifier type']).size().nlargest(10)
top = pd.DataFrame(top)
top.reset_index(inplace=True)
top
# top.to_csv('table.csv')

('prototype similarity', 'geographic origin')


Unnamed: 0,headword type,modifier type,0
0,prototype similarity,geographic origin,45
1,prototype,geographic origin,19
2,prototype,plant part,18
3,prototype similarity,color,16
4,prototype,color,14
5,prototype similarity,taste,11
6,prototype similarity,shape,10
7,prototype similarity,source,9
8,prototype,positive authenticity,6
9,prototype similarity,ecology,5


In [265]:
# fig = px.imshow(df,
#                 # labels=dict(x="Day of Week", y="Time of Day", color="Productivity"),
#                 x='headword type',
#                 y='modifier type'
#                )
# fig.update_xaxes(side="top")
# fig.show()

### Features one by one

#### Analyzability

In [266]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

In [267]:
names_total = df.shape[0]

print("Total:\t\t",names_total)
print(df.analyzability.value_counts())

Total:		 360
analyzable         260
unanalyzable        91
semi-analyzable      8
Name: analyzability, dtype: int64


In [268]:
# # Plots to count things
# series = df['analyzability'].value_counts()
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['language', 'total']
# # print(df_result)
# # df_result.loc[5:, 'language'] = 'other' # Replace everything with 'others' after the fifth!

# ################################################################################

# # Variables
# font_size = 40
# font_color = "black"
# font_family = "Serif"
# hole_size = 0.45
# half_transparent = 'rgba(255,255,255,0.5)'

# fig = px.pie(df_result,
#              values='total',
#              names='language',
#              color_discrete_sequence=[p1,p11,p5],
#             #  sort = True,
#              hole=hole_size,
#             #  title='Ratio of spice names'
#              )

# # fig = go.Figure(data=[go.Pie(labels=df_result.language, 
# #                               values=df_result.total, 
# #                               direction ='clockwise', 
# #                               hole=.4, 
# #                               marker_colors=[p2,p6,p4], 
# #                               sort=True)])

# fig.update_traces(textinfo='value',
#                   direction ='clockwise',
#                   sort=False)

# fig.update_layout(
#     margin={"r":0,"t":0,"l":0,"b":0},
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size,
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     y=0, x=0, xanchor="left", yanchor="bottom", 
#     #     # traceorder = 'normal', 
#     #     # orientation="h"
#     #     ))
#     legend=dict(
#         bgcolor=half_transparent,
#         y=1, x=0, xanchor="left", yanchor="bottom", 
#         # traceorder = 'normal', 
#         orientation="h"
#         ))

# fig.update_layout(annotations=[dict(text=str(names_total), x=0.5, y=0.5, font_size=font_size, showarrow=False)])
                               
# # fig.show()
# # write and download
# filename="analyzability_pie"
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# fig.update_layout(
#     width = 600, height=600,
#     )

# fig.show()
# fig.write_image(filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

In [269]:
df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

series = df_en['analyzability'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar['analyzability'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh['analyzability'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result


In [270]:
################################################################################
# Variables
font_size = 26
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template = 'plotly_white+yesnomaybe',
    width = 1000, height=225,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.085, y=0.5, font_size=font_size-2, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
                               dict(text='Chinese', x=0.9175, y=0.5, font_size=font_size-2, showarrow=False)
                               ])

fig.show()

# write and download
filename="analyzability_tripie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")

#### Structure

In [271]:
series = df_en['structure'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,ascending=False)
df_result_en = df_result

series = df_ar['structure'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,ascending=False)
df_result_ar = df_result

series = df_zh['structure'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
df_result.sort_values(['id'], inplace = True,ascending=False)
df_result_zh = df_result

In [272]:
################################################################################
# Variables
font_size = 26
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                    #  marker_colors=[p1, p3, p5, p7],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p1, p3, p5, p7],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p1, p3, p5, p7],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template='odd',
    width = 1000, height=225,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.085, y=0.5, font_size=font_size-2, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
                               dict(text='Chinese', x=0.9175, y=0.5, font_size=font_size-2, showarrow=False)
                               ])

fig.show()

# write and download
filename="structure_tripie"
# fig.write_html(filename + ".html")
# files.download(filename + ".html")

fig.write_image(path_out_pdf+filename + ".pdf", engine="kaleido")

#### Headword


In [273]:
key = 'headword'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]

df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,headword,English,Arabic,Chinese
0,pepper,28.0,12.0,18.0
1,cardamom,24.0,3.0,11.0
2,cassia,7.0,1.0,7.0
3,cumin,6.0,1.0,
4,cinnamon,6.0,9.0,
...,...,...,...,...
63,ingredient,,,1.0
64,yellow,,,1.0
65,húlúbā,,,1.0
66,eggplant,,,1.0


##### Pie

In [274]:
# series = df_en[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_en = df_result

# series = df_ar[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_ar = df_result

# series = df_zh[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_zh = df_result


In [275]:
# ################################################################################
# # Variables
# font_size = 24
# font_color = "black"
# font_family = "Serif"
# hole_size = 0.45
# half_transparent = 'rgba(255,255,255,0.5)'

# # Create subplots: use 'domain' type for Pie subplot
# fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

# fig.add_trace(go.Pie(name='English',
#                      labels=df_result_en.id,
#                      values=df_result_en.total,
#                      sort=False,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 1)

# fig.add_trace(go.Pie(name='Arabic',
#                      labels=df_result_ar.id, 
#                      values=df_result_ar.total,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 2)

# fig.add_trace(go.Pie(name='Chinese',
#                      labels=df_result_zh.id, 
#                      values=df_result_zh.total,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 3)

# fig.update_traces(
#     direction ='clockwise',
#     textinfo = "value",
#     hoverinfo="label+percent+name",
#     hole=hole_size, 
#     sort=False,
#     # marker_colors=px.colors.qualitative.Prism,
#     # marker_colors=['crimson', 'pink', 'darkslategray'],
#     )

# fig.update_layout(
#     template = 'plotly_white+top5',
#     width = 1000, 
#     height=270,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size,
#     # legend_traceorder="reversed",
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=1, xanchor="right", yanchor="top",
#         font = dict(size=font_size-4),
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-4),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))

# fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size-2, showarrow=False), 
#                                dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
#                                dict(text='Chinese', x=0.91, y=0.5, font_size=font_size-2, showarrow=False)
#                                ])

# fig.show()

# # write and download
# filename = key + "_tripie"
# filename = re.sub(" ", "_", filename)
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# fig.write_image(filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

##### Bar

In [276]:
top = 5

# Bar plots
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'English'
df_result.loc[top:, 'item'] = 'other'
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Arabic'
df_result.loc[top:, 'item'] = 'other'
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Chinese'
df_result.loc[top:, 'item'] = 'other'
df
df_result_zh = df_result

df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
df_result_all = df_result_all.loc[df_result_all['item'] != 'other']
df=df_result_all

In [277]:
# Variables
font_size = 20

fig = px.bar(df, 
             x="lang", y="total", 
             color="item", 
             text="item",
             color_discrete_sequence = prism_extended,
             template="plotly_white",
            #  labels = dict(lang="Language",total="Times")
             )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 800, height=300,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        title = key,
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-2),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename = key + "_bar"
filename = re.sub(" ", "_", filename)
# fig.write_html(path_out_html + filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

#### Headword type

In [278]:
key = 'headword type'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]

df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,headword type,English,Arabic,Chinese
0,prototype similarity,67.0,18.0,50.0
1,prototype,38.0,17.0,30.0
2,plant part,3.0,9.0,3.0
3,function,2.0,1.0,11.0
4,taste,,3.0,1.0
5,shape,,1.0,1.0
6,plant part (metaphor),,1.0,
7,color,,,1.0
8,prototype similarity (metaphor),,,1.0


##### Pie

In [279]:
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result


In [280]:
################################################################################
# Variables
font_size = 24
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                     sort=False,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template = 'plotly_white+top5',
    width = 1000, 
    height=270,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-4),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size-2, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
                               dict(text='Chinese', x=0.91, y=0.5, font_size=font_size-2, showarrow=False)
                               ])

fig.show()

# write and download
filename = key + "_tripie"
filename = re.sub(" ", "_", filename)
# fig.write_html(path_out_html + filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")


##### Bar

In [281]:
# top = 5

# # Bar plots
# series = df_en[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'English'
# df_result.loc[top:, 'item'] = 'other'
# df_result_en = df_result

# series = df_ar[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'Arabic'
# df_result.loc[top:, 'item'] = 'other'
# df_result_ar = df_result

# series = df_zh[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'Chinese'
# df_result.loc[top:, 'item'] = 'other'
# df
# df_result_zh = df_result

# df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
# df_result_all = df_result_all.loc[df_result_all['item'] != 'other']
# df=df_result_all

In [282]:
# # Variables
# font_size = 20
# font_color = "black"
# font_family = "Serif"
# half_transparent = 'rgba(255,255,255,0.5)'

# fig = px.bar(df, 
#              x="lang", y="total", 
#              color="item", 
#              text="item",
#             #  color_discrete_sequence = px.colors.qualitative.Prism,
#              color_discrete_sequence = PRISM,
#              template="plotly_white",
#             #  labels = dict(lang="Language",total="Times")
#              )

# fig.update_layout(
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size)

# fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

# fig.update_xaxes(visible=True, title=None)
# fig.update_yaxes(visible=True, title=None, showticklabels=True)
# fig.update_xaxes(ticklabelposition="inside")
# fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# # fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# # fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# # fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# # fig.update_layout(plot_bgcolor="white")#f6f6f6

# fig.update_layout(
#     width = 800, height=300,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=1, xanchor="right", yanchor="top",
#         font = dict(size=font_size-4),
#         title = key,
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-2),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))


# # fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
# #                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
# #                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
# #                                ])

# fig.show()

# # write and download
# filename = key + "_bar"
# filename = re.sub(" ", "_", filename)
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# # fig.write_image(filename + ".png", engine="kaleido")
# fig.write_image(filename + ".pdf", engine="kaleido")
# # files.download(filename + ".png")
# files.download(filename + ".pdf")

#### Modifier

In [283]:
key = 'modifier'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]


df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,modifier,English,Arabic,Chinese
0,Indian,8.0,1.0,
1,Chinese,8.0,2.0,
2,green,4.0,1.0,6.0
3,sweet,3.0,5.0,3.0
4,black,3.0,2.0,1.0
...,...,...,...,...
127,ginger,,,1.0
128,cliff,,,1.0
129,bamboo leaf,,,1.0
130,eight horned,,,1.0


##### Pie

In [284]:
# series = df_en[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_en = df_result

# series = df_ar[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_ar = df_result

# series = df_zh[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['id', 'total']
# df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# # df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# # df_result.sort_values(['id'], inplace = True,)
# df_result_zh = df_result


In [285]:
# ################################################################################
# # Variables
# font_size = 24
# font_color = "black"
# font_family = "Serif"
# hole_size = 0.45
# half_transparent = 'rgba(255,255,255,0.5)'

# # Create subplots: use 'domain' type for Pie subplot
# fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

# fig.add_trace(go.Pie(name='English',
#                      labels=df_result_en.id,
#                      values=df_result_en.total,
#                      sort=False,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 1)

# fig.add_trace(go.Pie(name='Arabic',
#                      labels=df_result_ar.id, 
#                      values=df_result_ar.total,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 2)

# fig.add_trace(go.Pie(name='Chinese',
#                      labels=df_result_zh.id, 
#                      values=df_result_zh.total,
#                     #  marker_colors=[p2, p11, p8],
#                      ), 1, 3)

# fig.update_traces(
#     direction ='clockwise',
#     textinfo = "value",
#     hoverinfo="label+percent+name",
#     hole=hole_size, 
#     sort=False,
#     # marker_colors=px.colors.qualitative.Prism,
#     # marker_colors=['crimson', 'pink', 'darkslategray'],
#     )

# fig.update_layout(
#     template = 'plotly_white+top5',
#     width = 1000, 
#     height=270,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size,
#     # legend_traceorder="reversed",
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=1, xanchor="right", yanchor="top",
#         font = dict(size=font_size-4),
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-4),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))

# fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size-2, showarrow=False), 
#                                dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
#                                dict(text='Chinese', x=0.91, y=0.5, font_size=font_size-2, showarrow=False)
#                                ])

# fig.show()

# # write and download
# filename = key + "_tripie"
# filename = re.sub(" ", "_", filename)
# # fig.write_html(filename + ".html")
# # files.download(filename + ".html")

# fig.write_image(filename + ".pdf", engine="kaleido")
# files.download(filename + ".pdf")

##### Bar

In [286]:
top = 5

# Bar plots
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'English'
df_result.loc[top:, 'item'] = 'other'
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Arabic'
df_result.loc[top:, 'item'] = 'other'
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['item', 'total']
df_result['lang'] = 'Chinese'
df_result.loc[top:, 'item'] = 'other'
df
df_result_zh = df_result

df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
df_result_all = df_result_all.loc[df_result_all['item'] != 'other']
df=df_result_all

In [287]:
# Variables
font_size = 20

fig = px.bar(df, 
             x="lang", y="total", 
             color="item", 
             text="item",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             color_discrete_sequence = prism_extended,
             template="plotly_white",
            #  labels = dict(lang="Language",total="Times")
             )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 800, height=300,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        title = key,
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-2),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename = key + "_bar"
filename = re.sub(" ", "_", filename)
# fig.write_html(path_out_pdf + filename + ".html")
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

#### Modifier type

In [288]:
key = 'modifier type'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in + "names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in + "names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in + 'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]


df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,modifier type,English,Arabic,Chinese
0,geographic origin,46.0,6.0,15.0
1,color,13.0,8.0,13.0
2,plant part,10.0,3.0,8.0
3,shape,8.0,2.0,4.0
4,taste,5.0,7.0,7.0
5,type,4.0,,
6,ecology,4.0,2.0,3.0
7,similarity in taste,3.0,1.0,
8,source,2.0,1.0,10.0
9,positive authenticity,2.0,4.0,


##### Pie

In [289]:
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result


In [290]:
################################################################################
# Variables
font_size = 24
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                     sort=False,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template = 'plotly_white+top5',
    width = 1000, 
    height=270,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-4),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size-2, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
                               dict(text='Chinese', x=0.91, y=0.5, font_size=font_size-2, showarrow=False)
                               ])

fig.show()

# write and download
filename = key + "_tripie"
filename = re.sub(" ", "_", filename)
# fig.write_html(filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

#### Sensory

In [291]:
key = 'sensory'

# Read and store content of an excel file 
read_file = pd.read_excel(path_in + "names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in + "names.csv", index = None, header=True)
# Load in dataset of names
df=pd.read_csv(path_in + 'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df = df.loc[df['include'] == 'yes']

#filter
df = df.loc[(df['analyzability'] == 'analyzable') | (df['analyzability'] == 'semi-analyzable')]


df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

count = df_en[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_en = pd.DataFrame(count)
table_en.columns = ['English']

count = df_ar[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_ar = pd.DataFrame(count)
table_ar.columns = ['Arabic']

count = df_zh[key].value_counts()
count = count.sort_values(ascending=False)
# count[:10]
table_zh = pd.DataFrame(count)
table_zh.columns = ['Chinese']

table = pd.concat([table_en, table_ar, table_zh], axis=1)
table.index.names = [key]
table.reset_index(inplace=True)
# table.to_csv('table.csv')
table

Unnamed: 0,sensory,English,Arabic,Chinese
0,visual,25,11,17.0
1,gustatory,5,7,3.0
2,olfactory,1,2,4.0
3,thermal,1,2,
4,tactile,1,1,1.0


##### Pie

In [292]:
series = df_en[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh[key].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
# df_result['id'] = pd.Categorical(df_result['id'], ["yes", "semi", "no"]) 
# df_result.sort_values(['id'], inplace = True,)
df_result_zh = df_result


In [293]:
################################################################################
# Variables
font_size = 24
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                     sort=False,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p2, p11, p8],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template = 'plotly_white+top5',
    width = 1000, 
    height=270,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-4),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.09, y=0.5, font_size=font_size-2, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size-2, showarrow=False),
                               dict(text='Chinese', x=0.91, y=0.5, font_size=font_size-2, showarrow=False)
                               ])

fig.show()

# write and download
filename = key + "_tripie"
filename = re.sub(" ", "_", filename)
# fig.write_html(filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

##### Bar

In [294]:
# top = 5

# # Bar plots
# series = df_en[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'English'
# df_result.loc[top:, 'item'] = 'other'
# df_result_en = df_result

# series = df_ar[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'Arabic'
# df_result.loc[top:, 'item'] = 'other'
# df_result_ar = df_result

# series = df_zh[key].value_counts()
# series = series.sort_values(ascending=False)
# df_result = pd.DataFrame(series)
# df_result = df_result.reset_index()  
# df_result.columns = ['item', 'total']
# df_result['lang'] = 'Chinese'
# df_result.loc[top:, 'item'] = 'other'
# df
# df_result_zh = df_result

# df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
# df_result_all = df_result_all.loc[df_result_all['item'] != 'other']
# df=df_result_all

In [295]:
# # Variables
# font_size = 20

# fig = px.bar(df, 
#              x="lang", y="total", 
#              color="item", 
#              text="item",
#              color_discrete_sequence = prism_extended,
#              template="plotly_white",
#             #  labels = dict(lang="Language",total="Times")
#              )

# fig.update_layout(
#     font_family=font_family,
#     font_color=font_color,
#     font_size=font_size)

# fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

# fig.update_xaxes(visible=True, title=None)
# fig.update_yaxes(visible=True, title=None, showticklabels=True)
# fig.update_xaxes(ticklabelposition="inside")
# fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# # fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# # fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# # fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# # fig.update_layout(plot_bgcolor="white")#f6f6f6

# fig.update_layout(
#     width = 800, height=300,
#     margin={"r":0,"t":0,"l":0,"b":0},
#     # showlegend=False,
#     legend=dict(
#         bgcolor=half_transparent,
#         x=0, y=1, xanchor="right", yanchor="top",
#         font = dict(size=font_size-4),
#         title = key,
#         # traceorder = 'normal', 
#         orientation="v"
#         ))
#     # legend=dict(
#     #     bgcolor=half_transparent,
#     #     x=0, y=0, xanchor="left", yanchor="top",
#     #     font = dict(size=font_size-2),
#     #     # traceorder = 'normal', 
#     #     orientation="h"
#     #     ))


# # fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
# #                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
# #                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
# #                                ])

# fig.show()

# # # write and download
# # filename = key + "_bar"
# # filename = re.sub(" ", "_", filename)
# # # fig.write_html(filename + ".html")
# # fig.write_image(filename + ".pdf", engine="kaleido")

## Attestation

In [296]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df_names=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df_names = df_names.loc[df_names['include'] == 'yes']


### Monolingual

In [297]:
# fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

# https://www.tutorialspoint.com/plotly/plotly_format_axis_and_ticks.htm

#### English

In [298]:
# Plot base set of spices
df = df_names.copy()

# Selections
df = df.loc[df['language'] == 'English']
# df = df.loc[df['language'] == 'Chinese']
# df = df.loc[(df['language'] == 'English') | (df['language'] == 'Chinese')]

# df = df.loc[df['status'] == 'default']
df = df.loc[(df['status'] == 'default') | (df['status'] == 'historic')]

lang = "en"

# drop rows where date is missing
# df.drop(df[df['date'] == ''].index, inplace = True)
df = df.dropna(subset=['date'])

# Manipulate dates
df['year'] = df['date']
df['year'] = df['year'].str.replace(r'[ac\?]', '', regex=True)
df['year'] = df['year'].str.replace(r'.*-', '-', regex=True) # keep latest year if range
df['year'] = df['year'].str.replace(r'eOE', '800', regex=True)
df['year'] = df['year'].astype(int)

# Sort data
# df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["year"], inplace = True, ascending=True)

# Add mock column for size
df['size'] = 1
df.reset_index(inplace=True, drop=True)

print(df.shape[0])

# df.drop(df[df['id'] == 'black cardamom'].index, inplace = True)

28


##### Document

In [299]:
# Variables #
marker_size = 24
edge_size = 2
edge_color = transparent
font_size = 24
opacity = 0.33

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p2],
                #  symbol = "borrowing",
                #  symbol_sequence = ['circle', 'circle-open-dot', 'circle-open'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

# fig.update_traces(
#                   marker=dict(
#                       # symbol=marker_symbol, 
#                       # size=marker_size, 
#                       line=dict(color=edge_color, width=edge_size)
#                               ),
#                   )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")


# full size for html ---------------------------------------------------------
fig.update_layout(
    legend=dict(y=1, x=1, 
                xanchor="left", yanchor="top", 
                orientation="v")
    )


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=900,
    margin={"r":0,"t":0,"l":0,"b":20},
    )

fig.show()

# write and download
filename = "attestation_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

##### HTML

In [300]:
# Variables #
marker_size = 14
font_size = 14

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p2],
                #  symbol="exotic",
                 symbol_sequence = ['circle'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
                  marker=dict(
                      # symbol=marker_symbol, 
                      # size=marker_size, 
                      line=dict(color=edge_color, width=edge_size)
                              ),
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")

fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="right", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":0,"l":0,"b":35},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms in English, according to the OED',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=0, y=-0.15, xanchor="left", yanchor="bottom",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="h")
    )

fig.show()

# write and save
filename = "attestation_"
fig.write_html(path_out_html + filename + lang + ".html")

#### Arabic

In [301]:
# Plot base set of spices
df = df_names.copy()

# Selections
# df = df.loc[df['language'] == 'English']
df = df.loc[df['language'] == 'Arabic']
# df = df.loc[(df['language'] == 'English') | (df['language'] == 'Chinese')]

lang = "ar"
color = [p4]

# df = df.loc[df['status'] == 'default']
df = df.loc[(df['status'] == 'default') | (df['status'] == 'historic')]

# drop rows where date is missing
# df.drop(df[df['date'] == ''].index, inplace = True)
df = df.dropna(subset=['date'])

# Manipulate dates
df['year'] = df['date']
df['year'] = df['year'].str.replace(r'[ac\?]', '', regex=True)
df['year'] = df['year'].str.replace(r'.*-', '-', regex=True)
df['year'] = df['year'].str.replace(r'eOE', '800', regex=True)
df['year'] = df['year'].astype(int)

# Sort data
# df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["year"], inplace = True, ascending=True)

# Add mock column for size
df['size'] = 1
df.reset_index(inplace=True, drop=True)

print(df.shape[0])

# df.drop(df[df['id'] == 'black cardamom'].index, inplace = True)



27


##### Document

In [302]:
# Variables #
marker_size = 24
font_size = 24

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=color,
                #  symbol="exotic",
                 symbol_sequence = ['circle'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
                  marker=dict(
                      # symbol=marker_symbol, 
                      # size=marker_size, 
                      line=dict(color=edge_color, width=edge_size)
                              ),
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")


# full size for html ---------------------------------------------------------
fig.update_layout(
    legend=dict(y=1, x=1, 
                xanchor="left", yanchor="top", 
                orientation="v")
    )


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=900,
    margin={"r":0,"t":0,"l":0,"b":20},
    )

fig.show()

# write and download
filename = "attestation_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

##### HTML

In [303]:
# Variables #
marker_size = 14
font_size = 14

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=color,
                #  symbol="exotic",
                 symbol_sequence = ['circle'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
                  marker=dict(
                      # symbol=marker_symbol, 
                      # size=marker_size, 
                      line=dict(color=edge_color, width=edge_size)
                              ),
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")

fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="right", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":0,"l":0,"b":35},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms in English, according to the OED',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=0, y=-0.15, xanchor="left", yanchor="bottom",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="h")
    )

fig.show()

# write and save
filename = "attestation_"
fig.write_html(path_out_html + filename + lang + ".html")

#### Chinese

In [304]:
# Practice
# fig = px.scatter(df, x="year", y="id", color="exotic", size="size")
# fig.show()

# Plot base set of spices
df = df_names.copy()

# Selections
# df = df.loc[df['language'] == 'English']
df = df.loc[df['language'] == 'Chinese']
# df = df.loc[(df['language'] == 'English') | (df['language'] == 'Chinese')]

lang = "zh"
color = [p6]

# df = df.loc[df['status'] == 'default']
df = df.loc[(df['status'] == 'default') | (df['status'] == 'historic')]

# drop rows where date is missing
# df.drop(df[df['date'] == ''].index, inplace = True)
df = df.dropna(subset=['date'])

# Manipulate dates
df['year'] = df['date']
df['year'] = df['year'].str.replace(r'[ac\?]', '', regex=True)
df['year'] = df['year'].str.replace(r'.*-', '-', regex=True)
df['year'] = df['year'].str.replace(r'eOE', '800', regex=True)
# print(df.year)

df['year'] = df['year'].astype(int)

# Sort data
# df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["year"], inplace = True, ascending=True)

# Add mock column for size
df['size'] = 1
df.reset_index(inplace=True, drop=True)

print(df.shape[0])

# df.drop(df[df['id'] == 'black cardamom'].index, inplace = True)

# df['ann'] = df['script'] + "<br>" + df['term'] # this causes the warning

32


##### Document

In [305]:
# Variables #
marker_size = 24
font_size = 24

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=color,
                #  symbol="exotic",
                 symbol_sequence = ['circle'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
                  marker=dict(
                      # symbol=marker_symbol, 
                      # size=marker_size, 
                      line=dict(color=edge_color, width=edge_size)
                              ),
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")


# full size for html ---------------------------------------------------------
fig.update_layout(
    legend=dict(y=1, x=1, 
                xanchor="left", yanchor="top", 
                orientation="v")
    )


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=900,
    margin={"r":0,"t":0,"l":0,"b":20},
    )

fig.show()

# write and download
filename = "attestation_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

##### HTML

In [306]:
# Variables #
marker_size = 14
font_size = 14

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                 text="term",
                 size_max=marker_size,
                #  color="area",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=color,
                #  symbol="exotic",
                 symbol_sequence = ['circle'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
                  marker=dict(
                      # symbol=marker_symbol, 
                      # size=marker_size, 
                      line=dict(color=edge_color, width=edge_size)
                              ),
                  )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size))
fig.update_xaxes(ticklabelposition="inside")

fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="right", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":0,"l":0,"b":35},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms in English, according to the OED',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=0, y=-0.15, xanchor="left", yanchor="bottom",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="h")
    )

fig.show()

# write and save
filename = "attestation_"
fig.write_html(path_out_html + filename + lang + ".html")

### Trilingual

In [307]:
# Plot base set of spices
df = df_names.copy()

# Selections
# df = df.loc[df['language'] == 'English']
# df = df.loc[df['language'] == 'Arabic']
df = df.loc[(df['language'] == 'English') | (df['language'] == 'Arabic') | (df['language'] == 'Chinese')]
lang = ""
color = [p6,p4,p2],

# df = df.loc[df['status'] == 'default']
df = df.loc[(df['status'] == 'default') | (df['status'] == 'historic')]

# drop rows where date is missing
# df.drop(df[df['date'] == ''].index, inplace = True)
df = df.dropna(subset=['date'])

# Manipulate dates
df['year'] = df['date']
df['year'] = df['year'].str.replace(r'[ac\?]', '', regex=True)
df['year'] = df['year'].str.replace(r'.*-', '-', regex=True)
df['year'] = df['year'].str.replace(r'eOE', '800', regex=True)
df['year'] = df['year'].astype(int)

# Sort data
# df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["year"], inplace = True, ascending=True)


# Add mock column for size
df['size'] = 1
df.reset_index(inplace=True, drop=True)

print(df.shape[0])

# df.drop(df[df['id'] == 'black cardamom'].index, inplace = True)

87


#### Full

##### Document

In [308]:
# Variables #
marker_size = 24
edge_size = 2
edge_color = transparent
font_size = 24
opacity = 0.5

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                #  symbol="borrowing",
                #  symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")

# full size for html ---------------------------------------------------------
fig.update_layout(
    legend=dict(x=0, y=-0.05, xanchor="left", yanchor="top", 
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=900,
    margin={"r":0,"t":0,"l":0,"b":20},
    )

fig.show()

# write and download
filename = "attestation"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

In [309]:
# Variables #
marker_size = 24
edge_size = 2
edge_color = transparent
font_size = 24
opacity = 0.5

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                 symbol="borrowing",
                 symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")

# full size for html ---------------------------------------------------------
fig.update_layout(
    legend=dict(x=0, y=-0.05, xanchor="left", yanchor="top", 
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=900,
    margin={"r":0,"t":0,"l":0,"b":20},
    )

fig.show()

# write and download
filename = "attestation_and_borrowing"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

##### HTML

In [310]:
# Variables #
marker_size = 16
font_size = 16

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                #  symbol="borrowing",
                #  symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")

fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":0,"l":0,"b":40},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms in English, according to the OED',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=1, y=0.9, xanchor="left", yanchor="top",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="v")
    # )
    legend=dict(x=0, y=-0.05, xanchor="left", yanchor="top",
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and save
filename = "attestation"
fig.write_html(path_out_html + filename + lang + ".html")

In [311]:
# Variables #
marker_size = 16
font_size = 16

fig = px.scatter(df, x="year", y="id", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                 symbol="borrowing",
                 symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                #  marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")

fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":0,"l":0,"b":40},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms in English, according to the OED',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=1, y=0.9, xanchor="left", yanchor="top",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="v")
    # )
    legend=dict(x=0, y=-0.05, xanchor="left", yanchor="top",
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and save
filename = "attestation_and_borrowing"
fig.write_html(path_out_html + filename + lang + ".html")

#### Compact

##### Document

In [312]:
# Variables #
marker_size = 24
font_size = 24

fig = px.scatter(df, x="year", y="language", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                #  symbol="borrowing",
                #  symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                 marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=600,
    margin={"r":0,"t":0,"l":0,"b":20},
    # title=dict(y=1, x=0, xanchor='left', yanchor='top', 
              #  text='Dates of attestation of a set of spice terms in English, according to the OED',   
              #  font={"color": font_color, "size": font_size+2, "family": font_family}),
    legend=dict(x=0, y=-0.1, xanchor="left", yanchor="top",
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and download
filename = "attestation_compact"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

In [313]:
# Variables #
marker_size = 24
font_size = 24

fig = px.scatter(df, x="year", y="language", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                 symbol="borrowing",
                 symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                 marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")


# pdf size #####################################################################
fig.update_layout(
    width = 1000, height=600,
    margin={"r":0,"t":0,"l":0,"b":20},
    # title=dict(y=1, x=0, xanchor='left', yanchor='top', 
              #  text='Dates of attestation of a set of spice terms in English, according to the OED',   
              #  font={"color": font_color, "size": font_size+2, "family": font_family}),
    legend=dict(x=0, y=-0.1, xanchor="left", yanchor="top",
                title=dict(side='top'),
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and download
filename = "attestation_and_borrowing_compact"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

##### HTML

In [314]:
# Variables #
marker_size = 20
font_size = 20

fig = px.scatter(df, x="year", y="language", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                #  symbol="borrowing",
                #  symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                 marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")


fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":40,"l":0,"b":40},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms.',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=1, y=0.9, xanchor="left", yanchor="top",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="v")
    # )
    legend=dict(x=0, y=-0.1, xanchor="left", yanchor="top",
                title=dict(side='top'), bgcolor=transparent,
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and save
filename = "attestation_compact"
fig.write_html(path_out_html + filename + lang + ".html")

In [315]:
# Variables #
marker_size = 20
font_size = 20

fig = px.scatter(df, x="year", y="language", # y=df.index, 
                 size="size", 
                #  text="term",
                 size_max=marker_size,
                 color="language",
                #  color_discrete_sequence=px.colors.qualitative.Prism,
                 color_discrete_sequence=[p6,p4,p2],
                 symbol="borrowing",
                 symbol_sequence = ['circle-open', 'circle', 'circle-open-dot'], 
                 opacity = opacity,
                 hover_name='term',
                 hover_data={'id':True, 'literal':False, 'date':True, 'year':False, 'id':True, 'species':True, 'size':False},
                 marginal_x="histogram",
                 template="plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color=edge_color,
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.update_layout( yaxis={'categoryorder':'array', 'categoryarray':df.index})

fig.update_xaxes(visible=True, showticklabels=True, automargin=True)
fig.update_yaxes(visible=True, showticklabels=True, automargin=True, title=None)
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-4))
fig.update_xaxes(ticklabelposition="inside")


fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.update_layout(
    # width = 1200, 
    height=550,
    margin={"r":0,"t":40,"l":0,"b":40},
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='Dates of attestation of a set of spice terms.',   
               font={"color": font_color, "size": font_size+4, "family": font_family}),
    # legend=dict(x=1, y=0.9, xanchor="left", yanchor="top",
    #             font={"color": font_color, "size": font_size-2, "family": font_family},
    #             orientation="v")
    # )
    legend=dict(x=0, y=-0.1, xanchor="left", yanchor="top",
                title=dict(side='top'), bgcolor=transparent,
                font={"color": font_color, "size": font_size-2, "family": font_family},
                orientation="h")
    )

fig.show()

# write and save
filename = "attestation_and_borrowing_compact"
fig.write_html(path_out_html + filename + lang + ".html")

### Number of borrowings

In [316]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")
# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)
# Load in dataset of names
df_names=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")
# Include ones to include
df_names = df_names.loc[df_names['include'] == 'yes']

In [317]:
# Plot base set of spices
df = df_names.copy()

# Selections
# df = df.loc[df['language'] == 'English']
# df = df.loc[df['language'] == 'Arabic']
df = df.loc[(df['language'] == 'English') | (df['language'] == 'Arabic') | (df['language'] == 'Chinese')]

df = df.loc[df['status'] == 'default']
# df = df.loc[(df['status'] == 'default') | (df['status'] == 'historic')]

# drop rows where date is missing
# df.drop(df[df['date'] == ''].index, inplace = True)
df = df.dropna(subset=['date'])

# Manipulate dates
df['year'] = df['date']
df['year'] = df['year'].str.replace(r'[ac\?]', '', regex=True)
df['year'] = df['year'].str.replace(r'.*-', '-', regex=True)
df['year'] = df['year'].str.replace(r'eOE', '800', regex=True)
df['year'] = df['year'].astype(int)

# Sort data
# df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["year"], inplace = True, ascending=True)

# Add mock column for size
df['size'] = 1
df.reset_index(inplace=True, drop=True)

print(df.shape[0])

# df.drop(df[df['id'] == 'black cardamom'].index, inplace = True)

df.head

df_en = df.loc[df['language'] == 'English']
df_ar = df.loc[df['language'] == 'Arabic']
df_zh = df.loc[df['language'] == 'Chinese']

72


In [318]:
df_en_borr = df_en[['id', 'term', 'borrowing']]
df_en_borr = df_en_borr.sort_values(['id'], key=lambda col: col.str.lower())
df_en_borr.reset_index(inplace=True, drop=True)
df_en_borr.columns = ['id', 'English', 'en']

df_ar_borr = df_ar[['id', 'term', 'borrowing']]
df_ar_borr = df_ar_borr.sort_values(['id'], key=lambda col: col.str.lower())
df_ar_borr.reset_index(inplace=True, drop=True)
df_ar_borr.columns = ['id_ar', 'Arabic', 'ar']

df_zh_borr = df_zh[['id', 'term', 'borrowing']]
df_zh_borr = df_zh_borr.sort_values(['id'], key=lambda col: col.str.lower())
df_zh_borr.reset_index(inplace=True, drop=True)
df_zh_borr.columns = ['id_zh', 'Chinese', 'zh']

# df_borr = pd.concat([df_en_borr, df_ar_borr, df_zh_borr])
# df_borr

frames = [df_en_borr, df_ar_borr, df_zh_borr]

df_borr = pd.concat(frames, axis=1)
df_borr.drop(columns=['id_ar', 'id_zh'], inplace=True)

df_borr['en'] = [re.sub("yes", "+", str(x)) for x in df_borr['en']]
df_borr['en'] = [re.sub("no", "-", str(x)) for x in df_borr['en']]
df_borr['en'] = [re.sub("maybe", "?", str(x)) for x in df_borr['en']]

df_borr['ar'] = [re.sub("yes", "+", str(x)) for x in df_borr['ar']]
df_borr['ar'] = [re.sub("no", "-", str(x)) for x in df_borr['ar']]
df_borr['ar'] = [re.sub("maybe", "?", str(x)) for x in df_borr['ar']]

df_borr['zh'] = [re.sub("yes", "+", str(x)) for x in df_borr['zh']]
df_borr['zh'] = [re.sub("no", "-", str(x)) for x in df_borr['zh']]
df_borr['zh'] = [re.sub("maybe", "?", str(x)) for x in df_borr['zh']]

# for index,row in df_borr
# df_borr['#'] 

# df_borr.to_csv(path_out+"borrowing.csv", index = None, header=True)
df_borr

Unnamed: 0,id,English,en,Arabic,ar,Chinese,zh
0,allspice,allspice,-,fulful ifranjī,-,duōxiāngguǒ,+
1,anise,anise,+,anīsūn,+,huíqín,-
2,asafoetida,asafoetida,+,ḥiltīt,+,āwèi,+
3,caraway,caraway,+,karāwiyā,+,gělǚzi,+
4,cardamom,cardamom,+,hāl,+,dòukòu,?
5,cassia,cassia,+,salīkha,-,ròuguì,-
6,chile,chili,+,fulful ḥārr,-,làjiāo,-
7,cinnamon,cinnamon,+,qirfa,-,xīlánròuguì,+
8,clove,clove,+,qaranful,+,dīngxiāng,-
9,coriander,coriander,+,kuzbara,+,yánsuī,-


In [319]:
series = df_en['borrowing'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
df_result.sort_values(['id'], inplace = True,)
df_result_en = df_result

series = df_ar['borrowing'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
df_result.sort_values(['id'], inplace = True,)
df_result_ar = df_result

series = df_zh['borrowing'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
df_result.loc[5:, 'id'] = 'other' # Replace everything with 'others' after the fifth!
df_result['id'] = pd.Categorical(df_result['id'], ["yes", "maybe", "no"]) 
df_result.sort_values(['id'], inplace = True, ascending=False)
df_result_zh = df_result

df_result_en

Unnamed: 0,id,total
0,yes,22
1,no,2


In [320]:
################################################################################
# Variables
font_size = 26
hole_size = 0.45

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.1, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(name='English',
                     labels=df_result_en.id,
                     values=df_result_en.total,
                    #  marker_colors=[p1, p3, p11],
                     ), 1, 1)

fig.add_trace(go.Pie(name='Arabic',
                     labels=df_result_ar.id, 
                     values=df_result_ar.total,
                    #  marker_colors=[p1, p11],
                     ), 1, 2)

fig.add_trace(go.Pie(name='Chinese',
                     labels=df_result_zh.id, 
                     values=df_result_zh.total,
                    #  marker_colors=[p1, p5, p11],
                     ), 1, 3)

fig.update_traces(
    direction ='clockwise',
    textinfo = "value",
    hoverinfo="label+percent+name",
    hole=hole_size, 
    sort=False,
    # marker_colors=px.colors.qualitative.Prism,
    # marker_colors=['crimson', 'pink', 'darkslategray'],
    )

fig.update_layout(
    template='yesnomaybe',
    width = 1000, height=225,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    # legend_traceorder="reversed",
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))

fig.update_layout(annotations=[dict(text='English', x=0.085, y=0.5, font_size=font_size, showarrow=False), 
                               dict(text='Arabic', x=0.5, y=0.5, font_size=font_size, showarrow=False),
                               dict(text='Chinese', x=0.9175, y=0.5, font_size=font_size, showarrow=False)
                               ])

fig.show()

# write and download
filename="borrowing_pie"
# fig.write_html(filename + ".html")

fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

## Donor languages

In [321]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in + "etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in + "etymologies.csv", index = None, header=True)

# Load in dataset
df_etymologies=pd.read_csv(path_in + 'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

df_etymologies = df_etymologies.loc[df_etymologies['include'] == 'yes'] # include ones to include

In [322]:
df = df_etymologies.copy()
# Drop rows if all columns are empty
df.dropna(how='all', inplace=True)

#Drop the final word stages.
df.drop(df[(df.language == 'English') & (df['#'] == 1)].index, inplace=True)
df.drop(df[(df.language == 'Arabic') & (df['#'] == 1)].index, inplace=True)
df.drop(df[(df.language == 'Mandarin Chinese') & (df['#'] == 1)].index, inplace=True)
df.drop(df[df.language == ''].index, inplace=True)

df.drop(df[df.plotskip == 'yes'].index, inplace=True)


print(df.shape[0])

# Count things
df_en = df[df.lang == 'en']
df_ar = df[df.lang == 'ar']
df_zh = df[df.lang == 'zh']

# df_zh

df['language'] = [re.sub("Classical Syriac", "Aramaic", str(x)) for x in df['language']]
df['language'] = [re.sub("Medieval Latin", "Latin", str(x)) for x in df['language']]
df['language'] = [re.sub("Pahlavi", "Persian", str(x)) for x in df['language']]

donors = df.language.value_counts()#[:25]

# donors.style
# df.language
display(donors)

160


Latin                       27
Persian                     18
Sanskrit                    16
Ancient Greek               15
Aramaic                     10
French                       7
Old French                   6
Arabic                       6
Akkadian                     6
unknown                      4
Proto-Iranian*               4
Dravidian*                   3
Middle Indo-Aryan*           3
Proto-Sino-Tibetan*          3
Spanish                      3
Semitic*                     3
Anglo-Norman                 2
Pali                         2
English                      2
Egyptian (Ancient)           2
Portuguese                   1
Tokharian B                  1
Romance*                     1
Old Chinese                  1
Sauraseni Prakrit            1
Uyghur                       1
Iranian*                     1
Hindi                        1
Sogdian                      1
Proto-Dravidian*             1
Slavic*                      1
Serbian-Croatian-Bosnian     1
Hungaria

In [323]:
type(donors)
d = donors.index.to_list()
type(d)
d

['Latin',
 'Persian',
 'Sanskrit',
 'Ancient Greek',
 'Aramaic',
 'French',
 'Old French',
 'Arabic',
 'Akkadian',
 'unknown',
 'Proto-Iranian*',
 'Dravidian*',
 'Middle Indo-Aryan*',
 'Proto-Sino-Tibetan*',
 'Spanish',
 'Semitic*',
 'Anglo-Norman',
 'Pali',
 'English',
 'Egyptian (Ancient)',
 'Portuguese',
 'Tokharian B',
 'Romance*',
 'Old Chinese',
 'Sauraseni Prakrit',
 'Uyghur',
 'Iranian*',
 'Hindi',
 'Sogdian',
 'Proto-Dravidian*',
 'Slavic*',
 'Serbian-Croatian-Bosnian',
 'Hungarian',
 'Classical Nahuatl',
 'Ancient Hebrew',
 'Japanese',
 'Hellenistic Greek',
 'West Germanic*']

In [324]:
# Plots to count things
series = df['language'].value_counts()
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['id', 'total']
# print(df_result)
# df_result.loc[10:, 'id'] = 'other' # Replace everything with 'others' after the fifth!

################################################################################

# Variables
font_size = 20
hole_size = 0.25

fig = go.Figure(data=[go.Pie(labels=df_result.id, 
                              values=df_result.total, 
                              direction ='clockwise', 
                              hole=hole_size, 
                              # marker_colors=px.colors.qualitative.Prism,
                              marker_colors=prism*4,
                              sort=False)])

fig.update_traces(textinfo='label')

fig.update_layout(
    width = 600, height=950,
    margin={"r":0,"t":0,"l":0,"b":0},
    font_family=font_family,
    font_color=font_color,
    font_size=font_size,
    showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size),
        # traceorder = 'normal', 
        orientation="v",
        ))

fig.show()
# write and download

# fig.write_html(filename + ".html")

filename="donors_pie"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

### Donor barplots

In [325]:
df = df_etymologies.copy()
# Drop rows if all columns are empty
df.dropna(how='all', inplace=True)

#Drop the final word stages.
df.drop(df[(df.language == 'English') & (df['#'] == 1)].index, inplace=True)
df.drop(df[(df.language == 'Arabic') & (df['#'] == 1)].index, inplace=True)
df.drop(df[(df.language == 'Mandarin Chinese') & (df['#'] == 1)].index, inplace=True)
df.drop(df[df.language == ''].index, inplace=True)
df.drop(df[df.language == 'Unknown'].index, inplace=True)

df.drop(df[df.plotskip == 'yes'].index, inplace=True)


print(df.shape[0])

df_en = df[df.lang == 'en']
df_ar = df[df.lang == 'ar']
df_zh = df[df.lang == 'zh']

df['language'] = [re.sub("Classical Syriac", "Aramaic", str(x)) for x in df['language']]
df['language'] = [re.sub("Medieval Latin", "Latin", str(x)) for x in df['language']]
df['language'] = [re.sub("Pahlavi", "Persian", str(x)) for x in df['language']]


160


In [326]:
top = 5

# Bar plots
series = df_en['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'English'
df_result.loc[top:, 'language'] = 'other'
df_result_en = df_result

series = df_ar['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'Arabic'
df_result.loc[top:, 'language'] = 'other'
df_result_ar = df_result

series = df_zh['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'Chinese'
df_result.loc[top:, 'language'] = 'other'
df
df_result_zh = df_result

df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
df_result_all = df_result_all.loc[df_result_all['language'] != 'other']
df=df_result_all

In [327]:
# Variables
font_size = 20

fig = px.bar(df, 
             x="lang", y="total", 
             color="language", 
             text="language",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             color_discrete_sequence = prism_extended,
             template="plotly_white",
            #  labels = dict(lang="Language",total="Times")
             )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 800, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        title = "donor language",
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-2),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="donor_bar"
# fig.write_html(filename + ".html")
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

#### Only source languages

In [328]:
df = df_etymologies.copy()
# Drop rows if all columns are empty
df.dropna(how='all', inplace=True)
df.reset_index(inplace=True, drop=True)

idx = df.index[df['#'] == 1].tolist()
idx.pop(0)
idx = [x-1 for x in idx]

df = df.iloc[idx]
df = df[df['#'] != 1]
df

# #Drop the final word stages.
# df.drop(df[(df.language == 'English') & (df['#'] == 1)].index, inplace=True)
# df.drop(df[(df.language == 'Arabic') & (df['#'] == 1)].index, inplace=True)
# df.drop(df[(df.language == 'Mandarin Chinese') & (df['#'] == 1)].index, inplace=True)

# df.drop(df[df.language == ''].index, inplace=True)
# df.drop(df[df.language == 'Unknown'].index, inplace=True)
# df.drop(df[df.boxskip == 'yes'].index, inplace=True)

print(df.shape[0])

# Count things
df_en = df[df.lang == 'en']
df_ar = df[df.lang == 'ar']
df_zh = df[df.lang == 'zh']

df['language'] = [re.sub("Classical Syriac", "Aramaic", str(x)) for x in df['language']]
df['language'] = [re.sub("Medieval Latin", "Latin", str(x)) for x in df['language']]
df['language'] = [re.sub("Pahlavi", "Persian", str(x)) for x in df['language']]


58


In [329]:
top = 5

# Bar plots
series = df_en['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'English'
df_result.loc[top:, 'language'] = 'other'
df_result_en = df_result

series = df_ar['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'Arabic'
df_result.loc[top:, 'language'] = 'other'
df_result_ar = df_result

series = df_zh['language'].value_counts()
series = series.sort_values(ascending=False)
df_result = pd.DataFrame(series)
df_result = df_result.reset_index()  
df_result.columns = ['language', 'total']
df_result['lang'] = 'Chinese'
df_result.loc[top:, 'language'] = 'other'
df
df_result_zh = df_result

df_result_all = pd.concat([df_result_en, df_result_ar, df_result_zh])
df_result_all = df_result_all.loc[df_result_all['language'] != 'other']
df=df_result_all

In [330]:
# Variables
font_size = 20

fig = px.bar(df, 
             x="lang", y="total", 
             color="language", 
             text="language",
            #  color_discrete_sequence = px.colors.qualitative.Prism,
             color_discrete_sequence = prism_extended,
             template="plotly_white",
            #  labels = dict(lang="Language",total="Times")
             )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_traces(textfont_size=font_size-2, textangle=0, textposition="inside")

fig.update_xaxes(visible=True, title=None)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
fig.update_xaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
fig.update_yaxes(tickfont = dict(family=font_family, color=font_color, size = font_size-2))
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(
    width = 800, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    # showlegend=False,
    legend=dict(
        bgcolor=half_transparent,
        x=0, y=1, xanchor="right", yanchor="top",
        font = dict(size=font_size-4),
        title = "source language",
        # traceorder = 'normal', 
        orientation="v"
        ))
    # legend=dict(
    #     bgcolor=half_transparent,
    #     x=0, y=0, xanchor="left", yanchor="top",
    #     font = dict(size=font_size-2),
    #     # traceorder = 'normal', 
    #     orientation="h"
    #     ))


# fig.update_layout(annotations=[dict(text='English', x=0.11, y=0.5, font_size=24, showarrow=False), 
#                                dict(text='Arabic', x=0.50, y=0.5, font_size=24, showarrow=False),
#                                dict(text='Chinese', x=0.89, y=0.5, font_size=24, showarrow=False)
#                                ])

fig.show()

# write and download
filename="source_bar"
# fig.write_html(filename + ".html")
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

# Maps

## Diffusion

In [331]:
# Variables (light) ########
transparent = 'rgba(255,255,255,0)'
half_transparent = 'rgba(255,255,255,0.5)'
quarter_transparent = 'rgba(255,255,255,0.25)'
tenth_transparent = 'rgba(255,255,255,0.1)'

marker_symbol= 'circle'
marker_size = 14
edge_size = 1
edge_color = 'white'
opacity = 0.75
line_width = 4
font_size = 14
font_family = "Serif"
font_color = "black"
water = '#ebedef'
grid_color = '#d6dbdf'
land = '#aeb6bf'
lines = '#85929e'
water = 'white'
grid_color = '#EDEDED'
land = 'gainsboro'
lines = 'gainsboro'
background_color = transparent
legend_background_color = tenth_transparent

### English

#### Preprocess datasets

In [332]:
# Load in datasets
languages=pd.read_csv(path_in+'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8")
read_file = pd.read_excel(path_in+"etymologies.xlsx")
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)
df=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# # Select items to put on map?
#   df = df[df.lang == lang] 
#   if len(df) > 0:
#     df_list.append(df)

# Select language
lang = "en"
# lang = "ar"
# lang = "zh"

if lang == 'en':
  language = 'English'
elif lang == 'ar':
  language = 'Arabic'
elif lang == 'zh':
  language = 'Chinese'

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df, df[df.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df[df.plotskip != "yes"] # droip the ones marked with skip
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
# Select English/Arabic/Chinese data only
  df = df[df.lang == lang] 
  if len(df) > 0:
    df_list.append(df)

# print(df)

# # Select only desired keywords
# key = "hing"
# key2 = "anjudan"
# key3 = "awei"
# df = df.loc[(df['id']==key) | (df['id']==key2) | (df['id']==key3)]

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, " items in total")
print(ids)

#Create a defaultdict of spice word etymologies
etymology=defaultdict(list)
for i in range(length):
  etymology[ids[i]]=df_list[i]

# Iterate etymology data and merge with language data for plotting maps, write it all in a callable dictionary.
names = {}

for i,j in etymology.items():
  df = etymology[i]
  # df.dropna(axis=1, inplace=True)
  j['language'] = [re.sub("\*", "", str(x)) for x in j['language']]
  df = pd.merge(df, languages, on=['language'])
  names[str(i)] = df

# # Save etymology datasets separately (for later use in R?)
# path="/content/drive/MyDrive/Thesis/Data/Etymology/"
# for i,j in names.items():
#   j.to_csv(path+str(i)+'.csv')

# Pop some not wanted items
# names.pop('long pepper')

# for e in ['amomum', 'paprika', 'hing', 'fagara', 'badian']: 
#   names.pop(e)
# print(len(names))

# pop single-stage words
one_stage_words = []
for i,j in names.items():
  if j.shape[0]==1:
    print("yes")
    one_stage_words.append(i)

for e in one_stage_words: 
  names.pop(e)
print(len(names))

c = 0
colors = prism*3

# Create text for the marker labels from the data
for i,j in names.items():
  j.fillna('', inplace=True) 
  j['doubt'] = [re.sub("yes", "?", str(x)) for x in j['doubt']]
  j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
  j['text'] = j['doubt'].astype(str) + " " + j['script'].astype(str) + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
  j['text'] = [re.sub('\\w+{', "", str(x)) for x in j['text']]
  j['text'] = [re.sub(r'\\', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('[{}]', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('^\s?<br>', "", str(x)) for x in j['text']]
  j['plot color'] = colors[c]
  c = c + 1

30  items in total
['allspice', 'pimento', 'anise', 'asafoetida', 'hing', 'caraway', 'cardamom', 'amomum', 'cassia', 'cinnamon', 'chile', 'paprika', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'long pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'Sichuan pepper', 'fagara', 'star anise', 'badian', 'turmeric', 'vanilla']
yes
yes
yes
yes
26


#### Plot

In [333]:
#### Document
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',  
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(document_size)

fig.show()

# write
filename = "diffusion_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(title_text="Etymological stages of a few spice names in " + language)
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "diffusion_"
fig.write_html(path_out_html + filename + lang + ".html")
fig.write_json(path_out_json + filename + lang + ".json", validate=True, pretty=True)

### Arabic

#### Preprocess datasets

In [334]:
# Load in datasets
languages=pd.read_csv(path_in+'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8")
read_file = pd.read_excel(path_in+"etymologies.xlsx")
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)
df=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# # Select items to put on map?
#   df = df[df.lang == lang] 
#   if len(df) > 0:
#     df_list.append(df)

# Select language
# lang = "en"
lang = "ar"
# lang = "zh"

if lang == 'en':
  language = 'English'
elif lang == 'ar':
  language = 'Arabic'
elif lang == 'zh':
  language = 'Chinese'

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df, df[df.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df[df.plotskip != "yes"] # droip the ones marked with skip
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
# Select English/Arabic/Chinese data only
  df = df[df.lang == lang] 
  if len(df) > 0:
    df_list.append(df)

# print(df)

# # Select only desired keywords
# key = "hing"
# key2 = "anjudan"
# key3 = "awei"
# df = df.loc[(df['id']==key) | (df['id']==key2) | (df['id']==key3)]

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, " items in total")
print(ids)

#Create a defaultdict of spice word etymologies
etymology=defaultdict(list)
for i in range(length):
  etymology[ids[i]]=df_list[i]

# Iterate etymology data and merge with language data for plotting maps, write it all in a callable dictionary.
names = {}

for i,j in etymology.items():
  df = etymology[i]
  # df.dropna(axis=1, inplace=True)
  j['language'] = [re.sub("\*", "", str(x)) for x in j['language']]
  df = pd.merge(df, languages, on=['language'])
  names[str(i)] = df

# # Save etymology datasets separately (for later use in R?)
# path="/content/drive/MyDrive/Thesis/Data/Etymology/"
# for i,j in names.items():
#   j.to_csv(path+str(i)+'.csv')

# Pop some not wanted items
# names.pop('long pepper')

# for e in ['anjudan', 'qaqulla', 'qirfa']: # DIFFERENT
#   names.pop(e)
# print(len(names))

# pop single-stage words
one_stage_words = []
for i,j in names.items():
  if j.shape[0]==1:
    print("yes")
    one_stage_words.append(i)

for e in one_stage_words: 
  names.pop(e)
print(len(names))

c = 0
colors = prism*3

# Create text for the marker labels from the data
for i,j in names.items():
  j.fillna('', inplace=True) 
  j['doubt'] = [re.sub("yes", "?", str(x)) for x in j['doubt']]
  j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
  j['text'] = j['doubt'].astype(str) + " " + j['script'].astype(str) + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
  j['text'] = [re.sub('\\w+{', "", str(x)) for x in j['text']]
  j['text'] = [re.sub(r'\\', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('[{}]', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('^\s?<br>', "", str(x)) for x in j['text']]
  j['plot color'] = colors[c]
  c = c + 1

27  items in total
['fulful ifranji', 'anisun', 'hiltit', 'anjudan', 'karawiya', 'hal', 'qaqulla', 'salikha', 'darsini', 'qirfa', 'fulful harr', 'qaranful', 'kuzbura', 'kammun', 'shibitt', 'shamar', 'hulba', 'zanjabil', 'darfilfil', 'basbas', 'jawz al-tib', 'fulful', 'zafaran', 'fulful sitshuwan', 'yansun najmi', 'kurkum', 'faniliya']
yes
yes
yes
yes
yes
yes
yes
20


#### Plot

In [335]:
#### Document
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',  
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
filename = "diffusion_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text="Etymological stages of a few spice names in " + language)
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "diffusion_"
fig.write_html(path_out_html + filename + lang + ".html")
fig.write_json(path_out_json + filename + lang + ".json")

### Chinese

#### Preprocess datasets

In [336]:
# Load in datasets
languages=pd.read_csv(path_in+'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8")
read_file = pd.read_excel(path_in+"etymologies.xlsx")
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)
df=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# # Select items to put on map?
#   df = df[df.lang == lang] 
#   if len(df) > 0:
#     df_list.append(df)

# Select language
# lang = "en"
# lang = "ar"
lang = "zh"

if lang == 'en':
  language = 'English'
elif lang == 'ar':
  language = 'Arabic'
elif lang == 'zh':
  language = 'Chinese'

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df, df[df.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df[df.plotskip != "yes"] # droip the ones marked with skip
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
# Select English/Arabic/Chinese data only
  df = df[df.lang == lang] 
  if len(df) > 0:
    df_list.append(df)

# print(df)

# # Select only desired keywords
# key = "hing"
# key2 = "anjudan"
# key3 = "awei"
# df = df.loc[(df['id']==key) | (df['id']==key2) | (df['id']==key3)]

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, " items in total")
print(ids)

#Create a defaultdict of spice word etymologies
etymology=defaultdict(list)
for i in range(length):
  etymology[ids[i]]=df_list[i]

# Iterate etymology data and merge with language data for plotting maps, write it all in a callable dictionary.
names = {}

for i,j in etymology.items():
  df = etymology[i]
  # df.dropna(axis=1, inplace=True)
  j['language'] = [re.sub("\*", "", str(x)) for x in j['language']]
  df = pd.merge(df, languages, on=['language'])
  names[str(i)] = df

# # Save etymology datasets separately (for later use in R?)
# path="/content/drive/MyDrive/Thesis/Data/Etymology/"
# for i,j in names.items():
#   j.to_csv(path+str(i)+'.csv')

# Pop some not wanted items
# names.pop('long pepper')

# for e in ['xingqu', 'husui']: # DIFFERENT
#   names.pop(e)
# print(len(names))

# pop single-stage words
one_stage_words = []
for i,j in names.items():
  if j.shape[0]==1:
    print("yes")
    one_stage_words.append(i)

for e in one_stage_words: 
  names.pop(e)
print(len(names))

c = 0
colors = prism*3

# Create text for the marker labels from the data
for i,j in names.items():
  j.fillna('', inplace=True) 
  j['doubt'] = [re.sub("yes", "?", str(x)) for x in j['doubt']]
  j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
  j['text'] = j['doubt'].astype(str) + " " + j['script'].astype(str) + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
  j['text'] = [re.sub('\\w+{', "", str(x)) for x in j['text']]
  j['text'] = [re.sub(r'\\', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('[{}]', "", str(x)) for x in j['text']]
  j['text'] = [re.sub('^\s?<br>', "", str(x)) for x in j['text']]
  j['plot color'] = colors[c]
  c = c + 1

25  items in total
['duoxiangguo', 'huiqin', 'awei', 'xingqu', 'geluzi', 'doukou', 'rougui', 'lajiao', 'dingxiang', 'yansui', 'husui', 'ziran', 'shiluo', 'huixiang', 'huluba', 'jiang', 'biba', 'roudoukoupi', 'roudoukou', 'hujiao', 'fanhonghua', 'huajiao', 'bajiaohuixiang', 'jianghuang', 'xiangcao']
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
11


#### Plot

In [337]:
#### Document
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',  
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
filename = "diffusion_"
fig.write_image(path_out_pdf + filename + lang + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = go.Figure()

for i,j in names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['text'],
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+lines',
      opacity=opacity,
      line_color = str(j['plot color'].iloc[0]), line_width=line_width, line_dash='solid', 
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['plot color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text="Etymological stages of a few spice names in " + language)
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "diffusion_"
fig.write_html(path_out_html + filename + lang + ".html")
fig.write_json(path_out_json + filename + lang + ".json")

## Diffusion of Names of Specific Items

In [338]:
# Load in datasets
languages=pd.read_csv(path_in+'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8")
read_file = pd.read_excel(path_in+"etymologies.xlsx")
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)
df=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df, df[df.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df[df.plotskip != "yes"] # droip the ones marked with skip
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  if len(df) > 0:
    df_list.append(df)

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, " items in total")
print(ids)

#Create a defaultdict of spiceword etymologies
etymology=defaultdict(list)
for i in range(length):
  etymology[ids[i]]=df_list[i]

# Iterate etymology data and merge with language data for plotting maps,
# write it all in a callable dictionary.
names = {}

for i,j in etymology.items():
  df = etymology[i]
  # df.dropna(axis=1, inplace=True)
  df = pd.merge(df, languages, on=['language'])
  names[str(i)] = df

84  items in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']


In [339]:
# Variables (light) ########
marker_size = 18
edge_size = 1
line_width = 4
font_size = 18

In [340]:
# Add spices to plot
key="allspice"
keys = ['allspice', 'pimento', 'fulful ifranji', 'duoxiangguo']
colors = [p2,p3,p4,p6]

# key="asafoetida"
# keys = ['asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu']
# colors = [p2,p3,p4,p5,p6,p7]

# key="cardamom"
# keys = ['cardamom', 'amomum', 'hal', 'qaqulla', 'doukou']
# colors = prism

# key="pepper"
# keys = ['pepper', 'long_pepper', 'fulful']#, 'darfilfil', 'hujiao', 'biba']
# colors = prism

# key = "turmeric"
# keys = ['turmeric', 'kurkum', 'jianghuang']
# colors = [p2,p4,p6]

################################################################################

plot_names = {k: names[k] for k in keys}

# Create text for the marker labels from the data
c = 0
for i,j in plot_names.items():
  j.fillna('', inplace=True)
  j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
  j['text'] = j['script'] + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
  j['lang color'] = colors[c]
  c = c + 1

################################################################################
  

In [341]:
#### Document
fig = go.Figure()

for i,j in plot_names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+text', 
      line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
      opacity=opacity,
      marker = dict(symbol = marker_symbol, size = marker_size, opacity = opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
filename = "diffusion_" + key
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = go.Figure()

for i,j in plot_names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
      textfont={"color": "black", "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+text', 
      line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
      opacity=opacity,
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text = "Etymological stages of " + key)
fig.update_layout(template = cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "diffusion_" + key
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json")

## Diffusion of a Single Name of One Specific Item

In [342]:
# Load in datasets
languages=pd.read_csv(path_in+'languages/languages.csv', header =[0], delimiter=',', encoding="utf-8")
read_file = pd.read_excel(path_in+"etymologies.xlsx")
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)
df=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df, df[df.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df[df.plotskip != "yes"] # droip the ones marked with skip
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  if len(df) > 0:
    df_list.append(df)

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, " items in total")
print(ids)

#Create a defaultdict of spiceword etymologies
etymology=defaultdict(list)
for i in range(length):
  etymology[ids[i]]=df_list[i]

# Iterate etymology data and merge with language data for plotting maps,
# write it all in a callable dictionary.
names = {}

for i,j in etymology.items():
  df = etymology[i]
  # df.dropna(axis=1, inplace=True)
  df = pd.merge(df, languages, on=['language'])
  names[str(i)] = df

84  items in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']


In [343]:
# Variables (light) ########
marker_symbol = 'diamond'
marker_size = 20
edge_size = 1
line_width = 4
font_size = 20
opacity = 0.8

In [344]:
# Add spices to plot
key="ginger"
keys = [key]
colors = [PolyU]

################################################################################

plot_names = {k: names[k] for k in keys}

# Create text for the marker labels from the data
c = 0
for i,j in plot_names.items():
  j.fillna('', inplace=True)
  j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
  j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
  j['text'] = j['script'] + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
  j['lang color'] = colors[c]
  c = c + 1

################################################################################

#### Document
fig = go.Figure()

for i,j in plot_names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+text', 
      line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
      opacity=opacity,
      marker = dict(symbol = marker_symbol, size = marker_size, opacity = opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
key = re.sub("_", " ", key)
filename = "diffusion_name_" + key
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = go.Figure()

for i,j in plot_names.items():
  fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
      hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
      textfont={"color": font_color, "family": font_family, "size": font_size}, 
      textposition="middle right", mode = 'markers+text', 
      line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
      opacity=opacity,
      marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text = "Etymological stages of " + key)
fig.update_layout(template = cr)
fig.add_layout_image(logo)

fig.show()

# write
key = re.sub("_", " ", key)
filename = "diffusion_name_" + key
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json")

In [345]:
def trail(key):
  keys = [key]
  colors = [PolyU]

  ################################################################################

  plot_names = {k: names[k] for k in keys}

  # Create text for the marker labels from the data
  c = 0
  for i,j in plot_names.items():
    j.fillna('', inplace=True)
    j['script'] = [re.sub(r'\\\w\w\w?\{', "", str(x)) for x in j['script']]
    j['script'] = [re.sub(r'\{', "", str(x)) for x in j['script']]
    j['script'] = [re.sub(r'\}', "", str(x)) for x in j['script']]
    j['text'] = j['script'] + '<br>' + j['term'].astype(str) + '<br>' + j['language'].astype(str) + '<br>' + j['family'].astype(str)
    j['lang color'] = colors[c]
    c = c + 1

  ################################################################################

  #### Document
  fig = go.Figure()

  for i,j in plot_names.items():
    fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
        hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
        textfont={"color": font_color, "family": font_family, "size": font_size}, 
        textposition="middle right", mode = 'markers+text', 
        line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
        opacity=opacity,
        marker = dict(symbol = marker_symbol, size = marker_size, opacity = opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

  fig.update_layout(ortho_layout)
  fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
  fig.update_layout(document_size)

  # fig.show()

  # write
  key = re.sub("_", " ", key)
  filename = "diffusion_name_" + key
  fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

  ################################################################################

  #### HTML
  fig = go.Figure()

  for i,j in plot_names.items():
    fig.add_trace(go.Scattergeo(name = j['id'].iloc[0], lon = j['lon'], lat = j['lat'],
        hoverinfo = 'text', hovertext = j['text'], text = j['term'], 
        textfont={"color": font_color, "family": font_family, "size": font_size}, 
        textposition="middle right", mode = 'markers+text', 
        line_color = str(j['lang color'].iloc[0]), line_width=line_width, line_dash='solid', 
        opacity=opacity,
        marker = dict(symbol=marker_symbol, size = marker_size, opacity=opacity, color = str(j['lang color'].iloc[0]), line = dict(width = edge_size, color = edge_color))))

  fig.update_layout(ortho_layout)
  fig.update_layout(geo=dict(projection_rotation = {'lat': 30, 'lon': 60, 'roll': 0}))
  fig.update_layout(title_text = "Etymological stages of " + key)
  fig.update_layout(template = cr)
  fig.add_layout_image(logo)

  # fig.show()

  # write
  key = re.sub("_", " ", key)
  filename = "diffusion_name_" + key
  fig.write_html(path_out_html + filename + ".html")
  fig.write_json(path_out_json + filename + ".json")
  return

In [346]:
for key in list_of_spices:
    trail(key)

## Distribution of Every Spice

In [347]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

# df_spices = df_spices.loc[df_spices['include'] == 'in'] # include ones to include

df_spices = df_spices.loc[df_spices['include'] == 'in'] # include ones to include

print(df_spices.shape)

df = df_spices.copy()

(24, 95)


In [348]:
# Variables (light) ########
marker_symbol = 'circle'
marker_size = 14
edge_size = 1
line_width = 4
font_size = 14

In [349]:
# Set size
df['size'] = 1
size_max = 10 # this counts

########################################################################################

#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
    # labels={"group": "category"}
    )

fig.update_traces(
    textposition = 'middle right',
    mode = "markers+text",
    textfont = dict(size=font_size, color=font_color, family=font_family),
    marker = dict(
        # symbol = marker_symbol,
        # size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size)
        )
    )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
filename = "distribution"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers+text",
                  textposition='middle right',
                  textfont={"size": font_size, "color": font_color, "family": font_family},
                  marker=dict(symbol=marker_symbol, 
                              # size=marker_size, 
                              line=dict(color=edge_color, width=edge_size)))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(title_text = "Etymological stages of " + key)
fig.update_layout(template = cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "distribution"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json")

In [350]:
# add a value to make small points bigger
df['size'] = df['spreadability']+2
size_max = 25

########################################################################################

#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
    # labels={"group": "category"}
    )

fig.update_traces(
    textposition = 'middle right',
    mode = "markers+text",
    textfont = dict(size=font_size, color=font_color, family=font_family),
    marker = dict(
        # symbol = marker_symbol,
        # size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size)
        )
    )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write
filename = "distribution_with_spreadability"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

################################################################################

#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers+text",
                  textposition='middle right',
                  textfont={"size": font_size, "color": font_color, "family": font_family},
                  marker=dict(symbol=marker_symbol, 
                              # size=marker_size, 
                              line=dict(color=edge_color, width=edge_size)))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(title_text = "Etymological stages of " + key)
fig.update_layout(template = cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "distribution_with_spreadability"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json")

A try on Natural Earth projection:

In [351]:
# # add a value to make small points bigger
# df['size'] = 5
# size_max = 10

# ########################################################################################

# # Natural Earth layout
# ne_layout = dict(
#     paper_bgcolor=background_color,
#     plot_bgcolor=background_color,
#     geo = dict(
#         resolution=110, #50 is large or 110 small
#         scope='world',
#         projection_type = 'natural earth',
#         projection_scale = 1,
#         # projection_rotation = {'lat': 15, 'lon': 30, 'roll': 0},
#         bgcolor=background_color,
#         showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
#         showcountries=False, countrywidth = 1, countrycolor = lines, 
#         showframe=True, framewidth = 1, framecolor = lines, 
#         showlakes=True, lakecolor = water,
#         showland=True, landcolor = land, 
#         showocean=True, oceancolor = water,
#         showrivers=True, riverwidth = 1, rivercolor = water,
#         showsubunits=False, subunitwidth = 1, subunitcolor = lines, 
#         # lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
#         # lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)
#         ),
#     showlegend = True,
#     legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=half_transparent,  
#                 font={"color": font_color, "size": font_size, "family": font_family}, traceorder = 'normal', orientation="v"),
#     title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
#                font={"color": font_color, "size": font_size+6, "family": font_family}),
#     # margin={"r":0,"t":0,"l":0,"b":0},
#     hoverlabel=dict(#bgcolor="white", 
#                     font_size=font_size, 
#                     font_family=font_family),
#     )

# ########################################################################################

# #### Document
# fig = px.scatter_geo(df,
#     lat='lat', 
#     lon='lon',
#     text='id',
#     color='family',
#     color_discrete_sequence=prism_extended,
#     size_max = size_max,
#     size = 'size',
#     opacity = opacity,
#     hover_name='id',
#     hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
#     # labels={"group": "category"}
#     )

# fig.update_traces(
#     textposition = 'middle right',
#     mode = "markers+text",
#     textfont = dict(size=font_size, color=font_color, family=font_family),
#     marker = dict(
#         # symbol = marker_symbol,
#         # size = marker_size,
#         line = dict(
#             color=edge_color,
#             width=edge_size)
#         )
#     )

# fig.update_layout(ne_layout)
# # fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# # fig.update_layout(document_size)

# fig.show()

# # write
# filename = "distribution_by_spreadability" + "_ne"
# fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

# ################################################################################

# #### HTML
# fig = px.scatter_geo(df,
#     lat='lat', 
#     lon='lon',
#     text='id',
#     color='family',
#     color_discrete_sequence=prism_extended,
#     size_max = size_max,
#     size = 'size',
#     opacity = opacity,
#     hover_name='id',
#     hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'Chinese':True, 'lon':False, 'lat':False},
#     # labels={"group": "category"}
#     )

# fig.update_traces(mode = "markers+text",
#                   textposition='middle right',
#                   textfont={"size": font_size, "color": font_color, "family": font_family},
#                   marker=dict(symbol=marker_symbol, 
#                               # size=marker_size, 
#                               line=dict(color=edge_color, width=edge_size)))

# fig.update_layout(ne_layout)
# # fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# # fig.update_layout(title_text = "Etymological stages of " + key)
# fig.update_layout(template = cr)
# fig.add_layout_image(logo)

# fig.show()

# # write
# filename = "distribution_by_spreadability" + "_ne"
# fig.write_html(path_out_html + filename + ".html")
# fig.write_json(path_out_json + filename + ".json")

### Specific spices, multilingual (Wiktionary et al.)

#### Plotly Express tries, with ideas

In [352]:
# fig = px.scatter_geo(df,
#     lat='lat', 
#     lon='lon',
#     text='item',
#     # symbol='group',
#     # symbol_sequence = ['diamond-open', 'triangle-up', 'triangle-down', 'triangle-left', 'triangle-right', 'triangle-ne', 'triangle-se'],
#     # size='pop' # Can set size by some value
#     # size_max = 20,
#     # animation_frame = 'date', # !TIMELAPSE!
#     # animation_group = '', # ?
#     # color='group',
#     # color_discrete_sequence=px.colors.sequential.Viridis,
#     opacity = 0.75,
#     hover_name='item',
#     hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
#     # labels={"group": "category"}
#     )

# # Without text
# fig.update_traces(mode = "markers+text")

# fig.show()
# fig.write_html(key + "_annotated.html")

#### Plotly Go tries


In [353]:
# #Different plotting
# # https://plotly.com/python/builtin-colorscales/
# prism = [p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11]
# # colors = ['rgb(95, 70, 144)', 'rgb(29, 105, 150)', 'rgb(56, 166, 165)', 'rgb(15, 133, 84)', 'rgb(115, 175, 72)', 'rgb(237, 173, 8)', 'rgb(225, 124, 5)']

# path="/content/drive/MyDrive/Thesis/Data/Multilingual/"
# df=pd.read_csv(path+'cinnamon.csv', header =[0], delimiter=',', encoding="utf-8")

# # This is for coloring by group or family:
# groups = np.unique(df['group'].values)  #set the array of unique groups in your column, df['group']
# d = dict(zip(groups, np.arange(len(groups)))) # a dict that associates a numerical value to each group
# d = {'canela': 0,
#      'kinnamon': 1,
#      'korica': 2,
#      'qirfa': 3,
#      'darchin': 4,
#      'gui': 5,
#      'other': 6}

# fig = go.Figure()

# fig.add_traces(data=go.Scattergeo(
#     name='cinnamon',
#     lon = df['lon'],
#     lat = df['lat'],
#     text = df['item'],
#     textposition = 'top right',
#     textfont={"color": "black", "size": 12, "family": font_family},
#     mode = 'markers',
#     # marker_color = [d[s] for s in df['group']], #coloring markers by group (not interactive, no legend)
#     marker_color = 'darkorange',
#     marker=dict(symbol='diamond', color=PolyU, colorscale=colors, size=12, opacity=0.75, line=dict(color='white', width=1)),
#     opacity=1, #for marker & text
#     ))

# # fig.update_traces(customdata=df.text) # ONLY WORKS HERE IN PLOTLY GO HAVE TO USE THIS BUT DISTUINGUISH BETWEEN THE TWO DF
# # fig.update_traces(hovertemplate='%{customdata}') #<extra></extra>')

# # Pepper -----------------------------------------------------------------------
# df=pd.read_csv(path+'pepper.csv', header =[0], delimiter=',', encoding="utf-8")

# groups = np.unique(df['group'].values)  #set the array of unique groups in your column, df['group']
# d = dict(zip(groups, np.arange(len(groups)))) # a dict that associates a numerical value to each group
# d = {'pippali': 0,
#      'pigment': 1,
#      'marica': 2,
#      'hujiao': 3,
#      'other': 4}

# fig.add_traces(data=go.Scattergeo(
#     name='pepper',
#     lon = df['lon'],
#     lat = df['lat'],
#     text = df['item'],
#     textposition = 'top right',
#     textfont={"color": "black", "size": 12, "family": font_family},
#     mode = 'markers',
#     # marker_color = [d[s] for s in df['group']], #coloring markers by group (not interactive, no legend)
#     marker_color = 'lightblue',
#     marker=dict(symbol='diamond', color=PolyU, colorscale=colors, size=12, opacity=0.75, line=dict(color='white', width=1)),
#     opacity=1, #for marker & text
#     ))

# # fig.update_traces(customdata=df.text) # ONLY WORKS HERE IN PLOTLY GO
# # fig.update_traces(hovertemplate='%{customdata}') #<extra></extra>')

# fig.update_layout(
#     geo = dict(
#         resolution=110, #50 is large or 110 small
#         scope='world',
#         projection_type = 'orthographic',
#         # projection_type = 'natural earth',
#         projection_scale = 0.75,
#         projection_rotation = {'lat': 12, 'lon': 60, 'roll': 0},
#         center = {'lat':12,'lon':60},
#         bgcolor='white',
#         showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
#         showcountries=False, countrywidth = 1, countrycolor = lines, 
#         showframe=True, framewidth = 1, framecolor = lines, 
#         showlakes=True, lakecolor = water,
#         showland=True, landcolor = land, 
#         showocean=True, oceancolor = water,
#         showrivers=True, riverwidth = 1, rivercolor = water,
#         showsubunits=False, subunitwidth = 1, subunitcolor = lines, 
#         lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10),
#         lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10)),
#     width = 1280, height=720,  
#     margin={"r":0,"t":0,"l":0,"b":0},
#     showlegend = True,
#     legend=dict(y=0.9, x=0.05, xanchor="left", yanchor="top",   
#                 font={"color": "black", "size": 16, "family": font_family}, traceorder = 'normal', orientation="v",),
#     title=dict(y=0.98, x=0.05, xanchor='left', yanchor='top', text='Cinnamon',#"Various groups for the names of cinnamon in different languages",   
#                font={"color": "black", "size": 20, "family": font_family}),
#     hovermode="closest", #default
#     hoverlabel=dict(
#         # bgcolor="white", 
#         font_size=12, 
#         font_family=font_family),
#     )

# fig.show()
# fig.write_html("cinnamon_go.html")

In [354]:
# import plotly.graph_objects as go
# import pandas as pd

# fig = go.Figure(data=go.Scattergeo(
#     lat = df['lat'],
#     lon = df['lon'],
#     text = df['term'].astype(str),
#     marker = dict(
#         color = [d[s] for s in df['group']],
#         colorscale = "Viridis",
#         reversescale = True,
#         opacity = 0.75,
#         size = 12,
#         colorbar = dict(
#             titleside = "right",
#             outlinecolor = "rgba(68, 68, 68, 0)",
#             ticks = "outside",
#             showticksuffix = "last",
#             dtick = 1
#         )
#     )
# ))

# fig.update_layout(
#         title = 'Title',
#         geo = dict(
#             scope='world',
#             projection_type='orthographic',
#             showland = True,
#             landcolor = "gainsboro",
#             subunitcolor = "gainsboro",
#             countrycolor = "white",
#             countrywidth = 0.5,
#             subunitwidth = 0.5,
#             showcountries = True,
#         ),
#     )
# fig.show()

### One by one

#### Tea

In [355]:
key = 'tea'

# Load in dataset
df = pd.read_csv(path_in + "languages/wals.csv", header =[0], delimiter=',', encoding="utf-8")
print(df.shape)

# df.drop(columns=['link'], inplace=True)

df = df[df['tea'].notna()]
print(df.shape)

df['tea'] = [re.sub('2 Words derived from Min Nan Chinese te', "te", str(x)) for x in df['tea']]
df['tea'] = [re.sub('1 Words derived from Sinitic cha', "cha", str(x)) for x in df['tea']]
df['tea'] = [re.sub('3 Others', "other", str(x)) for x in df['tea']]

# save
df.to_csv(path_in + "multilingual/" + key + '.csv')

################################################################################

# Variables
marker_size = 12
edge_size = 1
line_width = 4
font_size = 12

################################################################################
#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='name',
    color='tea',
    color_discrete_sequence=[MidnightBlue, PolyU, 'gray'], #https://www.colorhexa.com/1034a6
    opacity = opacity, #???
    hover_name='name',
    # hover_data={'term':True, 'language':True, 'family':True, 'name':False, 'lon':False, 'lat':False, 'group':False}
    labels={"tea": "group"},
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                  # customdata=df.text, # WRONG!
                  # hovertemplate='<b>%{text}</b><br>' + '%{customdata}', #<extra></extra>',
                  hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write 
filename = "distribution_"
fig.write_image(path_out_pdf + filename + key + ".pdf", engine="kaleido")

################################################################################
#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='name',
    color='tea',
    color_discrete_sequence=[MidnightBlue, PolyU, 'gray'], #https://www.colorhexa.com/1034a6
    opacity = opacity, #???
    hover_name='name',
    # hover_data={'term':True, 'language':True, 'family':True, 'name':False, 'lon':False, 'lat':False, 'group':False}
    labels={"tea": "group"},
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                  # customdata=df.text, # WRONG!
                  # hovertemplate='<b>%{text}</b><br>' + '%{customdata}', #<extra></extra>',
                  hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text="Distribution of words for " + key + " in a few languages")
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.add_annotation(x=0, y=0.99,
                   xanchor="left", yanchor="top", align="left",
                   text="Data:<br>Östen Dahl (2013) Tea.<br>In: Dryer, Matthew S. & Haspelmath, Martin (eds.)<br>The World Atlas of Language Structures Online.<br>Leipzig: Max Planck Institute for Evolutionary Anthropology.<br>(Available online at http://wals.info/chapter/138,<br>Accessed on 2022-07-28.)",
                   font={"color": font_color, "size": font_size-2, "family": font_family},
                   showarrow=False)

fig.show()

# write
filename = "distribution_"
fig.write_html(path_out_html + filename + key + ".html")
fig.write_json(path_out_json + filename + key + ".json")

# fig.update_layout(legend=dict(y=0, x=0, xanchor="left", yanchor="bottom"))

(2712, 13)
(230, 13)


#### Cinnamon

In [356]:
key = 'cinnamon'

df=pd.read_csv(path_in + "multilingual/" + key+'.csv', header =[0], delimiter=',', encoding="utf-8")

################################################################################

# Variables
marker_size = 12
edge_size = 1
line_width = 4
font_size = 12

################################################################################
#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='group',
    color_discrete_sequence=[p1,p2,p3,p4,p5,p6,p11],
    opacity = opacity,
    hover_name='item',
    hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                #   hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write 
filename = "distribution_"
fig.write_image(path_out_pdf + filename + key + ".pdf", engine="kaleido")

################################################################################
#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='group',
    color_discrete_sequence=[p1,p2,p3,p4,p5,p6,p11],
    opacity = opacity,
    hover_name='item',
    hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                #   hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text="Distribution of words for " + key + " in a few languages")
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "distribution_"
fig.write_html(path_out_html + filename + key + ".html")
fig.write_json(path_out_json + filename + key + ".json")

#### Pepper

In [357]:
key = 'pepper'

df=pd.read_csv(path_in + "multilingual/" + key+'.csv', header =[0], delimiter=',', encoding="utf-8")

# # df = df.fillna('') # Operative only

################################################################################

# Variables
marker_size = 12
edge_size = 1
line_width = 4
font_size = 12

################################################################################
#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='group',
    color_discrete_sequence=[p1,p2,p4,p6,p11],
    opacity = opacity,
    hover_name='item',
    hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                #   hovertemplate=None # !!! TRY TO MAKE ONE
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(document_size)

fig.show()

# write 
filename = "distribution_"
fig.write_image(path_out_pdf + filename + key + ".pdf", engine="kaleido")

################################################################################
#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='group',
    color_discrete_sequence=[p1,p2,p4,p6,p11],
    opacity = opacity,
    hover_name='item',
    hover_data={'term':True, 'language':True, 'family':True, 'item':False, 'lon':False, 'lat':False, 'group':False},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers",
                  textposition='middle right',
                  textfont={"color": font_color, "size": font_size, "family": font_family},
                  marker=dict(symbol=marker_symbol, size=marker_size, opacity=opacity, line=dict(color=edge_color, width=1)),
                  # hovertemplate=None
                  )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 15, 'lon': 60, 'roll': 0}))
fig.update_layout(title_text="Distribution of words for " + key + " in a few languages")
fig.update_layout(template=cr)
fig.add_layout_image(logo)

fig.show()

# write
filename = "distribution_"
fig.write_html(path_out_html + filename + key + ".html")
fig.write_json(path_out_json + filename + key + ".json")


# Timelines

In [358]:
df=pd.read_csv(path_in+'oed/oed.csv', header =[0], delimiter=',', encoding="utf-8")
df = df.loc[df['year'] != '?'] # include ones to include
df.to_csv(path_in+"oed/oed_working.csv", index = None, header=True)
df=pd.read_csv(path_in+'oed/oed_working.csv', header =[0], delimiter=',', encoding="utf-8")

# df = df.loc[df['level'] == 'main'] # include ones to include
# df = df.loc[df['id'] == 'pepper'] # include ones to include
# df = df.loc[(df['id'] == 'saffron') | (df['id'] == 'pepper')] # include ones to include
print(df['id'].value_counts())
df

# Add dummy column for size
df['size'] = 1

# # # Sorting
# df.sort_values(['level'], inplace = True, key=lambda col: col.str.lower()) # sort by order of columns, ignoring casing
df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) # add categorical order here
df.sort_values(["class", "level", "year"], inplace = True) # sort according to the categories
df

pepper      132
ginger       87
saffron      59
cinnamon     39
Name: id, dtype: int64


Unnamed: 0,id,tier,level,entry,class,year,size
39,ginger,2,main,"ginger, n. and adj.",n.,925,1
126,pepper,3,main,"pepper, n.",n.,925,1
129,pepper,3,main,"pepper-quern, n.",n.,940,1
130,pepper,3,main,"peppercorn, n. and adj.",n.,945,1
247,saffron,4,main,"saffron, n. and adj.",n.,1200,1
...,...,...,...,...,...,...,...
310,pepper,3,sub,†to pepper a person's box (also pans),phrase,1608,1
199,pepper,3,sub,to snuff pepper,phrase,1624,1
221,pepper,3,sub,to pay a visit to Pepper Alley,phrase,1821,1
123,ginger,2,sub,to put ginger,phrase,1919,1


In [359]:
# # https://plotly.com/python/templates/
# for template in ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]:
#     fig = px.scatter(df, x="year", y="id", color="class",
#                     #  log_x=True, size_max=60,
#                      template=template)

#     fig.show()

## OED entries

### Document

In [360]:
# Variables
marker_size = 20
font_size = 20
edge_size = 2
line_width = 4
opacity = 0.5

In [361]:
fig = px.scatter(df, x="year", y="id", 
                 size="size", size_max=marker_size,
                 color="class", color_discrete_sequence=[p2,p7,p4,p5,p1],
                 symbol="level", symbol_sequence = ['circle', 'circle-open'], opacity = opacity, #'circle', 'circle-open'
                 hover_name='entry',
                 marginal_x="histogram",
                 template = "plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color="rgba(0,0,0,0)",
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6


fig.update_layout(width = 1000, height=600,
                  margin={"r":0,"t":0,"l":0,"b":0}, showlegend = True,
                  legend=dict(xanchor="left", yanchor="top", #title="class",
                              title=dict(side='top'), bgcolor=transparent,
                              # y=-0.15, x=0, 
                              font={"color": font_color, "size": font_size-1, "family": font_family}, 
                              orientation="h"))

fig.show()

# write
filename = "oed" 
fig.write_image(path_out_pdf+filename+".pdf", engine="kaleido")

### HTML

In [362]:
# Variables
marker_size = 20
edge_size = 2
line_width = 4
font_size = 20
opacity = 0.5

fig = px.scatter(df, x="year", y="id", 
                 size="size", size_max=marker_size,
                 color="class", color_discrete_sequence=[p2,p7,p4,p5,p1],
                 symbol="level", symbol_sequence = ['circle', 'circle-open'], opacity = opacity, #'circle', 'circle-open'
                 hover_name='entry',
                 marginal_x="histogram",
                 template = "plotly_white"
                 )

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color="rgba(0,0,0,0)",
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6


fig.update_layout(
    margin={"r":0,"t":20,"l":0,"b":0}, showlegend = True,
    title=dict(y=1, x=0, xanchor='left', yanchor='top', 
               text='A timeline of words and phrases derived from spice names, based on main- and sub-level entries in the OED',   
               font={"color": "black", "size": font_size, "family": font_family}),
    legend=dict(xanchor="left", yanchor="top", #title="class",
                title=dict(side='top'),
                              # y=-0.15, x=0, 
                              font={"color": font_color, "size": font_size-1, "family": font_family}, 
                              bgcolor='rgba(0,0,0,0)', 
                              orientation="h"))


fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

fig.show()

# write
filename = "oed" 
fig.write_html(path_out_html + filename + ".html")

## OED pepper

In [363]:
df=pd.read_csv(path_in + 'oed/oed.csv', header =[0], delimiter=',', encoding="utf-8")
df = df.loc[df['year'] != '?'] # include ones to include
df.to_csv(path_in + "oed/oed_working.csv", index = None, header=True)
df=pd.read_csv(path_in + 'oed/oed_working.csv', header =[0], delimiter=',', encoding="utf-8")

# Filter
# df = df.loc[df['level'] == 'main'] # include ones to include
df = df.loc[df['id'] == 'pepper'] # include ones to include
# Add dummy column for size
df['size'] = 1
# # # Sorting
df['class'] = pd.Categorical(df['class'], ["n.", "v.", "adj.", "adv.", "phrase"]) 
df.sort_values(["class", "level", "year"], inplace = True)
df

Unnamed: 0,id,tier,level,entry,class,year,size
126,pepper,3,main,"pepper, n.",n.,925,1
129,pepper,3,main,"pepper-quern, n.",n.,940,1
130,pepper,3,main,"peppercorn, n. and adj.",n.,945,1
132,pepper,3,main,"pepperer, n.1",n.,1309,1
134,pepper,3,main,"peppergrass, n.",n.,1500,1
...,...,...,...,...,...,...,...
238,pepper,3,sub,pepperily,adv.,1898,1
133,pepper,3,main,to have pepper in the nose,phrase,1400,1
310,pepper,3,sub,†to pepper a person's box (also pans),phrase,1608,1
199,pepper,3,sub,to snuff pepper,phrase,1624,1


### Document

In [364]:
# Variables
marker_size = 20
edge_size = 2
line_width = 4
font_size = 20
opacity = 0.5

fig = px.scatter(df, x="year", y="class", 
                 size="size", size_max=marker_size,
                 color="class", color_discrete_sequence=[p2,p7,p4,p5,p1],
                 symbol="level", symbol_sequence = ['circle', 'circle-open'], opacity = opacity,
                 hover_name='entry',
                 hover_data={'class':True, 'level':True, 'year':True, 'id':True, 'size':False},
                 marginal_x="histogram",
                 template='plotly_white')

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color="rgba(0,0,0,0)",
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0

fig.update_layout(width = 1000, height=500,
                  margin={"r":0,"t":0,"l":0,"b":0}, showlegend = True,
                  legend=dict(xanchor="left", yanchor="top", #title="class",
                              title=dict(side='top'),
                              y=0, x=0, 
                              font={"color": font_color, "size": font_size-2, "family": font_family}, 
                              bgcolor='rgba(0,0,0,0)', 
                              orientation="h"))

fig.show()

# write
filename = "oed_pepper" 
fig.write_image(path_out_pdf + filename+".pdf", engine="kaleido")

### HTML

In [365]:
# Variables
marker_size = 20
edge_size = 2
line_width = 4
font_size = 20
opacity = 0.5

fig = px.scatter(df, x="year", y="class", 
                 size="size", size_max=marker_size,
                 color="class", color_discrete_sequence=[p2,p7,p4,p5,p1],
                 symbol="level", symbol_sequence = ['circle', 'circle-open'], opacity = opacity,
                 hover_name='entry',
                 hover_data={'class':True, 'level':True, 'year':True, 'id':True, 'size':False},
                 marginal_x="histogram",
                 template='plotly_white')

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color="rgba(0,0,0,0)",
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0

fig.update_layout(
    # width = 1000, height=400,
                  margin={"r":0,"t":20,"l":0,"b":0}, showlegend = True,
                  title=dict(y=1, x=0, xanchor='left', yanchor='top', 
                  text='A timeline of words and phrases derived from pepper, based on main- and sub-level entries in the OED',   
                  font={"color": "black", "size": font_size, "family": font_family}),
                  legend=dict(xanchor="left", yanchor="top", #title="class",
                              title=dict(side='top'),
                              # y=-0.15, x=0, 
                              font={"color": font_color, "size": font_size-2, "family": font_family}, 
                              bgcolor='rgba(0,0,0,0)', 
                              orientation="h"))
                  # legend=dict(y=1, x=1, 
                  # xanchor="left", yanchor="top", 
                  # orientation="v"))


fig.add_annotation(
    xref = "paper", yref = "paper",
    x=0, y=0, #x=1
    xanchor="left", yanchor="top", align="center",
    text="© Parti Gábor, 2022",
    font={"color": "gainsboro", "size": 8, "family": font_family},
    showarrow=False)

# add images
fig.add_layout_image(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.1, sizey=0.1,
    # source="https://upload.wikimedia.org/wikipedia/en/thumb/5/52/PolyU.svg/759px-PolyU.svg.png",
    # sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", yanchor="top", 
)

# # Annotations manually
# m = df.loc[df["level"] == "main", ["id","level","entry","class","year"]]
# for i, j, k in zip(df['year'], df['id'], df['entry']):
#     fig.add_annotation(go.layout.Annotation(
#     x = i, y = j, xref="x", yref="y",
#     text = k, 
#     xanchor='auto',
#     yanchor='bottom',
#     align='center',  
#     showarrow=False,
#     textangle=-45,
#     font=dict(size=12, color=font_color, family=font_family)))

# Annotations with a button
layoutAnnotationList = []
for i, j, k in zip(df['year'], df['class'], df['entry']):
    x = i
    y = j
    text = k
    xanchor='center'
    yanchor='bottom'
    layoutAnnotationList.append( { 'x':x, 'y':y, 'xanchor':xanchor, 'yanchor':yanchor, 
                                  'text':text, 'textangle':-45 , 'showarrow':False, 
                                  "font":dict(size=12, color=font_color, family=font_family) } )
# layout = { 'annotations': layoutAnnotationList } 

layoutButtons = list([
                dict(type="buttons", active=0, showactive=True,
                     x=1, xanchor="right",
                     y=-0.1, yanchor="top",
                     buttons=list([   
                        dict(label = 'Annotations: On',
                             method = 'update',
                             args = [{'visible': [True, True, True, True]},{'annotations':layoutAnnotationList}]
                             ),
                        dict(label = 'Annotations: Off',
                             method = 'update',
                             args = [{'visible':[True, True, True, True]},{'annotations':[]}]
                             ),]))])

layout = {#'annotations': layoutAnnotationList, 
          'updatemenus':layoutButtons }

# https://stackoverflow.com/questions/54222205/hidding-annotations-in-plotly-python-using-a-button

fig.update_layout(layout)
        
fig.show()

filename = "oed_pepper" 
fig.write_html(path_out_html + filename+".html")

In [366]:
# Variables #-------------------------------------------------------------------
marker_size = 20
edge_size = 2
line_width = 4
font_size = 20
opacity = 0.5

fig = px.scatter(df, x="year", y="class", 
                 size="size", size_max=marker_size,
                 color="class", color_discrete_sequence=[p2,p7,p4,p5,p1],
                 symbol="level", symbol_sequence = ['circle', 'circle-open'], opacity = opacity,
                 hover_name='entry',
                 hover_data={'class':True, 'level':True, 'year':True, 'id':True, 'size':False},
                 marginal_x="histogram",
                 template='plotly_white')

fig.update_traces(
    marker_line_width=edge_size,
    marker_line_color="rgba(0,0,0,0)",
    # selector=dict(mode='markers')
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

fig.update_xaxes(visible=True)
fig.update_yaxes(visible=True, title=None, showticklabels=True)
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside") # tickangle = 0
# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8') #f0f0f0
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f8f8f8')
# fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
# fig.update_layout(plot_bgcolor="white")#f6f6f6

fig.update_layout(width = 1000, height=400,
                  margin={"r":0,"t":0,"l":0,"b":0}, showlegend = True,
                  legend=dict(xanchor="left", yanchor="top", #title="class",
                              # y=-0.15, x=0, 
                              font={"color": font_color, "size": font_size-1, "family": font_family}, 
                              bgcolor='rgba(0,0,0,0)', 
                              orientation="h"))

fig.show()

# writing and saving ---------------------------------------------------------
filename = "oed_pepper_b" 
fig.write_image(path_out_pdf + filename+".pdf", engine="kaleido")

# full size for html ---------------------------------------------------------
fig.update_layout(
    font_size=12,
    autosize=True,
    margin={"r":0,"t":60,"l":0,"b":30},
    title=dict(y=0.9, x=0, xanchor='left', yanchor='top', 
               text='A timeline of words and phrases derived from spice names, based on main- and sub-level entries in the OED',   
               font={"color": "black", "size": font_size, "family": font_family}),
    legend=dict(y=1, x=1, 
                xanchor="left", yanchor="top", 
                orientation="v")
    )

fig.add_annotation(xref="paper", yref="paper", x=1, y=0, xanchor="right", yanchor="top", align="right",
                   text="Parti Gábor, 2022<br>The Hong Kong Polytechnic University",
                   font={"color": "gray", "size": 10, "family": font_family},
                   showarrow=False)

# # Annotations manually
# m = df.loc[df["level"] == "main", ["id","level","entry","class","year"]]
# for i, j, k in zip(df['year'], df['id'], df['entry']):
#     fig.add_annotation(go.layout.Annotation(
#     x = i, y = j, xref="x", yref="y",
#     text = k, 
#     xanchor='auto',
#     yanchor='bottom',
#     align='center',  
#     showarrow=False,
#     textangle=-45,
#     font=dict(size=12, color=font_color, family=font_family)))

# Annotations with a button
layoutAnnotationList = []
for i, j, k in zip(df['year'], df['class'], df['entry']):
    x = i
    y = j
    text = k
    xanchor='center'
    yanchor='bottom'
    layoutAnnotationList.append( { 'x':x, 'y':y, 'xanchor':xanchor, 'yanchor':yanchor, 
                                  'text':text, 'textangle':-45 , 'showarrow':False, 
                                  "font":dict(size=12, color=font_color, family=font_family) } )
# layout = { 'annotations': layoutAnnotationList } 

layoutButtons = list([
                dict(type="buttons", active=0, showactive=True,
                     x=1, xanchor="left",
                     y=0, yanchor="bottom",
                     buttons=list([   
                        dict(label = 'Annotations: On',
                             method = 'update',
                             args = [{'visible': [True, True, True, True]},{'annotations':layoutAnnotationList}]
                             ),
                        dict(label = 'Annotations: Off',
                             method = 'update',
                             args = [{'visible':[True, True, True, True]},{'annotations':[]}]
                             ),]))])

layout = {#'annotations': layoutAnnotationList, 
          'updatemenus':layoutButtons }

# https://stackoverflow.com/questions/54222205/hidding-annotations-in-plotly-python-using-a-button

fig.update_layout(layout)
        
fig.show()

fig.write_html(path_out_html + filename + ".html")

Compact

In [367]:
# Variables
marker_size = 24
edge_size = 3
line_width = 5
font_size = 24
opacity = 0.5

fig = px.scatter(
    df, x="year", y="id", 
    color="class",
    symbol="level",
    symbol_sequence = ['circle', 'circle-open'],
    # color_discrete_sequence=px.colors.qualitative.Prism, #Viridis #Bold
    # color_discrete_sequence=px.colors.sequential.Viridis,
    color_discrete_sequence=[p2,p7,p4,p5,p1],
    opacity = 0.5,
    hover_name='entry',
    hover_data={'entry':True, 'level':True, 'class':True, 'year':True, 'id':False, 'entry':False},
    labels={"id": "", "year":"year", "class":"class", "level":"level"},
    # title="A timeline of words and phrases derived from spice names,<br>based on main- and sub-level entries in the OED",
    marginal_x="histogram", #'rug', 'box', 'violin', or 'histogram'
    # marginal_y="rug"
    )

# fig = px.line(df, x='year', y='value', color='class')

fig.update_traces(
    # marker_color="lightskyblue",
    marker_line_color="rgba(0,0,0,0)",
    marker_line_width=edge_size,
    marker_size=marker_size,
    selector=dict(mode='markers')
    )

# # a good way to id subsets
# sub = df.loc[df["level"] == "sub", ["id","level","entry","class","year"]]
# print(sub)

fig.update_xaxes(visible=True, showticklabels=False)
fig.update_yaxes(visible=False, showticklabels=False)
fig.update_layout(paper_bgcolor="white") # transparent background rgb(0,0,0,0)
fig.update_layout(plot_bgcolor="#f8f8f8")#f6f6f6
fig.update_layout(xaxis = go.layout.XAxis(title='year', showticklabels=True))
fig.update_layout(yaxis = go.layout.YAxis(title=None, visible = True, showticklabels=False, showgrid=True))
fig.update_xaxes(showgrid=True, gridwidth=2, gridcolor='white') #f0f0f0
fig.update_yaxes(showgrid=True, gridwidth=2, gridcolor='white')
fig.update_xaxes(ticklabelposition="inside")
fig.update_yaxes(ticklabelposition="inside", tickangle = 270, showticklabels=True)


fig.update_layout(
    width = 1200, height=400,
    margin={"r":0,"t":0,"l":0,"b":0},
    showlegend = True,
    legend=dict(#xref="plot", yref="plot"
                title="class",
                y=-0.15, x=0, xanchor="left", yanchor="top", 
                bgcolor='rgba(0,0,0,0)',   
                font={"color": "black", "size": 24, "family": font_family}, 
                orientation="h",
                # traceorder = 'normal', #or reversed
                # bgcolor="white",
                # bordercolor="gainsboro", 
                # borderwidth=1
                )
    )

fig.update_layout(
    font_family=font_family,
    font_color=font_color,
    font_size=font_size)

# fig.add_annotation(y=0.1, x=950, xanchor="center", yanchor="middle", text="Old English",
#                    font={"color": "darkgray", "size": 10, "family": font_family},
#                    showarrow=False)

fig.show()

# # write
# filename = "pepper_oed" 
# fig.write_image(filename+".png", engine="kaleido")
# fig.write_image(filename+".pdf", engine="kaleido")
# # download
# files.download(filename+".pdf")
# files.download(filename+".png")

# # full size for html-----------------------
# fig.update_layout(
#     width = 1200, height=400,
#     margin={"r":0,"t":60,"l":0,"b":0},
#     title=dict(y=0.9, x=0, xanchor='left', yanchor='top', 
#                text='A timeline of words and phrases derived from spice names, based on main- and sub-level entries in the OED',   
#                font={"color": "black", "size": font_size, "family": font_family}),
#     legend=dict(y=1, x=1, 
#                 xanchor="left", yanchor="top", 
#                 orientation="v")
#     )

# fig.add_annotation(xref="paper", yref="paper", y=0, x=1, xanchor="right", yanchor="top", align="right",
#                    text="Parti Gábor, 2022<br>The Hong Kong Polytechnic University",
#                    font={"color": "gray", "size": 10, "family": font_family},
#                    showarrow=False)

# fig.show()

# fig.write_html(filename+".html")


## Timeline with binning

In [368]:
df['century'] = century(df['year'])
df['value'] = 1
d = df['century'].value_counts().to_dict()

for index, row in df.iterrows():
  for key, value in d.items():
    if row['century'] == key:
      row['value'] = value
      df.loc[index, 'value'] = value

df

Unnamed: 0,id,tier,level,entry,class,year,size,century,value
126,pepper,3,main,"pepper, n.",n.,925,1,10,7
129,pepper,3,main,"pepper-quern, n.",n.,940,1,10,7
130,pepper,3,main,"peppercorn, n. and adj.",n.,945,1,10,7
132,pepper,3,main,"pepperer, n.1",n.,1309,1,14,2
134,pepper,3,main,"peppergrass, n.",n.,1500,1,16,16
...,...,...,...,...,...,...,...,...,...
238,pepper,3,sub,pepperily,adv.,1898,1,19,33
133,pepper,3,main,to have pepper in the nose,phrase,1400,1,15,1
310,pepper,3,sub,†to pepper a person's box (also pans),phrase,1608,1,17,28
199,pepper,3,sub,to snuff pepper,phrase,1624,1,17,28


Plotly Go

In [369]:
fig = px.line(df, x='year', y='value', color='class')

# fig = make_subplots(rows=2, cols=1)

# Go ===========================================================================

# fig = go.Figure()

# fig.add_trace(go.Scatter(x=df['year'], y=df['tier'],
#                     mode='markers',
#                     name='oed',
#                     text=df['entry'],
#                     textposition = 'top right',
#                     textfont={"color": font_color, "size": font_size, "family": font_family},
#                     # marker_color = 'darkorange',
#                     marker=dict(symbol='circle', color=PolyU, size=marker_size, opacity=opacity, line=dict(color='white', width=edge_size)), 
#                     opacity=1,
#                     )) 


# x = df['year']
# y = df['value']

# fig.add_trace(go.Scatter(x=df['year'], y=df['value']/10,
#                          mode="lines",
#                          name="spline",
#                          line_shape='spline',
#                          line_color = 'black', line_width=line_width, line_dash='solid', 
#                          opacity=opacity,
#                          hoverinfo='text+name',
#                          text=["tweak line smoothness"],
#                          ))

# # Go =========================================================================


# fig.update_traces(
#     mode='markers+lines',
#     marker=dict(#symbol='0', #'diamond',
#                 size=marker_size,
#                 # line=dict(width=4, 
#                            # color='white'
#                           # )
#                   ),
#                   )

# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
# fig.update_layout(paper_bgcolor="rgb(0,0,0,0)") # transparent
# fig.update_layout(paper_bgcolor="white",
#                   plot_bgcolor="white")

# fig.update_layout(
#     width = 1200, height=400,
#     # margin={"r":0,"t":0,"l":0,"b":0},
#     showlegend = True,
#     legend=dict(
#                 # y=0.9, x=0.12, xanchor="left", yanchor="top", 
#                 bgcolor='rgba(0,0,0,0)',   
#                 font={"color": "black", "size": 12, "family": font_family}, 
#                 # traceorder = 'normal', #or reversed
#                 orientation="h",
#                 # bgcolor="white",
#                 # bordercolor="gainsboro", 
#                 # borderwidth=1
#                 ),
#     # title=dict(
#                #y=0.99, x=0.12, xanchor='left', yanchor='top', 
#               #  text='Title',   
#               #  font={"color": "black", "size": 20, "family": font_family}),
#     # hovermode="closest", #default
#     # hoverlabel=dict(bgcolor="white", font_size=12, font_family=font_family),
#     )

# fig.update_layout(
#     font_family="Raleway",
#     font_color="black",
#     font_size=font_size
#     # title_font_family="Times New Roman",
#     # title_font_color="red",
#     # legend_title_font_color="green"
# )

# fig.add_annotation(y=0, x=1, xanchor="right", yanchor="bottom", text="Parti Gábor, 2022",
#                    font={"color": "lightgray", "size": 10, "family": font_family},
#                    showarrow=False)

# fig.add_annotation(y=2.1, x=950, xanchor="center", yanchor="middle", text="Old English",
#                    font={"color": "darkgray", "size": 10, "family": font_family},
#                    showarrow=False)

fig.show()

# # write
# fig.write_image("oed.png", engine="kaleido")
# fig.write_image("oed.pdf", engine="kaleido")
# # download
# files.download("oed.pdf")
# files.download("oed.png")

# # full size for html-----------------------
# # fig.update_layout(width = 1200, height=600,
# #                   legend=dict(y=0.9, x=0, xanchor="left", yanchor="top")
# #                   )

# fig.write_html("oed.html")
# files.download("oed.html")

## Move and Copy

In [370]:
# Move files to the thesis folder
copy_dir(path_out_tex, destination_tex, "*.tex")
copy_dir(path_out_pdf, destination_pdf, "*.pdf")

# Move files to the website folder
copy_dir(path_out_json, destination_json, "*.json")
copy_dir(path_out_html, destination_html, "*.html")

In [9]:
# Convert pdfs to pngs (for presentations) and move them accordingly

path = "output/pdf/"
files = list_files(path)

for file in files:
    convert_pdf_to_png(file)

move_dir(path, path_out_png, "*.png")

# Website

In [61]:
from datetime import date
from markdownTable import markdownTable

## Preprocess data

In [62]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

# Select ones to include
df_spices = df_spices.loc[(df_spices['include'] == "in")]

# List the list of ids
list_of_spices = df_spices['id'].tolist()
list_of_spices.sort()
print("List of spices:", list_of_spices, "\n", len(list_of_spices), "spices in total.")

List of spices: ['Sichuan pepper', 'allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'long pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star anise', 'turmeric', 'vanilla'] 
 24 spices in total.


In [63]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"names.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"names.csv", index = None, header=True)

# Load in dataset of names
df_names=pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

df_names = df_names.loc[df_names['include'] == 'yes'] # include ones to include
# df_names = df_names.loc[df_names['conventionalized'] == 'yes'] # exclude those not in a dictionary

df_names.fillna('', inplace=True)
print(df_names.shape)

(360, 69)


In [64]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)

# Load in dataset
df_etymologies=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df_etymologies, df_etymologies[df_etymologies.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  df_list.append(df)

# Automatically extract IDs from the dataset
ids = []
for df in df_list:
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  ids.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(ids)
print(length, "words in total")

#Create a defaultdict of spice-word etymologies
etymologies=defaultdict(list)
for i in range(length):
  etymologies[ids[i]]=df_list[i]

# Testing
etymologies['saffron']

['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']
84 words in total


Unnamed: 0,#,include,lang,id,plot color,lang color,boxskip,treeskip,plotskip,doubt,...,ref page,reference,ref link,ref page2,reference2,ref link2,done,Unnamed: 46,Unnamed: 47,Unnamed: 48
0,1.0,yes,en,saffron,crimson,e1,,,,,...,,OE,https://www.etymonline.com/word/saffron,,,,yes,,,
1,2.0,yes,en,saffron,,,,,,,...,,MED,https://quod.lib.umich.edu/m/middle-english-di...,,tlfi,https://www.cnrtl.fr/etymologie/safran,yes,,,
2,3.0,yes,en,saffron,,,,,,,...,,MED,https://quod.lib.umich.edu/m/middle-english-di...,,,,yes,,,
3,4.0,yes,en,saffron,,,,,,,...,,Hans-Wehr,"http://ejtaal.net/aa/#hw4=451,ll=1274,ls=5,la=...",,,,yes,,,


## Create webpage

Create webpage using markdown files

In [65]:
def spicepage(key):
    '''
    This cell generates markdown tables from the spices data and writes them out to a file.
    '''

    # The Spice

    # Dataframe of current item 
    df_local = df_spices.copy()

    # Dataframe of current item 
    df_local = df_local.loc[df_spices['id'] == key]

    # Reset index
    df_local.reset_index(drop=True,inplace=True)

    # Extract categories and tags
    category = df_local['category'][0]
    if ";" in df_local['category'][0]:
        category_list = category.split("; ")
    else:
        category_list = "['" + category + "']"

    tag = df_local['tag'][0]
    if ";" in df_local['tag'][0]:
        tag_list = tag.split("; ")
    else:
        tag_list = "['" + tag + "']"

    # Page preamble
    preamble = "---\ntitle: " + key.title() + "\nauthor: Gábor Parti\ndate: " + str(date.today()) + "\nweight: 1\n# bookCollapseSection: true\n# bookComments: true\n# bookFlatSection: true\n# bookHidden: true\n# bookSearchExclude: true\n# bookToc: false\nplotly: true\ncategories: " + str(category_list)  + "\ntags: " + str(tag_list) + "\n---\n\n"

    # Title
    title = "# " + key.title() + "\n\n"

    # Overview 
    overview_head = "## Overview\n\n"

    # Merge species name
    df_local['species name'] = "*" + df_local['species'] + "* " + df_local['species by']
    # Set link
    df_local['botanical database'] = "[POWO](" + df_local['POWO'] + ")"

    # Prepare overview tables
    df_overview = df_local[['species name', 'family', 'part used', 'macroarea', 'region of origin', 'cultivation', 'color', 'botanical database']]
    # Transpose table
    df_overview = df_overview.T
    # Reset index
    df_overview.reset_index(inplace=True)
    # Rename columns
    df_overview.columns = ['id', key]
    # Prepare data to create markdown table
    data = df_overview.to_dict(orient='records')
    # Create markdown table
    overview_mdt = markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()

    overview = overview_head + overview_mdt + "\n\n"

    # Intro
    category = re.sub('; ', ' and ', category)
    tag = re.sub('; ', ' and ', tag)

    part = df_local['part used'][0]
    part = re.sub('; ', ' and ', part)

    intro = key.title() + " (" + df_local['species name'][0] + ")" + " is a " + tag + " " + category + " from the *" + df_local['family'][0] + "* family,[^powo] originating in the region(s) of " + df_local['region of origin'][0] + ".[^van_wyk_culinary_2014] " + "It is used for its " + part + ", primarily for " + df_local['major uses'][0] + ". Its aroma is described as " + df_local['taste/smell'][0] + ", with a heat index of " + df_local['heat'][0] + ".[^ucla_medicinal_2002]"

    references = "[^powo]: POWO. (2022). Plants of the World Online (Botanical Database). Facilitated by the Royal Botanic Gardens, Kew. http://www.plantsoftheworldonline.org/\n[^van_wyk_culinary_2014]: van Wyk, B.-E. (2014). Culinary Herbs and Spices of the World. University of Chicago Press, joint publication with the Royal Botanic Gardens, Kew. https://doi.org/10.7208/chicago/9780226091839.001.0001\n[^ucla_medicinal_2002]: Medicinal Spices Exhibit. (2002). UCLA Biomedical Library: History & Special Collections. https://unitproj.library.ucla.edu/biomed/spice/index.cfm?spicefilename=taste.txt&itemsuppress=yes&displayswitch=0"

    intro = intro + "\n\n"
    references = references + "\n\n"

    df_quick_names = df_local[['English', 'Arabic', 'Chinese', 'Hungarian']]
    # df_names = df_names.T
    # df_names.reset_index(inplace=True)
    # df_names.columns = ['language', 'name(s)']
    data = df_quick_names.to_dict(orient='records')
    quick_names_mdt = markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()
    quick_names = quick_names_mdt + "\n\n"

    # The Nomenclature

    # Dataframe of current item 
    df_names_local = df_names.copy()

    # Dataframe of current item 
    df_names_local = df_names_local.loc[df_names_local['id'] == key]

    # Reset index
    df_names_local.reset_index(drop=True,inplace=True)

    # Names
    names_head = "***\n\n## Names\n\n"

    # Heads
    names_head_en = "### English\n\n"
    names_head_ar = "### Arabic\n\n"
    names_head_zh = "### Chinese\n\n"

    # Language
    language = "English"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['term', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_en = markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()

    language = "Arabic"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['script', 'term', 'literal', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_ar = markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()

    language = "Chinese"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['script', 'term', 'literal', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_zh = markdownTable(data).setParams(row_sep = 'markdown', quote = False).getMarkdown()

    names = names_head + names_head_en + names_mdt_en + "\n\n" + names_head_ar + names_mdt_ar + "\n\n" + names_head_zh + names_mdt_zh + "\n\n"

    # Plotly file (for etymology)
    key_ = re.sub(" ", "_", key)
    embed_json = r'{{< load-plotly >}}' + '\n' + r'{{< plotly json="/plotly/diffusion_name_' + key_ + r'.json" height="300px" >}}' + '\n\n'

    # Assemble page
    page = preamble + title + quick_names + intro + overview + names + embed_json + references

    # Def filename
    
    # Write md file
    with open(path_out_md + key_ + '.md', 'w', encoding='utf-8') as f:
        f.write(page)
    return

In [73]:
# The Etymologies

df_local = etymologies[key]

key = "anise"

cwd = os.path.realpath('./')
ltx = cwd + "\\thesis\\envs\\etymbox_" + key + ".tex"
md = cwd + "\\website\\drafts\\" + key + ".md"

command = "pandoc -s " + ltx + " -o " + md
print("Running command:\n"+command)

subprocess.call(command)

Running command:
pandoc -s C:\Users\parti\Projects\spice-core\thesis\envs\etymbox_anise.tex -o C:\Users\parti\Projects\spice-core\website\drafts\anise.md


0

In [67]:
spicepage('allspice')

In [68]:
# Choose a spice (using id)
for key in list_of_spices:
    spicepage(key)

In [69]:
# This package creates MD files using python, but inserts a header I cannot rewrite to add the Hugo header
# mdFile = MdUtils(file_name=key, title=key.title())
# mdFile.new_header(level=1, title='Overview')  # style is set 'atx' format by default.
# mdFile.new_paragraph(header)
# mdFile.new_paragraph(md)
# mdFile.create_md_file()

## Move and Copy

In [70]:
# Move files to the website folder
copy_dir(path_out_md, destination_md, "*.md")

## Convert LaTeX files to Markdown files

In [72]:
key = "allspice"

cwd = os.path.realpath('./')
ltx = cwd + "\\thesis\\contents\\spices\\" + key + ".tex"
md = cwd + "\\website\\drafts\\" + key + ".md"

command = "pandoc -s " + ltx + " -o " + md
print("Running command:\n"+command)

subprocess.call(command)

Running command:
pandoc -s C:\Users\parti\Projects\spice-core\thesis\contents\spices\allspice.tex -o C:\Users\parti\Projects\spice-core\website\drafts\allspice.md


0

In [47]:
# # Combine serveral mardown files (test)
# cwd = os.path.realpath('./')

# file_list = glob.glob(cwd + "\\website\\content\\drafts\\*.md")
# with open(cwd + '\\website\\content\\drafts\\combined.md', 'w', encoding="utf8") as result:
#     for file_ in file_list:
#         for line in open(file_, 'r', encoding="utf8"):
#             result.write(line)

# End

In [48]:
print("All done!")

All done!


# I. Symposium

In [23]:
# Read and store content of an excel file 
df_spices = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
df_spices.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df_spices=pd.read_csv(path_in+'spices.csv', header =[0], delimiter=',', encoding="utf-8")

# Select ones to include
# df_spices = df_spices.loc[(df_spices['include'] == "in")]

# If for symposium, use this
df_spices = df_spices.loc[df_spices['sym'] == 'yes'] # include ones to include

# List the list of ids
list_of_spices = df_spices['id'].tolist()
list_of_spices.sort()
print("List of spices:", list_of_spices, "\n", len(list_of_spices), "spices in total.")

df = df_spices

List of spices: ['Borneo camphor', 'Java round cardamom', 'Siam cardamom', 'Sumatra benzoin', 'agarwood', 'aloe', 'areca nut', 'asafoetida', 'basil', 'camelthorn', 'caoguo', 'cassia', 'catechu', 'chile', 'clove', 'coriander', 'creat', 'date', "dragon's blood", 'fennel', 'fenugreek', 'field mustard', 'fig', 'frankincense', 'ginger', 'henna', 'long pepper', 'mace', 'mustard', 'myrobalan', 'myrrh', 'nutmeg', 'opium poppy', 'pepper', 'quinine', 'rosemary', 'rosewood', 'safflower', 'saffron', 'santalwood', 'sappanwood', 'senna', 'sesame', 'star anise', 'storax', 'tamarind', 'tobacco', 'turmeric', 'velvetleaf', 'zedoary'] 
 50 spices in total.


In [24]:
# Variables for symposium (light) ########

sym = '#C55A11'
transparent = 'rgba(255,255,255,0)'
half_transparent = 'rgba(255,255,255,0.5)'
quarter_transparent = 'rgba(255,255,255,0.25)'
tenth_transparent = 'rgba(255,255,255,0.1)'

marker_symbol= 'circle'
marker_size = 12
edge_size = 1
edge_color = 'white'
opacity = 0.75
line_width = 4
font_size = 12
font_family = "Raleway"
font_color = "black"
water = '#ebedef'
grid_color = '#d6dbdf'
land = '#aeb6bf'
lines = '#85929e'
copyright_color = '#5d6d7e'
# water = 'white'
# grid_color = '#EDEDED'
# land = 'gainsboro'
# lines = 'gainsboro'
# copyright_color = 'lightgray'
background_color = transparent
legend_background_color = tenth_transparent

In [25]:
# # Orthographic globe layout
ortho_layout = dict(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, #50 is large or 110 small
        scope='world',
        projection_type = 'orthographic',
        projection_scale = 1,
        projection_rotation = {'lat': 15, 'lon': 30, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines, 
        showframe=True, framewidth = 1, framecolor = lines, 
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land, 
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=False, subunitwidth = 1, subunitcolor = lines, 
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=1, y=0.9, xanchor="right", yanchor="top", bgcolor=half_transparent,  
                font=dict(color=font_color, size=font_size, family=font_family), 
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0, y=0.99, xanchor='left', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white", 
                    font_size=font_size, 
                    font_family=font_family),
    )

# Adding images
logo = dict(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.15, sizey=0.15,
    x=0, y=0, 
    xanchor="left", 
    yanchor="bottom", 
)

# Copyright
cr = go.layout.Template()
cr.layout.annotations = [
    dict(
        name="copyright",
        text="© Gábor Parti, 2022",
        font=dict(color=copyright_color, size=8, family=font_family),
        opacity=0.9,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0,
        # xanchor="right", 
        # yanchor="bottom", 
        # align="center",
        showarrow=False,)]

In [30]:
# add a value to make small points bigger
df['size'] = df['no. of native regions'].astype(int)+4
size_max = 30

########################################################################################

#### Document
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':False, 'Chinese':True, 'lon':False, 'lat':False, 'size':False, 'id':False},
    # labels={"group": "category"}
    )

fig.update_traces(
    textposition = 'middle right',
    mode = "markers",
    textfont = dict(size=font_size, color=font_color, family=font_family),
    marker = dict(
        # symbol = marker_symbol,
        # size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size)
        )
    )

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
fig.update_layout(showlegend = False)
fig.update_layout(document_size)

# fig.show()

# write
filename = "distribution_with_native_regions"
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")
fig.write_image(path_out_png + filename + ".png", scale=3)

################################################################################

#### HTML
fig = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=prism_extended,
    size_max = size_max,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':False, 'Chinese':True, 'lon':False, 'lat':False, 'size':False, 'id':False, 'year recorded in TCM':True},
    # labels={"group": "category"}
    )

fig.update_traces(mode = "markers+text",
                  textposition='middle right',
                  textfont={"size": font_size, "color": font_color, "family": font_family},
                  marker=dict(symbol=marker_symbol, 
                              # size=marker_size, 
                              line=dict(color=edge_color, width=edge_size)))

fig.update_layout(ortho_layout)
fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
fig.update_layout(title_text = "Approximate origins of exotic Chinese <i>materia medica</i>")
fig.update_layout(template = cr)
fig.add_layout_image(logo)

fig.add_annotation(x=0, y=0.9,
                   xanchor="left", yanchor="top", align="left",
                   text="Use the mouse to <b>rotate</b> the globe, <b>zoom</b> in and out,<br>and <b>hover</b> over data points for more information on an item.<br><br><i>Color</i> represents plant families,<br>double-click on a family in the legend to isolate them.<br><i>Size</i> represents the number of native regions for a plant.",
                   font={"color": font_color, "size": font_size-2, "family": font_family},
                   showarrow=False)

fig.add_annotation(x=0, y=0.5,
                   xanchor="left", yanchor="top", align="left",
                   text="<br><br><br><b>Sources:</b><br>· POWO. (2022). <i>Plants of the World Online</i> (Botanical Database).<br>Royal Botanic Gardens, Kew.<br>http://www.plantsoftheworldonline.org/<br>· PolyU. (2022). <i>Chinese Herbal Medicine Database.</i><br>The Hong Kong Polytechnic University, School of Nursing.<br>https://herbaltcm.sn.polyu.edu.hk/herbal/<br>· van Wyk, B.-E. (2014). Culinary Herbs and Spices of the World.<br>University of Chicago Press.<br>· Hu, S.-Y. (1990). History of the Introduction of Exotic Elements<br>Into Traditional Chinese Medicine.<br><i>J. Arnold Arbor.</i> 71(4):487–526.",
                   font={"color": font_color, "size": font_size-4, "family": font_family},
                   showarrow=False)

    
fig.show()

# write
filename = "distribution_with_native_regions"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json")