# Encyclopaedia Aromatica pipeline

## Libraries

In [61]:
# Import dependencies
import os
import pandas as pd
import numpy as np
import re

from collections import defaultdict
from datetime import date, datetime, timedelta
from py_markdown_table.markdown_table import markdown_table

# pip install babelnet

from plotly.io import write_image, write_json
import plotly.express as px
import plotly.graph_objs as go
import geopandas as gpd

from palette import *
from scripts.unsplash import *
from scripts.pexels import *
from scripts.functions import *

pd.options.mode.copy_on_write = True # to avoid SettingWithCopyWarning, https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas


## Paths

In [62]:
path_in = "data/"
path_out_tex = "output/tex/"
path_out_md = "output/md/"
path_out_html = "output/html/"
path_out_json = "output/json/"
path_out_png = "output/png/"
path_out_pdf = "output/pdf/"
path_downloaded_photos = "output/photos/"

website_md = "../content/items/"
website_json = "../static/plotly/"
website_photos = "../static/images/photos/"

In [63]:
# import requests
 
# def fetch_wikidata(params):
#     url = 'https://www.wikidata.org/w/api.php'
#     try:
#         return requests.get(url, params=params)
#     except:
#         return 'There was and error'

# # What text to search for
# query = 'Elettaria cardamomum'
 
# # Which parameters to use
# params = {
#         'action': 'wbsearchentities',
#         'format': 'json',
#         'search': query,
#         'language': 'en'
#     }
 
# # Fetch API
# data = fetch_wikidata(params)
 
# #show response as JSON
# data = data.json()
# data

## Functions

In [64]:
# ################################################################################
# # Convert PDFs
# from pdf2image import convert_from_path

# def convert_pdf_to_png(file):
#     name = str(file)
#     name = re.sub(".*(?=/)", "", name)
#     name = re.sub("\..*", "", name)
#     pages = convert_from_path(file, 0)
#     for page in pages:
#         page.save(path + name + ".png", 'PNG')


In [65]:
# Start timer
start_time = datetime.now()

# Data

## Spices

In [66]:
# Read and store content of an excel file 
df = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
df.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df = pd.read_csv(path_in+'spices.csv', header=[0], delimiter=',', encoding="utf-8")

# Select ones to include
df = df.loc[(df['include'] == "in")]

# Info
list_of_items = df['item'].tolist()
list_of_items.sort() # Sorts alphabetically, but capital letters come first

# Add keys based on item, make lowercase and replace spaces with underscores
df['key'] = df['item'].str.lower().str.replace(" ", "_")
list_of_keys = df['key'].tolist()
print(len(list_of_keys), "spices in total.")
print(list_of_keys)

# Add links
df['url'] = "https://partigabor.github.io/aromatica/items/" + df['item'].str.replace(" ", "_")

# Add counts of distribution
df['no. of native regions'] = df['native regions'].str.count(',') + 1
df['no. of introduced regions'] = df['introduced regions'].str.count(',') + 1

37 spices in total.
['allspice', 'angelica', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'lemongrass', 'mace', 'nigella', 'nutmeg', 'pepper', 'saffron', 'star_anise', 'turmeric', 'vanilla', 'acuyo', 'agarwood', 'ajwain', 'ambergris', 'annatto', 'ashanti_pepper', 'black_cardamom', 'cubeb', 'javanese_long_pepper', 'long_pepper', 'sichuan_pepper', 'timiz']


### Amend data (online)

In [67]:
if internet_on():
    
    # Location coordinates
    generate_coordinates(df) # Generate geo-coordinates from location column
    # generate_centroid_coordinates(df) # Generate geo-coordinates by finding the centroid of the native regions
    
    # # # Get a definition from WordNet
    # wn_definition(df)
    
    # # Translate using WordNet
    # wn_langs = ['fra', 'ita']
    # for lan in wn_langs:
    #     wn_translate(df, lan)
    
    # # Translate using DeepL
    # dl_langs = ['fr', 'it']
    # for lang in dl_langs:
    #     dl_translate(df, lang)
        
    # # Create a new column called french and fill with NaN
    # df['french'] = np.nan
    # df['italian'] = np.nan
    
    # # Change "" to NaN in df
    # df = df.replace(r'^\s*$', np.nan, regex=True)
    
    # # Fill in 'french' column with values from 'wn_translation_fra' or 'dl_translation_fr' column where it is NaN
    # df['french'] = df['french'].fillna(df['wn_translation_fra'])
    # df['french'] = df['french'].fillna(df['dl_translation_fr'])
    
    # df['italian'] = df['italian'].fillna(df['wn_translation_ita'])
    # df['italian'] = df['italian'].fillna(df['dl_translation_it'])

# # Check
# df_translations = df.filter(regex='fr')
# df_translations

In [68]:
# Checking lat an lon where item is angelica
# print("Check coordinates:", df.loc[df['item'] == 'ambergris', ['lat', 'lon']])

# List which items have NaN lat and lon values
print("List of items with no coordinates:", df.loc[(df['lat'].isnull()) & (df['lon'].isnull()), ['item', 'lat', 'lon']])

List of items with no coordinates: Empty DataFrame
Columns: [item, lat, lon]
Index: []


In [69]:
# save for checking as xlsx
df.to_excel("checking.xlsx", index = None, header=True)

## Botanical data

In [70]:
# Read in wcvp files
wcvp = pd.read_csv(path_in + 'resources/wcvp/wcvp_names.csv', header=[0], delimiter='|', encoding="utf-8", dtype=str)

# # Check line where taxon name is "Piper capense"
# wcvp.loc[wcvp['taxon_name'] == "Piper capense"]

# Merge wcvp with spice data 
df = pd.merge(df, wcvp, on=['taxon_name', 'taxon_authors'], how='left')

# POWO links if not already there
df['powo'] = df['powo'].fillna("https://powo.science.kew.org/taxon/" + df['powo_id'].astype(str))


In [71]:
df.head()

Unnamed: 0,include,values,level,done,id,group,item,description,related,see also,...,lifeform_description,climate_description,accepted_plant_name_id,basionym_plant_name_id,replaced_synonym_author,homotypic_synonym,parent_plant_name_id,powo_id,hybrid_formula,reviewed
0,in,76.0,Basic,yes,,,allspice,The dried unripe berries of a small Caribbean ...,Bay rum tree,wild allspice,...,tree,seasonally dry tropical,156136,132459.0,,,156124,196799-2,,Y
1,in,49.0,Basic,,,,angelica,The scented stems and roots of a medicinal pla...,,,...,perennial,temperate,2638995,,,,2638971,837560-1,,N
2,in,66.0,Basic,,,,anise,The seed-like fruits of a Mediterranean herb,fennel,star anise,...,annual,temperate,2402426,,,,2402470,846658-1,,N
3,in,66.0,Basic,,,,asafoetida,The dried resin of Ferula foetida and F. assa-...,,,...,perennial,temperate,2808419,2479171.0,,,2808299,842277-1,,N
4,in,55.0,Basic,,,,caraway,The seed-like fruits of an Eurasian herb,,,...,perennial,temperate,2701499,,,,2701447,839677-1,,N


In [72]:
# Read in wcvp distribution files
wcvp_dist = pd.read_csv(path_in+'resources/wcvp/wcvp_distribution.csv', header=[0], delimiter='|', encoding="utf-8", dtype=str)
wcvp_dist

# Check line where taxon name is "Piper capense"
wcvp_dist.loc[wcvp_dist['plant_name_id'] == "2557720"]

Unnamed: 0,plant_locality_id,plant_name_id,continent_code_l1,continent,region_code_l2,region,area_code_l3,area,introduced,extinct,location_doubtful
1052884,2283824,2557720,2,AFRICA,22,West Tropical Africa,GHA,Ghana,0,0,0
1052885,2283825,2557720,2,AFRICA,22,West Tropical Africa,GUI,Guinea,0,0,0
1052886,2432043,2557720,2,AFRICA,22,West Tropical Africa,LBR,Liberia,0,0,0
1052887,2316225,2557720,2,AFRICA,22,West Tropical Africa,NGA,Nigeria,0,0,0
1052888,2283831,2557720,2,AFRICA,22,West Tropical Africa,SIE,Sierra Leone,0,0,0
1052889,2283820,2557720,2,AFRICA,23,West-Central Tropical Africa,BUR,Burundi,0,0,0
1052890,2283821,2557720,2,AFRICA,23,West-Central Tropical Africa,CMN,Cameroon,0,0,0
1052891,2820568,2557720,2,AFRICA,23,West-Central Tropical Africa,EQG,Equatorial Guinea,0,0,0
1052892,2283826,2557720,2,AFRICA,23,West-Central Tropical Africa,GAB,Gabon,0,0,0
1052893,2332684,2557720,2,AFRICA,23,West-Central Tropical Africa,GGI,Gulf of Guinea Is.,0,0,0


In [73]:
# Assign
df_items = df.copy()
df.head()

Unnamed: 0,include,values,level,done,id,group,item,description,related,see also,...,lifeform_description,climate_description,accepted_plant_name_id,basionym_plant_name_id,replaced_synonym_author,homotypic_synonym,parent_plant_name_id,powo_id,hybrid_formula,reviewed
0,in,76.0,Basic,yes,,,allspice,The dried unripe berries of a small Caribbean ...,Bay rum tree,wild allspice,...,tree,seasonally dry tropical,156136,132459.0,,,156124,196799-2,,Y
1,in,49.0,Basic,,,,angelica,The scented stems and roots of a medicinal pla...,,,...,perennial,temperate,2638995,,,,2638971,837560-1,,N
2,in,66.0,Basic,,,,anise,The seed-like fruits of a Mediterranean herb,fennel,star anise,...,annual,temperate,2402426,,,,2402470,846658-1,,N
3,in,66.0,Basic,,,,asafoetida,The dried resin of Ferula foetida and F. assa-...,,,...,perennial,temperate,2808419,2479171.0,,,2808299,842277-1,,N
4,in,55.0,Basic,,,,caraway,The seed-like fruits of an Eurasian herb,,,...,perennial,temperate,2701499,,,,2701447,839677-1,,N


In [74]:
# Subsetting categories (spices, herbs, incense)
# df_spices = df.loc[(df['id'] == "S")]
# print(df_spices.shape[0])
# df_herbs = df.loc[(df['id'] == "H")]
# print(df_herbs.shape[0])
# df_incense = df.loc[(df['id'] == "I")]
# print(df_incense.shape[0])

## Images

In [75]:
# # Download images from Unsplash and Pexels to be curated later, query with underscores
# item = "allspice"
# dashed_key = re.sub(" ", "-", item)
# unsplash_downloader(dashed_key, path_downloaded_photos)
# pexels_downloader(item, path_downloaded_photos)

# # Move images to the right folder, regardless of extension
# move_dir(path_downloaded_photos, website_photos, "*.png")
# move_dir(path_downloaded_photos, website_photos, "*.jpg")
# move_dir(path_downloaded_photos, website_photos, "*.jpeg")

In [76]:
# Create thumbnails for images
for photo in list_files("../static/images/photos"):
    create_thumbnail(photo)

In [77]:
# Images df
# # Define the relative folder path
# folder_path = '../static/images/photos'

# # List of keys
# list_of_keys.sort()

# # Initialize item_info dictionary
# item_info = {}  # Dictionary to store item information

# # Initialize item counts, extensions, file names, and sources to empty lists for each item
# for item in list_of_keys:
#     item_info[item] = {'count': 0, 'extensions': [], 'file_names': [], 'sources': []}

# # Iterate through the files in the folder
# for filename in os.listdir(folder_path):
#     if os.path.isfile(os.path.join(folder_path, filename)):
#         # Extract the item name, extension, file name, and source from the file name
#         item_name = filename.split('-')[0]
#         extension = filename.split('.')[-1]
#         file_name = filename
#         source = filename.split('-')[2] if len(filename.split('-')) > 2 else ""
#         # Remove the file extension from the source
#         source = source.split('.')[0]
        
#         # Check if the item name is in the list_of_items
#         if item_name in list_of_keys:
#             item_info[item_name]['count'] += 1
#             item_info[item_name]['extensions'].append(extension)
#             item_info[item_name]['file_names'].append(file_name)
#             item_info[item_name]['sources'].append(source)

# # Create a Pandas DataFrame from the item_info dictionary
# data = {'key': [], 'count': [], 'source': [], 'extension': []}
# for item, info in item_info.items():
#     data['key'].append(item)
#     data['count'].append(info['count'])
#     data['source'].append(', '.join(info['sources']))
#     data['extension'].append(', '.join(info['extensions']))

# df = pd.DataFrame(data)

# # Fill in missing items with 0 image counts and empty sources, extensions
# for item in list_of_keys:
#     if item not in df['key'].values:
#         df = df.append({'key': item, 'count': 0, 'source': '', 'extension': ''}, ignore_index=True)

# # Reorder the DataFrame with columns 'item', 'count', 'source', 'extension'
# df = df[['key', 'count', 'source', 'extension']]

# # Sort the DataFrame by 'item'
# df = df.sort_values(by='key')

# # Reset the index of the DataFrame
# df = df.reset_index(drop=True)

# # Rename the columns
# df.rename(columns={'count': 'img_count', 'source': 'img_source', 'extension': 'img_extension'}, inplace=True)

# # Save
# df_images = df.copy()

# # Display the final DataFrame
# print(df)

# # Merge the two dataframes
# df_items = pd.merge(df_items, df_images, on='key', how='left')

## Names

In [78]:
# # Read and store content of an excel file 
# df = pd.read_excel(path_in+"names.xlsx")

# # Write the dataframe object into csv file
# df.to_csv (path_in+"names.csv", index = None, header=True)

# # Load in dataset of names
# df = pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

# # Select ones to include
# df = df.loc[df['include'] == 'yes'] # include ones to include

In [79]:
# # Change NaN to empty string
# df.fillna('', inplace=True)

# # Info
# print(df.shape[0], "names in total.")

# # Assign
# df_names = df.copy()

## Etymologies

In [80]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)

# Load in dataset
df_etymologies=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df_etymologies, df_etymologies[df_etymologies.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  df_list.append(df)

# Automatically extract IDs from the dataset
list_of_etymologies = []
for df in df_list:
  # print(df['item'].iloc[0])
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  item = str(df['item'].iloc[0])
  list_of_etymologies.append(item)

# Print the number if IDs and what are they
length = len(df_list)
print(length, "words in total")
print(list_of_etymologies)

#Create a defaultdict of spice-word etymologies
etymologies=defaultdict(list)
for i in range(length):
  etymologies[list_of_etymologies[i]]=df_list[i]

# Testing
# print(etymologies['saffron'])

84 words in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']



'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



### Etymology box for LaTeX

In [81]:
# key = "tester"

# ################################################################################

# # The following code will create an etymology box environment for the key, to be used in LaTeX
# print("Started the generation of '" + key + "' as etymbox...")

# df_local = etymologies[key]
# # df_local.fillna('', inplace=True)

# # # Skipping those marked
# df_local = df_local[df_local['boxskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# # # Replace empty cells with NaNs
# # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

# # Create content and sources stage by stage
# content = ""
# source = ""
# sources = ""
# source_list = []
# nl = "\n"

# for index, row in df_local.iterrows():
#   stage = "< "
#   if pd.notna(row['complex']): # complex relationships
#     row['complex'] = re.sub("and from", "+", row['complex'])
#     stage += row['complex'] + " "
#   if pd.notna(row['language']): # language
#     stage += "\\textbf{" + row['language'] + "} "
#   if pd.notna(row['script']): # script
#     script = "{" + row['script'] + "} "
#     if row['language'] == 'Chinese':
#       script = "\\tc{" + row['script'] + "} "
#     stage += script
#   if pd.notna(row['term']): # term
#     stage += "\\textit{" + row['term'] + "} "
#   if pd.notna(row['IPA']): # IPA
#     stage += row['IPA'] + " "
#   if pd.notna(row['meaning']): # meaning
#     stage += "`" + row['meaning'] + "' "
#   if pd.notna(row['literal']): # literal meaning
#     stage += "[" + row['literal'] + "] "
#   stage = re.sub(' $', '', stage)
#   stage += ", "
#   if pd.notna(row['explanation']): # explanation
#     stage += row['explanation'] + " "
#   if pd.notna(row['remark']): # remark
#     stage += "(" + row['remark'] + ") "
#   stage = re.sub(',? ?$', '', stage)

#   if pd.notna(row['date']): # dates
#     stage += ", "
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#     stage += date
#     stage = re.sub(',? ?$', '', stage)
#   if pd.notna(row['cognates']): # cognates
#     stage += "; cf. cognates " + row['cognates'] + " "
#   if pd.notna(row['derivates']): # cognates
#     if pd.notna(row['cognates']):
#       stage = re.sub(' $', '', stage)
#       stage += "; " + row['derivates'] + " "
#     else:
#       stage = re.sub(' $', '', stage)
#       stage += "; cf. " + row['derivates'] + " "
#   stage = re.sub(',? ?$', '', stage)
#   # stage = re.sub('cf\..*?(cf\.)', '', stage)

# # Final touches
#   if row['doubt'] == 'yes':
#     stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#   if row['complex'] == '+':
#     stage = re.sub('<', '', stage)
#   if row['complex'] == 'or from':
#     stage = re.sub('<', '', stage)
#   content += stage + nl

# # Sources
#   source=""
#   if pd.notna(row['source zotero']):
#     source = row['source zotero']
#     print("1",source)
#     if '{' in source:
#       source = "s" + row['source zotero'].lower()
#       print(source)
#     else:
#       source = "{" + row['source zotero'].lower() + "}"
#       print(source)
#     if pd.notna(row['source page']):
#       source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       print("4",source)
#       if row['source page'].isalpha() == True:
#         source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         print("5",source)
#     source = "\\textcite" + source
#   print(source)
#   source_list.append(source)

# # clear duplicates from sources:
# print("SL1: ", source_list)
# # source_set = sorted(set(source_list), key=source_list.index)
# source_set = set(source_list)
# print("SS2: ", source_set)
# source_list2 = list(source_set)
# print("S3: ", source_list2)
# sources_unduplicated = '; '.join(source_list2)
# print("S4: ", sources_unduplicated)
# # test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i) # this method catches the first duplicate entries, and appends them to the list

# # The next stage is to print the duplicate entries, and the unique entries
# print("List of duplicates", duplist)
# print("Unique Item List", newlist) 
# if len(duplist) > 0:
#   # print("UNDUPL")
#   sources = sources_unduplicated
# else:
#   # print("ORI")
#   sources =  '; '.join(source_list)
# # print("S5: ", sources)

# sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\\footnote{" + sources + "}\n"

# content = re.sub("\n$", "", content)
# content = re.sub(r"^< ", "", content) # delete the first <
# content = re.sub(r"\n,", ",", content)
# content = re.sub(r" nan ", " ", content)
# content = re.sub("(<\.\n?)+$", "", content)

# content += sources

# env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
# env_end = r"\end{etymology}"

# box = env_begin + content + env_end
# box = re.sub(r"\u200e", "", box) #removes right to left mark

# # Save the spicebox as a standalone tex file

# filename = re.sub(" ", "_", key)
# filename = filename.lower()
# f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')
# f.write(box)
# f.close()
# print("Etymology-box '" + str(key) + "' as a tex file was created.")
# box


In [82]:
# def etymbox(key):
  
#   # The following code will create a etymology box environment for the key, to be used in LaTeX
#   print("Started the generation of '" + key + "' as etymbox...")

#   df_local = etymologies[key]
#   # df_local.fillna('', inplace=True)

#   # # Skipping those marked
#   df_local = df_local[df_local['boxskip'] != 'yes']
#   df_local.reset_index(inplace=True, drop=True)

#   # # Replace empty cells with NaNs
#   # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
#   # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

#   # Create content and sources stage by stage
#   content = ""
#   source = ""
#   sources = ""
#   source_list = []
#   nl = "\n"

#   for index, row in df_local.iterrows():
#     stage = "< "
#     if pd.notna(row['complex']): # complex relationships
#       row['complex'] = re.sub("and from", "+", row['complex'])
#       stage += row['complex'] + " "
#     if pd.notna(row['language']): # language
#       stage += "\\textbf{" + row['language'] + "} "
#     if pd.notna(row['script']): # script
#       script = "{" + row['script'] + "} "
#       if row['language'] == 'Chinese':
#         script = "\\tc{" + row['script'] + "} "
#       stage += script
#     if pd.notna(row['term']): # term
#       stage += "\\textit{" + row['term'] + "} "
#     if pd.notna(row['IPA']): # IPA
#       stage += row['IPA'] + " "
#     if pd.notna(row['meaning']): # meaning
#       stage += "`" + row['meaning'] + "' "
#     if pd.notna(row['literal']): # literal meaning
#       stage += "[" + row['literal'] + "] "
#     stage = re.sub(' $', '', stage)
#     stage += ", "
#     if pd.notna(row['explanation']): # explanation
#       stage += row['explanation'] + " "
#     if pd.notna(row['remark']): # remark
#       stage += "(" + row['remark'] + ") "
#     stage = re.sub(',? ?$', '', stage)

#     if pd.notna(row['date']): # dates
#       stage += ", "
#       row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#       row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#       if re.match('^-\d\d?$', row['date']): # if is a century BC
#         row['date'] = re.sub("-", "", row['date'])
#         date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#       elif re.match('^\d\d?$', row['date']): # if is a century AD
#         date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#       else:
#         date = row['date'] + " " # if it's a year
#       stage += date
#       stage = re.sub(',? ?$', '', stage)
#     if pd.notna(row['cognates']): # cognates
#       stage += "; cf. cognates " + row['cognates'] + " "
#     if pd.notna(row['derivates']): # cognates
#       if pd.notna(row['cognates']):
#         stage = re.sub(' $', '', stage)
#         stage += "; " + row['derivates'] + " "
#       else:
#         stage = re.sub(' $', '', stage)
#         stage += "; cf. " + row['derivates'] + " "
#     stage = re.sub(',? ?$', '', stage)
#     # stage = re.sub('cf\..*?(cf\.)', '', stage)

#   # Final touches
#     if row['doubt'] == 'yes':
#       stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#     if row['complex'] == '+':
#       stage = re.sub('<', '', stage)
#     if row['complex'] == 'or from':
#       stage = re.sub('<', '', stage)
#     content += stage + nl

#   # Sources
#     source=""
#     if pd.notna(row['source zotero']):
#       source = row['source zotero']
#       # print(source)
#       if '{' in source:
#         source = "s" + row['source zotero'].lower()
#         # print(source)
#       else:
#         source = "{" + row['source zotero'].lower() + "}"
#         # print(source)
#       if pd.notna(row['source page']):
#         source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         # print(source)
#         if row['source page'].isalpha() == True:
#           source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#           # print(source)
#       source = "\\textcite" + source
#     # print(source)
#     source_list.append(source)

#   # clear duplicates from sources:
#   # print("SL1: ", source_list)
#   # source_set = sorted(set(source_list), key=source_list.index)
#   source_set = set(source_list)
#   # print("SS2: ", source_set)
#   source_list2 = list(source_set)
#   # print("S3: ", source_list2)
#   sources_unduplicated = '; '.join(source_list2)
#   # print("S4: ", sources_unduplicated)
#   # test for duplicates
#   newlist = [] # empty list to hold unique elements from the list
#   duplist = [] # empty list to hold the duplicate elements from the list
#   for i in source_list:
#       if i not in newlist:
#           newlist.append(i)
#       else:
#           duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
#   # The next stage is to print the duplicate entries, and the unique entries
#   # print("List of duplicates", duplist)
#   # print("Unique Item List", newlist) 
#   if len(duplist) > 0:
#     # print("UNDUPL")
#     sources = sources_unduplicated
#   else:
#     # print("ORI")
#     sources =  '; '.join(source_list)
#   # print("S5: ", sources)

#   sources =  '; '.join(source_list)

#   # Cleaning
#   sources = re.sub("; $", "", sources)
#   sources = re.sub("^; ", "", sources)
#   sources = re.sub("(; )+", "; ", sources)
#   sources = "\\footnote{" + sources + "}\n"

#   content = re.sub("\n$", "", content)
#   content = re.sub(r"^< ", "", content) # delete the first <
#   content = re.sub(r"\n,", ",", content)
#   content = re.sub(r" nan ", " ", content)
#   content = re.sub("(<\.\n?)+$", "", content)

#   content += sources

#   env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
#   env_end = r"\end{etymology}"

#   box = env_begin + content + env_end
#   box = re.sub(r"\u200e", "", box) #removes right to left mark

#   # Save the spicebox as a standalone tex file
#   filename = re.sub(" ", "_", key)
#   filename = filename.lower()
#   f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')  
#   f.write(box)
#   f.close()
#   print("Etymology-box '" + str(key) + "' as a tex file was created.")

#   return box

# etymbox("tester")

### Etymology box for Markdown

In [83]:
def etymbox(e):

  # The following code will create a etymology box environment for the key, to be used in Markdown
  print("Started the generation of '" + e + "' as etymbox...")

  # Select word
  df_local = etymologies[e]
  # df_local.fillna('', inplace=True)

  # Skipping those marked
  df_local = df_local[df_local['skip'] != 'yes']
  df_local.reset_index(inplace=True, drop=True)

  # # Replace empty cells with NaNs
  # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
  # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

  # Initialize
  content = ""
  sources = ""
  source_pages = ""

  # Iterate through a word's etymology dataframe (stage by stage = row by row)
  for index, row in df_local.iterrows():
    # Initialize stage, add an '<' if its not the first stage
    if index == 0:
      stage = ""
    else:
      stage = "< "
    # If there are complex relations, add them (e.g., partly, and, or)
    if pd.notna(row['complex']):
      stage += row['complex'] + " "
    # Add language (in bold)
    if pd.notna(row['language']):
      stage += "**" + row['language'] + "** "
    # Add the term with native script (if exists)
    if pd.notna(row['script']):
      stage += row['script'] + " "
    # Add the term with transcription (in italics)
    if pd.notna(row['term']):
      stage += "*" + row['term'] + "* "
    # Add /IPA/
    if pd.notna(row['IPA']):
      stage += "/" + row['IPA'] + "/ "
    # Add 'meaning', gloss
    if pd.notna(row['meaning']):
      stage += "'" + row['meaning'] + "' "
    # Add the [literal meaning] if there is one
    if pd.notna(row['literal']):
      stage += "[" + row['literal'] + "] "
    # Clear ending
    stage = re.sub(' $', '', stage)
    # Add explanation
    if pd.notna(row['explanation']):
      stage += ", " + row['explanation'] + " "
    # Add (remark)
    if pd.notna(row['remark']):
      stage += " (" + row['remark'] + ") "
    # Clear ending
    stage = re.sub(" +", " ", stage)
    stage = re.sub(",? ?$", "", stage)

    # Add date, if there is a date
    if pd.notna(row['date']):
      # If it's a year
      date = row['date']

    # Add century if there is no date
    if pd.notna(row['century']) and pd.isna(row['date']):
      
      # If it's a century BC
      if re.match('^-\d\d?\??$', row['century']):
        # Remove dash
        row['century'] = re.sub("-", "", row['century'])
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date BC
          # date = roman(int(row['century'])) + " BC?" # ROMAN NUMERALS
          date = row['century'] + " c. BC?" # ARAB NUMERALS
        else:
          # date = roman(int(row['century'])) + " BC" # ROMAN NUMERALS
          date = row['century'] + " c. BC" # ARAB NUMERALS

      # If it is a century AD
      elif re.match('^\d\d?\??$', row['century']):
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date AD
          # date = "AD " + roman(int(row['century'])) + "?" # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS
        else:
          # date = "AD " + roman(int(row['century'])) # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS

      # Add date to stage
      stage += ", " + date
      
    # Clear ending
    stage = re.sub(',? ?$', '', stage)

    # If both cognates and derivates
    if pd.notna(row['cognates']) and pd.notna(row['derivates']):
      stage += "; cf. cognates " + row['cognates'] + "; derivates " + row['derivates'] + " "
    # If cognates only
    if pd.notna(row['cognates']) and pd.isna(row['derivates']):
      stage += "; cf. cognates " + row['cognates'] + " "
    # If derivates only
    if pd.notna(row['derivates']) and pd.isna(row['cognates']):
      stage += "; cf. derivates " + row['derivates'] + " "
    
    # Clear ending
    stage = re.sub(';?,? ?$', '', stage)

    # If stage is doubtful, use '<?'
    if row['doubt'] == 'yes':
      stage = re.sub('<', '<\?', stage)
    # If stage is "complex", remove '<'
    if pd.notna(row['complex']):
      stage = re.sub('<', '', stage)

    # # Sources (at each stage) A
    # source = ""
    # # If there is source (zotero), add
    # if pd.notna(row['source']):
    #   source = '\"' + row['source'] + '\"'
    #   # If there is page, add
    #   if pd.notna(row['source page']):
    #     source = '\"' + str(row['source'].lower()) + '\" \"' + str(row['source page']) + '\"'
    # # Add the Hugo shortcode syntax 
    # source = r' {{< cite ' + source + r' >}}'
    # # Create content
    # content += stage + source + "\n"

    # Sources (once in the end, removing duplicates) B
    # If there is source (zotero), add
    if pd.notna(row['source']):
      sources += row['source'] + ";"
      # If there is page, add
      if pd.notna(row['source page']):
        source_pages += str(row['source page']) + ";"
      else:
        source_pages += ";"
    # Create content
    content += stage + "\n"

  # If using version B of sources
  # Clean ending
  sources = re.sub(';?$', '', sources)
  source_pages = re.sub(';?$', '', source_pages)
  # Add the Hugo-cite shortcode syntax 
  source = r' {{< cite "' + sources + r'" "' + source_pages + r'" >}}'
  # Add source to content
  content += source

  # Cleaning
  box = content
  box = re.sub(r"\u200e", "", box) # Removes right-to-left mark

  # # Save the spicebox as a standalone markdown file (if ever needed)
  # filename = re.sub(" ", "_", key)
  # filename = filename.lower()
  # f = open(path_out_md + "{}.md".format("etymbox_" + filename), "w", encoding='utf-8')  
  # f.write(box)
  # f.close()
  # print("Etymology-box '" + str(key) + "' as a md file was created.")

  return box

etymbox("tester")

Started the generation of 'tester' as etymbox...


'**Language A** тест *test* /tɛst/ \'meaning1\' [literal1], explanation1 (remark1); cf. cognates cognates1; derivates derivates1\n< **Language B** тестер *tester* /ˈtɛstə/ \'meaning2\' [literal2], explanation2, AD 12 c.; cf. cognates cognates2\n< **Language C** тестинг *testing* /ˈtɛstɪŋ/ \'meaning3\' [literal3] (remark3), 9 c. BC; cf. derivates derivates3\n< **Language D** тесте *teste* /ˈaltə/ \'meaning4\' [literal4], explanation4 (remark4); cf. cognates cognates4; derivates derivates4\n {{< cite "oed;wehr_dictionary_1976;wehr_dictionary_1976;liddell_greekenglish_1940;wehr_dictionary_1976;lewis_latin_1879;liddell_greekenglish_1940" "1;2-3;4;5;6;;7" >}}'

In [84]:
# Creating a dictionary of etymologies
dictionary_of_etymologies = {}

# Loop
for e in list_of_etymologies:
    box = (etymbox(e))
    # box = r'{{% notice style="primary" title="Pirates" icon="skull-crossbones" %}}' + "\n" + text + "\n" + r"{{% /notice %}}" + "\n\n"
    dictionary_update = {e: box}
    dictionary_of_etymologies.update(dictionary_update)
print('Done.')


Started the generation of 'tester' as etymbox...
Started the generation of 'allspice' as etymbox...
Started the generation of 'fulful ifranji' as etymbox...
Started the generation of 'duoxiangguo' as etymbox...
Started the generation of 'pimento' as etymbox...
Started the generation of 'anise' as etymbox...
Started the generation of 'anisun' as etymbox...
Started the generation of 'huiqin' as etymbox...
Started the generation of 'asafoetida' as etymbox...
Started the generation of 'hing' as etymbox...
Started the generation of 'hiltit' as etymbox...
Started the generation of 'anjudan' as etymbox...
Started the generation of 'awei' as etymbox...
Started the generation of 'xingqu' as etymbox...
Started the generation of 'caraway' as etymbox...
Started the generation of 'karawiya' as etymbox...
Started the generation of 'geluzi' as etymbox...
Started the generation of 'cardamom' as etymbox...
Started the generation of 'amomum' as etymbox...
Started the generation of 'hal' as etymbox...
St

In [85]:
# Check
dictionary_of_etymologies['allspice']

'**English** *allspice*, from *all* + *spice*; after the flavor profile that resembles the combined aroma of cloves, nutmeg, cinnamon, and black pepper\n {{< cite "oed" "allspice" >}}'

# Website Generation

## Create a webpage

In [86]:
# Todo: Make every element optional with if statements

def webpage(key):
    
    '''
    This cell generates web pages from the datasets and writes them out to markdown files.
    '''

    # Get dataframe of current item
    print("Working on", key)
    df_local = df_items.loc[df_items['key'] == key]
    
    # Item
    item = df_local['item'].iloc[0]
    
    # Reset index
    df_local.reset_index(drop=True, inplace=True)
    
    # Definition (if exists)
    if 'wn_definition' in df_local:
        if pd.notna(df_local['wn_definition'][0]):
            definition = df_local['wn_definition'][0]
            definition = definition[0].upper() + definition[1:]
            definition = "WordNet definition: " + definition + "." if definition[-1] != "." else definition
    else:
        definition = ""

    # Description
    if pd.notna(df_local['description'][0]):
        description = df_local['description'][0]
        description = description[0].upper() + description[1:]
        aka = ", also known as " + str(df_local['en alt'].iloc[0]) if pd.notna(df_local['en alt'].iloc[0]) else ""
        if aka != "":
            description += aka
        related = ", related to " + str(df_local['related'].iloc[0]) if pd.notna(df_local['related'].iloc[0]) else ""
        if related != "":
            description += related
        see_also = ". See also " + str(df_local['see also'].iloc[0]) if pd.notna(df_local['see also'].iloc[0]) else ""
        if see_also != "":
            description += see_also            
            
        description += ". "
        preamble_description = description
        page_description = ">" + description
        
    else:
        description = ""
        preamble_description = ""
        page_description = ""

    page_description += "\n\n"
    
    # Extract categories and tags and groups (which will be treated as tags)
    if pd.notna(df_local['category'][0]):
        category = df_local['category'][0]
        category_list = category.split("; ") if ";" in category else [f'{category}']
    else:
        category = ""
        category_list = []
    
    if pd.notna(df_local['tag'][0]):
        tag = df_local['tag'][0]
        tag_list = tag.split("; ") if ";" in tag else [f'{tag}']
    else:
        tag = ""
        tag_list = []
        
    if pd.notna(df_local['group'][0]):
        group = df_local['group'][0]
        group_list = group.split(";") if ";" in group else [f'{group}']
    else:
        group_list = []
        
    tag_list = tag_list + group_list

    # Assemble preamble
    preamble = f'+++\ntitle = "{item.title()}"\nauthor = "Gabor Parti"\ndate = "{str(date.today())}"\ndescription = "{preamble_description}"\nweight = 10\n# draft = "true"\n# hidden = "true"\nplotly = true\ncategories = {str(category_list)}\ntags = {str(tag_list)}\nbibFile = "static/bibliography/parti.json"\n+++\n\n'



    ###########################
    ######## The Spice ########
    ###########################
    
    taxon_name = str(df_local['taxon_name'][0])
    
    if pd.notna(df_local['taxon_authors'][0]):
        taxon_authors = str(df_local['taxon_authors'][0])
    else:
        taxon_authors = ""
        
    taxon = f"*{taxon_name}* {taxon_authors}"
        
    family = str(df_local['family'][0])
    
    category = re.sub('; ', ' and ', category)
    if pd.notna(df_local['tag'][0]):
        tag = re.sub('; ', ' and ', tag)
    else:
        tag = ""
    
    part = str(df_local['part used'][0])
    
    lifeform_description = str(df_local['lifeform_description'][0])
    climate_description = str(df_local['climate_description'][0])
    
    # Intro
    intro = item.upper() + f" (*{taxon_name}* {taxon_authors}) is a(n) {lifeform_description} from the *{family}* family, growing in the {climate_description} biome, originating in the region(s) of {df_local['powo range'][0]}. It is used as a(n) {tag} {category}, and cultivated for its {part}.\n\n"
    
    # if pd.notna(df_local['heat'][0]):
    #     intro = intro + "It is used for its " + re.sub('; ', ' and ', part) + ", primarily for " + str(df_local['major uses'][0]) + ". Its aroma is described as " + str(df_local['taste/smell'][0]) + ", with a heat index of " + str(df_local['heat'][0]) + ".[^ucla_medicinal_2002]" + "\n\n" + "[^ucla_medicinal_2002]: Medicinal Spices Exhibit. (2002). UCLA Biomedical Library: History & Special Collections. https://unitproj.library.ucla.edu/biomed/spice/index.cfm?spicefilename=taste.txt&itemsuppress=yes&displayswitch=0\n\n"
    
    # Add values from df_local['petruzzello][0] if it's value is 'yes'.
    citations = ""
    if pd.notna(df_local['powo'][0]):
       citations += "powo"
    # if df_local['petruzzello'][0] == 'yes':
    #     citations += ";petruzzello_2023_list" 
    # if df_local['vanwyk'][0] == 'yes':
    #     citations += ";vanwyk_2014_culinary"
    # if df_local['dalby'][0] == 'yes':
    #     citations += ";dalby_2000_dangerous"
    # if df_local['hill'][0] == 'yes':
    #     citations += ";hill_2004_contemporary"
    # if df_local['anderson'][0] == 'yes':
    #     citations += ";anderson_2023_history"
        
    if citations != "":
        sources = r'{{< cite "' + citations + '" >}}\n\n'
    else:
        sources = ""
    intro += sources
    
    
    
    # Overview 
    overview_head = "## Overview\n\n"
    
    # Add taxon
    df_local['taxon'] = taxon

    # Prepare overview tables
    df_overview = df_local[['taxon', 'family', 'region of origin', 'macroarea', 'part used']]
    
    # Add cultivation if available
    if pd.notna(df_local['cultivation'][0]):
        #add cultivation column to df_overview
        df_overview['cultivation'] = df_local['cultivation']
        
    # Set database link(s)
    if pd.notna(df_local['powo'][0]):
        botanical_database = "[POWO](" + df_local['powo'] + ")"
    else:
        botanical_database = ""
        
    if pd.notna(df_local['gbif'][0]):
        botanical_database += ", [GBIF](" + df_local['gbif'][0] + ")"
        
    df_local['botanical database'] = botanical_database
    
    if pd.notna(df_local['botanical database'][0]):
        #add cultivation column to df_overview
        df_overview['botanical database'] = df_local['botanical database']
        
    # Transpose table
    df_overview = df_overview.T
    # Reset index
    df_overview.reset_index(inplace=True)
    # Rename columns
    df_overview.columns = ['item', item]
    # Prepare data to create markdown table
    data = df_overview.to_dict(orient='records')
    # Create markdown table
    overview_mdt = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # Table
    overview = overview_head + overview_mdt + "\n\n"
    
    # Photo (todo, make list of img ext and source now they are string)
    # if df_local['img_count'][0] > 0:
    #     img_src = re.sub(',.*', '', df_local['img_source'][0])
    #     img_src = re.sub('_', ' ', img_src)
        
    #     photo = "![" + df_local['item'][0] + "](/images/photos/" + df_local['item'][0] + "-1-" + re.sub(',.*', '', df_local['img_source'][0]) + "." + re.sub(',.*', '', df_local['img_extension'][0]) + "?width=14rem&classes=shadow" + ' "Photo: ' + img_src + '")\n\n'
    # else:
    #     photo = ""
    
    
    
    # Illustration # &classes=shadow
    if pd.notna(df_local['ill source'].iloc[0]):
        illustration_alt = "Illustration of " + df_local['taxon_name'][0] + " from " + df_local['ill source'][0]
        illustration = "![" + df_local['taxon_name'][0] + '](/images/illustrations/' + key + '.png?width=40rem "' + illustration_alt + '")' + '\n'
        illustration_source = df_local['ill source'][0] + r"{{< cite -" + str(df_local['ill key'][0]) + r" >}} " + str(df_local['ill page'][0]) + r"."
        illustration = illustration + "\n>Illustration of " + df_local['taxon_name'][0] + " from " + illustration_source
        if pd.notna(df_local['ill link'].iloc[0]):
            illustration = illustration + " [{{% icon image %}}](" + df_local['ill link'].iloc[0] + ")"
    else:
        illustration = ""
    illustration = illustration + "\n\n"
    
    # Top display gallery
    if len(list_files(f"../static/images/photos/{key}")) > 1:
        display_gallery = '{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '" hover-effect="slideup" caption-effect="fade" caption-position="none" />}}' + "\n\n"
    else:
        display_gallery = ""
        
    # Gallery on the bottom of the page
    if os.path.isdir(f"../static/images/photos/{key}/gallery"):
        gallery =  '## Gallery\n\n{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '/gallery" hover-effect="slideup" caption-effect="fade" />}}' + "\n\n"
    else:
        gallery = ""



    # Quick names (predefined)
    if pd.notna(df_local['English'][0]):
        en = "**English:** " + df_local['English'][0] + " · "
    else:
        en = ""
    if pd.notna(df_local['Hungarian'][0]):
        hu = "**Hungarian:** " + df_local['Hungarian'][0] + " · "
    else:
        hu = ""
    if pd.notna(df_local['Arabic'][0]):
        ar = '**Arabic:** <span class="arabic-text" dir="rtl">' + df_local['Arabic'][0] + '</span>' + " · "
    else:
        ar = ""
    if pd.notna(df_local['Chinese'][0]):
        zh = '**Chinese:** <span class="traditional-chinese-text">' + df_local['Chinese'][0] + '</span>' + " · "
    else:
        zh = ""
    
    quick_names = en + hu + ar + zh
    
    # Quick names (additional)
    if 'french' in df_local:
        if pd.notna(df_local['french'][0]):
            fr = "**French:** " + df_local['french'][0] + " · "
            quick_names += fr
        
    if 'italian' in df_local:
        if pd.notna(df_local['italian'][0]):
            it = "**Italian:** " + df_local['italian'][0] + " · "
            quick_names += it

    print(quick_names)
    
    # Remove last dot from quick names
    quick_names = re.sub(" · $", "", quick_names)
    # Center it
    quick_names = '<center>\n\n' + quick_names + '\n\n</center>\n\n'



    # Distribution
    if pd.notna(df_local['native regions'][0]):
        distribution = "## Distribution\n\n"
        distribution = distribution + r'{{< load-plotly >}}' + '\n\n' + r'{{< plotly json="/aromatica/plotly/distributions/' + key + r'.json" weight="600" height="300" >}}' + '\n\n'
        distribution = distribution + f">Native and introduced habitats of {df_local['taxon_name'][0]}[^powo]\n\n[^powo]: {df_local['powo'][0]}\n\n"

        # Check if 'native regions' is not empty and not 'NA' before adding to the string
        regions = ""
        if pd.notna(df_local['native regions'][0]) and df_local['native regions'][0] != 'NA':
            regions += "**Native regions:** &ensp; &ensp; &ensp; " + df_local['native regions'][0] + "\n\n"
        # Check if 'introduced regions' is not empty and not 'NA' before adding to the string
        if pd.notna(df_local['introduced regions'][0]) and df_local['introduced regions'][0] != 'NA':
            regions += "**Introduced regions:** " + df_local['introduced regions'][0] + "\n\n"
        distribution += '<p style="text-align:left;">\n\n' + regions + '</p>\n\n'
    else :
        distribution = ""


    
    ##############################
    ######## Nomenclature ########
    ##############################

    # Dataframe of current item 
    # df_names_local = df_names.loc[df_names['item'] == item]

    # Reset index
    # df_names_local.reset_index(drop=True, inplace=True)

    # Names
    # names_head = "***\n\n## Nomenclature\n\n"
    # Heads
    # names_head_en = "### English\n\n"
    # names_head_ar = "### Arabic\n\n"
    # names_head_zh = "### Chinese\n\n"
    
    # # Language by language
    # language = "English"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['term', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_en = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    # language = "Arabic"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['script', 'term', 'literal', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_ar = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    # language = "Chinese"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['script', 'term', 'literal', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_zh = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # names = names_head + names_head_en + names_mdt_en + "\n\n" + names_head_ar + names_mdt_ar + "\n\n" + names_head_zh + names_mdt_zh + "\n\n"
    names = ""

    ###########################
    ####### Etymologies #######
    ###########################

    # wordlist = df_local['etymologies'][0].split("; ")
    # etymologies = ""
    # for word in wordlist:
    #     etymologies += dictionary_of_etymologies[word]
    # etymologies = "## Etymologies\n\n" + etymologies
    etymologies = ""
    

    # Manuscripts (if any)
    # if a file exists in the folder "content/items/manuscripts/" with the name of the item, then add it to the page
    if os.path.isfile(f"../content/items/manuscripts/{key}_ms.md"):
        manuscript = "***\n\n{{% include \"content/items/manuscripts/" + key + "_ms.md\" %}}\n\n"
    else:
        manuscript = ""

    ######## Assemble page ########
    page = preamble + page_description + display_gallery + quick_names + intro + overview + illustration + distribution + names + etymologies + gallery
    
    # Bibliography - if page contains "{{< cite", then add bibliography
    if "{{< cite" in page:
        bibliography = "# Bibliography\n\n{{< bibliography cited >}}\n\n"
    else:
        bibliography = ""
            
    # Write page to file
    with open(path_out_md + key + '_gen.md', 'w', encoding='utf-8') as f:
            f.write(page)
            
    # Write bibliography to file
    with open(path_out_md + key + '_bib.md', 'w', encoding='utf-8') as f:
            f.write(bibliography)
    return

# Loop through all spices
for key in list_of_keys:
    webpage(key)
print("Done.")

Working on allspice
**English:** allspice · **Hungarian:** szegfűbors · **Arabic:** <span class="arabic-text" dir="rtl">فلفل إفرنجي</span> · **Chinese:** <span class="traditional-chinese-text">多香果</span> · 
Working on angelica
**English:** angelica · **Hungarian:** orvosi angyalgyökér · **Arabic:** <span class="arabic-text" dir="rtl">عشبة الملاك المخزنية</span> · **Chinese:** <span class="traditional-chinese-text">歐白芷</span> · 
Working on anise
**English:** anise · **Hungarian:** ánizs · **Arabic:** <span class="arabic-text" dir="rtl">أنيسون</span> · **Chinese:** <span class="traditional-chinese-text">茴芹</span> · 
Working on asafoetida
**English:** asafoetida · **Hungarian:** ördöggyökér · **Arabic:** <span class="arabic-text" dir="rtl">حلتیت</span> · **Chinese:** <span class="traditional-chinese-text">阿魏</span> · 
Working on caraway
**English:** caraway · **Hungarian:** fűszerkömény  · **Arabic:** <span class="arabic-text" dir="rtl">كراويا</span> · **Chinese:** <span class="traditiona

## Merge autogenerated files with manuscripts and bibliographies

In [87]:
def assemble_page(key):
    '''
    This function merges generated web pages with hand written parts (manuscripts), and page bibliographies into a final markdown file to show on a web page. Gen and bib are generated above, manuscripts are not. E.g., allspice_gen.md + allspice_ms.md + allspice_bib.md = allspice.md.
    '''
    filepath = website_md + 'manuscripts/' + key + '_ms.md'
    if os.path.isfile(filepath) == True:
        # Read generated files
        with open(path_out_md + key + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Read manuscript files    
        with open(website_md + 'manuscripts/' + key + '_ms.md', 'r', encoding='utf-8') as f:
            manuscript = f.read()
        # Read bib files
        with open(path_out_md + key + '_bib.md', 'r', encoding='utf-8') as f:
            bibliography = f.read()
        # Assemble    
        page = generated + "\n\n***\n\n" + manuscript + "\n\n" + bibliography
        # Write out page file
        with open(website_md + key + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    else:
        # Read generated files
        with open(path_out_md + key + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Read bib files    
        with open(path_out_md + key + '_bib.md', 'r', encoding='utf-8') as f:
            bibliography = f.read()
        # Assemble    
        page = generated + "\n\n" + bibliography
        # Write out page file
        with open(website_md + key + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    return

# Loop through all spices
for key in list_of_keys:
    assemble_page(key)
print("Done.")

Done.


# Maps

In [88]:
df = df_items.copy()

In [89]:
# # Basic example with Plotly Go
# # Create figure data
# fig = go.Figure(data=go.Scattergeo(
#         lon = df['lon'],
#         lat = df['lat'],
#         text = df['item'],
#         mode = 'markers',
#         # marker_color = df['cnt'],
#         ))

# # Update layout
# fig.update_layout(
#         title = 'Title',
#         geo_scope='world',
#         template = 'plotly_dark'
#     )

# # Show figure
# fig.show()

### Map settings

In [90]:
# Visual variables for map (dark mode)

font_size = 12
font_color = "#ffffff"
font_family = "Noto Sans"
marker_symbol= 'circle'
marker_size = 12
max_marker_size = 32
edge_color = transparent
edge_size = 1
opacity = 0.7
line_width = 4
water = "#202020"
grid_color = "#282828"
land = "#303030"
lines = "#383838"
copyright_color = "#404040"
background_color = transparent
legend_background_color = quarter_transparent
color_scheme = prism

In [91]:
# Orthographic globe layout
ortho_traces = dict(
    textposition = 'top right', # middle left, bottom center, etc.
    textfont = dict(size=font_size, color=font_color, family=font_family),
    hovertemplate=
        "<b>%{text}</b><br><br>" +
        "Species: <i>%{customdata[0]}</i><br>" +
        "Family: <i>%{customdata[1]}</i><br>" +
        "Region of origin: %{customdata[2]}<br>" +
        "Arabic: %{customdata[3]} <i>%{customdata[4]}</i><br>" +
        "Chinese: %{customdata[5]} <i>%{customdata[6]}</i><br>" +
        # "Spreadability: %{customdata[7]:.2f}<br>" +
        "<extra></extra>",
    marker = dict(
        symbol = marker_symbol,
        size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size
        )
    )
)

ortho_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', # orthographic, natural earth
        projection_scale = 1,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

# "Document size" for pdfs
document_size = dict(width = 600, height=600)

# Copyright
cr = dict(
    name="copyright",
    text="© Gábor Parti, 2023",
    font=dict(color=copyright_color, size=8, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [cr]) # to call

## Info
info = dict(
    name="info",
    text="Click on a material to navigate to its corresponding page!",
    font=dict(color=font_color, size=font_size, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0.05,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [info]) # to call

# Adding layout images
logo = dict(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", 
    yanchor="bottom", 
)
# fig.add_layout_image(logo) # to call

In [92]:
# Natural earth layout for cloropleth maps (regions and distributions)
ne_traces = dict(
    hovertemplate=
        "Region: <i>%{customdata[0]}</i><br>" +
        "Code: <i>%{customdata[1]}</i><br>" +
        "<extra></extra>",
)

ne_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', #'natural earth',
        projection_scale = 2,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(xanchor="left", yanchor="bottom", 
                # x=0.1, y=0.1, # for natural earth
                x=0, y=0, # for orthographic
                bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

In [93]:
# Define the map layout with the Marble basemap
marble = go.Layout(
    hovermode="closest",
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        }
      ]
    # mapbox=dict(
    #     center=dict(lat=0, lon=0),
    #     style="open-street-map", 
    #     zoom=1,
    # ),
)

## Plot by family

In [94]:
# Set size
df['size'] = 1

# Create figure data
data = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='family',
    color_discrete_sequence=color_scheme,
    size_max = max_marker_size,
    size = 'size',
    opacity = opacity,
    hover_name='item',
    hover_data={'taxon_name':True, 'family':True, 'region of origin':True, 'Arabic':True, 'ar transliteration':True, 'Chinese':True, 'pinyin':True, 'lon':False, 'lat':False, 'size':False}, #'spreadability':':.2f', 
    labels={"group": "category"}
    )

# Save figure data
fig = data

###################################################
# Interactive visualization (HTML/JSON) for the web

# Call the orthographic traces and layout settings from above
fig.update_traces(ortho_traces)
fig.update_layout(ortho_layout)
# fig.update_layout(title_text = "Title")
# fig.update_layout(basemap_visible=True)

# Add copyrigth
fig.update_layout(annotations=[cr])

# Show figure
fig.show()

# Write interactive visualization (HTML/JSON) for the web
filename = "home"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

###################################################
# Image (PNG/PDF) for documents

# Call figure data
# fig = data

# Call the orthographic traces and layout settings from above
# fig.update_traces(ortho_traces)
# fig.update_layout(ortho_layout)
# fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(document_size)

# Show figure
# fig.show()

# filename = "home"
# fig.write_image(path_out_png + filename + ".png", scale=3)
# fig.write_image(filename + ".pdf", engine="kaleido")


In [95]:
# Move files to the website folder
move_dir(path_out_json, website_json, "*.json")

# Other maps

In [96]:
# # Basic map to show some countries, not very good

# import plotly.express as px
# import geopandas as gpd

# # Load the world map data
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Filter the data to include only India, Sri Lanka, and Maldives
# target_countries = ['India', 'Sri Lanka', 'Maldives', 'Wales']
# filtered_world = world[world['name'].isin(target_countries)]

# # Create a choropleth map
# fig = px.choropleth(
#     filtered_world,
#     locations='iso_a3',
#     color='name',
#     color_discrete_sequence = color_scheme,
#     projection='orthographic',
#     title='Highlighted Countries: India, Sri Lanka, Maldives'
# )

# # Call the orthographic traces and layout settings from above
# fig.update_layout(ortho_layout)
# # fig.update_layout(title_text = "Title")

# # Show the map
# fig.show()


In [97]:
df = df_items.copy()

key = "saffron"

## Distribution maps

In [98]:
def habitat_map(key):
    '''
    This function generates a map of the distributions of a spice.
    '''

    print("Drawing habitat map of", key)
    
    # Load the geographic data (shapefile or GeoJSON) # https://github.com/tdwg/wgsrpd
    gdf = gpd.read_file("data\\resources\\geo\\level3.geojson")

    # Rename columns
    gdf.columns = ['name', 'code', 'code_l2', 'code_l1', 'geometry']

    # Get local df and reset its index
    df_local = df_items.loc[df_items['key'] == key]
    df_local.reset_index(drop=True, inplace=True)

    # Get list of native regions
    native_regions = df_local.loc[0, 'native regions'].split(', ')

    # Get list of introduced regions, if it is not NA
    if pd.notna(df_local.loc[0, 'introduced regions']):
        introduced_regions = df_local.loc[0, 'introduced regions'].split(', ')
    else:
        introduced_regions = []

    # Filter data for native regions from gdf dataframe's LEVEL3_NAM column
    filtered_data_native = gdf[gdf['name'].isin(native_regions)].copy()  # Ensure a copy is made
    filtered_data_native.loc[:, 'region'] = 'native'

    # Filter data for introduced regions from gdf dataframe's LEVEL3_NAM column
    filtered_data_introduced = gdf[gdf['name'].isin(introduced_regions)].copy()  # Ensure a copy is made
    if not filtered_data_introduced.empty:
        filtered_data_introduced.loc[:, 'region'] = 'introduced'

    # Concatenate the filtered dataframes
    filtered_data = pd.concat([filtered_data_native, filtered_data_introduced])

    # ***

    # Create the choropleth map
    fig = px.choropleth(
        filtered_data,
        geojson = filtered_data.geometry,
        locations = filtered_data.index,
        color = 'region',
        color_discrete_sequence = ['#88ae43', '#6943ae'],
        hover_data = {'name': True, 'code': True},
        projection = 'natural earth'
    )

    # Call layout
    fig.update_traces(ne_traces)
    fig.update_layout(ne_layout)

    # Adjust map bounds
    # fig.update_geos(fitbounds='locations')  

    # Get centroid values for the native regions of the item
    for index, row in df_local.iterrows():
        native_distribution = row['native regions'].split(', ')
        # Filter data for native distribution from gdf dataframe's LEVEL3_NAM column
        native_data = gdf[gdf['name'].isin(native_distribution)].copy() 
        # Calculate centroid data
        native_centroid = native_data.to_crs('+proj=cea').centroid.to_crs(native_data.crs)
        native_centroid_lon = native_centroid.x.iloc[0]
        native_centroid_lat = native_centroid.y.iloc[0] + 10

    # Amend projection rotation with the native regions' centroid values
    fig.update_layout(geo=dict(projection_rotation = {'lat': native_centroid_lat, 'lon': native_centroid_lon, 'roll': 0}))

    # Show the map
    # fig.show()

    # Save
    # fig.write_html(path_out_html + key + ".html")
    fig.write_json(path_out_json + key + ".json", validate=True, pretty=True)
    # fig.write_image(path_out_png + key + ".png", scale=3)
    return

In [99]:
# # Loop through all spices
# for key in list_of_keys:
#     # If item belongs to kingdom Plantae, draw a map
#     if df_items.loc[df_items['key'] == key, 'kingdom'].iloc[0] == 'Plantae':
#         habitat_map(key)
# print("Done.")

# # Move files to the website folder
# move_dir(path_out_json, website_json + "/distributions", "*.json")

# End

In [100]:
# ...measure time
end_time = datetime.now()
print("All done at " + str(end_time) + ".")
print('Duration: {}'.format(end_time - start_time))

All done at 2023-12-08 16:16:45.228691.
Duration: 0:00:25.596728


# Notes

In [101]:
# print(gpd.datasets.available)
# import geodatasets
# geodatasets.data

# with open("data\\resources\\geo\\level3.geojson", 'r') as f:
#     geojson_data = json.load(f)

# geojson_data