# Encyclopaedia Aromatica pipeline

## Libraries

In [1]:
# !pip install -r requirements.txt

In [40]:
# Import dependencies
import pandas as pd
import numpy as np
import re
import os
from datetime import date, datetime, timedelta
from collections import defaultdict
from py_markdown_table.markdown_table import markdown_table
from scripts.functions import *

import plotly.express as px
import plotly.graph_objs as go
from plotly.io import write_image, write_json
from plotly.offline import plot

## Functions

In [3]:
# from tabulate import tabulate

# # Functions and palette
# from scripts.functions import *
# from scripts.palette import *

In [4]:

# # pip install babelnet


# 

# # from scripts.unsplash import *
# # from scripts.pexels import *

# pd.options.mode.copy_on_write = True # to avoid SettingWithCopyWarning, https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas


## Paths

In [5]:
path_in = "data/"
path_out_tex = "output/tex/"
path_out_md = "output/md/"
path_out_html = "output/html/"
path_out_json = "output/json/"
path_out_png = "output/png/"
path_out_pdf = "output/pdf/"

# Create these folders if they don't exist
for folder in [path_out_tex, path_out_md, path_out_html, path_out_json, path_out_png, path_out_pdf]:
    if not os.path.exists(folder):
        os.makedirs(folder)

website_md = "../content/items/"
website_json = "../static/plotly/"

path_downloaded_photos = "output/photos/"
website_photos = "../static/images/photos/"

In [6]:
# Start timer
start_time = datetime.now()

# Data

## Spices

In [7]:
# Read and store content of an excel file 
df = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
df.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df = pd.read_csv(path_in+'spices.csv', header=[0], delimiter=',', encoding="utf-8")

# Select ones to include
df = df.loc[(df['include'] == "in")]

# Save a list of items (the spices)
list_of_items = df['item'].tolist()
list_of_items.sort() # Sorts alphabetically, but capital letters come first

# Save a list of keys based on "item", make it lowercase and replace spaces with underscores
df['key'] = df['item'].str.lower().str.replace(" ", "_")
list_of_keys = df['key'].tolist()

# Show how many spices there are and what are they
n = len(list_of_keys)
print(n, "spices in total:, ", list_of_keys)
print(list_of_keys)

# Write n to a markdown file
number = f"As of now, the website contains information on **{n}** items."
with open("../content/number.md", "w") as file:
    file.write(number)

# Add links in a new column
df['url'] = "https://partigabor.github.io/aromatica/items/" + df['key']

# Inspect
# print(tabulate(df, headers='keys', tablefmt='psql'))
df

22 spices in total:,  ['allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'cinnamon', 'clove', 'coriander', 'cubeb', 'cumin', 'fennel', 'fenugreek', 'ginger', 'long_pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star_anise', 'turmeric', 'vanilla']
['allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'cinnamon', 'clove', 'coriander', 'cubeb', 'cumin', 'fennel', 'fenugreek', 'ginger', 'long_pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star_anise', 'turmeric', 'vanilla']


Unnamed: 0,include,v,done,id,powo,taxon_name,taxon_authors,taxon_syn,taxon_alt,group,...,eol,ncbi,Hindi,hi,hi literal,Hi alt,Indonesian,Persian,key,url
0,in,82.0,yes,,https://powo.science.kew.org/taxon/196799-2,Pimenta dioica,(L.) Merr.,Pimenta officinalis Lindl.,,false peppers,...,https://eol.org/pages/2508608,375272.0,गंधद्रव्य,gandhadravya,,,merica Jamaika,فلفل فرنگی,allspice,https://partigabor.github.io/aromatica/items/a...
1,in,79.0,,,,Pimpinella anisum,L.,,,,...,https://eol.org/pages/581422,271192.0,मोटी सौंफ़,motī saunf,fat fennel,,adas manis,بادیان رومی، انیسون,anise,https://partigabor.github.io/aromatica/items/a...
2,in,64.0,,,,Ferula foetida,(Bunge) Regel,,Ferula assa-foetida L.; Ferula narthex; et al.,,...,,371345.0,हींग,hīng,,,,,asafoetida,https://partigabor.github.io/aromatica/items/a...
5,in,48.0,,,,Carum carvi,L.,,,,...,,48032.0,,,,,,,caraway,https://partigabor.github.io/aromatica/items/c...
6,in,72.0,,1.0,https://powo.science.kew.org/taxon/796556-1,Elettaria cardamomum,(L.) Maton,Amomum cardamomum L.,,cardamoms,...,https://eol.org/pages/1120064,105181.0,इलायची,ilāychī,,,kapulaga seberang,هل سبز,cardamom,https://partigabor.github.io/aromatica/items/c...
7,in,78.0,,,,Cinnamomum aromaticum,Nees,Cinnamomum cassia (L.) J.Presl.,et al.,cinnamon,...,https://eol.org/pages/483593,119260.0,दालचीनी,dālchīnī,,दारचीनी,kayu manis,دارچین چینی,cassia,https://partigabor.github.io/aromatica/items/c...
9,in,64.0,,,,Cinnamomum verum,J.Presl,Cinnamomum zeylanicum Blume,,cinnamon,...,https://eol.org/pages/490672,128608.0,दालचीनी,dālchīnī,?,दारचीनी darchini,kayu manis,دارچین,cinnamon,https://partigabor.github.io/aromatica/items/c...
10,in,75.0,,,,Syzygium aromaticum,(L.) Merr. & L.M.Perry,Eugenia aromatica (L.) Baill.; Eugenia cayophy...,,,...,https://eol.org/pages/2508665,219868.0,लौंग,laung,,,cengkih,میخک صدپر,clove,https://partigabor.github.io/aromatica/items/c...
11,in,65.0,,,,Coriandrum sativum,L.,,,,...,https://eol.org/pages/581687,4047.0,धनिया,dhaniyā,,,ketumbar,گشنیز,coriander,https://partigabor.github.io/aromatica/items/c...
12,in,62.0,,,,Piper cubeba,L.f.,Cubeba officinalis Miq.,,false peppers,...,https://eol.org/pages/51845162,,कबाबचीनी,kabābchīnī,Chinese-cubeb,,kemukus,کبابهٔ چینی‎,cubeb,https://partigabor.github.io/aromatica/items/c...


## Botanical data

In [8]:
# Read in wcvp files
wcvp = pd.read_csv(path_in + 'resources/wcvp/wcvp_names.csv', header=[0], delimiter='|', encoding="utf-8", dtype=str)

# Check row with a taxon name
# wcvp.loc[wcvp['taxon_name'] == "Zingiber officinale"]

# Merge wcvp plant data with spice data
df = pd.merge(df, wcvp, on=['taxon_name', 'taxon_authors'], how='left')

# Add family name "Animalia" to animal spices that don't have value in family column
# df.loc[(df['kingdom'] == "Animalia") & (df['family'].isnull()), 'family'] = "Animalia"

# Add POWO links by adding the powo_id to the end of the link "https://powo.science.kew.org/taxon/"
df['powo_url'] = "https://powo.science.kew.org/taxon/" + df['powo_id']

# Look at powo column of a spice
# print("Print POWO link for checking:", df.loc[df['key'] == 'ginger', ['powo_url']], "\n")

# Show items with missing POWO links
print("List items with no POWO links:\n", df.loc[df['powo_url'].isnull(), ['item', 'powo_url']])

List items with no POWO links:
 Empty DataFrame
Columns: [item, powo_url]
Index: []


In [9]:
# Read in wcvp plant distribution files
wcvp_dist = pd.read_csv(path_in+'resources/wcvp/wcvp_distribution.csv', header=[0], delimiter='|', encoding="utf-8", dtype=str)

# Check line by plant_name_id
wcvp_dist.loc[wcvp_dist['plant_name_id'] == "273361"]

# Iterate through df to get native and introduced areas and regions
for index, row in df.iterrows():
    # Get a list of all vales in the area column for this plant_name_id, where 'introduced' is 0
    native = wcvp_dist.loc[(wcvp_dist['plant_name_id'] == row['plant_name_id']) & (wcvp_dist['introduced'] == "0"), 'area'].tolist()
    # Convert list to string
    native = ', '.join(native)
    # Add to df
    df.at[index, 'native'] = native
    
    # Get a list of all vales in the area column for this plant_name_id, where 'introduced' is 1
    introduced = wcvp_dist.loc[(wcvp_dist['plant_name_id'] == row['plant_name_id']) & (wcvp_dist['introduced'] == "1"), 'area'].tolist()
    # Convert list to string
    introduced = ', '.join(introduced)
    # Add to df
    df.at[index, 'introduced'] = introduced
    
    # Get a list of all vales in the region column for this plant_name_id, where 'introduced' is 0
    regions = wcvp_dist.loc[(wcvp_dist['plant_name_id'] == row['plant_name_id']) & (wcvp_dist['introduced'] == "0"), 'region'].tolist()
    # Remove duplicates from the list
    regions = list(dict.fromkeys(regions))
    # Convert list to string
    regions = ', '.join(regions)
    # Add to df
    df.at[index, 'regions'] = regions
    
    # Get list of all values in the continent column for this plant_name_id, where 'introduced' is 0
    continents = wcvp_dist.loc[(wcvp_dist['plant_name_id'] == row['plant_name_id']) & (wcvp_dist['introduced'] == "0"), 'continent'].tolist()
    # Remove duplicates from the list
    continents = list(dict.fromkeys(continents))
    # Convert list to string
    continents = ', '.join(continents)
    # Title case continents column values
    continents = continents.title()    
    # Add to df
    df.at[index, 'continents'] = continents
     
# # Add counts of distributions
# df['n_of_native'] = df['native'].str.count(',') + 1
# df['n_of_introduced'] = df['introduced'].str.count(',') + 1

# Check if any native or introduced areas are missing
print("List items with no native areas:\n", df.loc[df['native'].isnull(), ['item', 'native']])

List items with no native areas:
 Empty DataFrame
Columns: [item, native]
Index: []


## Geocordinates (online)

In [10]:
if internet_is_on():
    
    # Location coordinates
    # generate_coordinates(df) # Generate geo-coordinates from location column
    centroid_coordinates(df) # Generate geo-coordinates by finding the centroid of the native areas

Calculating coordinates of allspice
Calculating coordinates of anise
Calculating coordinates of asafoetida
Calculating coordinates of caraway
Calculating coordinates of cardamom
Calculating coordinates of cassia
Calculating coordinates of cinnamon
Calculating coordinates of clove
Calculating coordinates of coriander
Calculating coordinates of cubeb
Calculating coordinates of cumin
Calculating coordinates of fennel
Calculating coordinates of fenugreek
Calculating coordinates of ginger
Calculating coordinates of long pepper
Calculating coordinates of mace
Calculating coordinates of nutmeg
Calculating coordinates of pepper
Calculating coordinates of saffron
Calculating coordinates of star anise
Calculating coordinates of turmeric
Calculating coordinates of vanilla


## Definitions (online)

In [11]:
# import nltk
# from nltk.corpus import wordnet as wn
# !nltk.download("wordnet")
# !nltk.download("omw-1.4")
# !nltk.download("extended_omw") # if you want the wiktionary data

# Wordnets using the Open Multilingual WordNet (https://omwn.org/omw1.html) # 100%: cmn, fin, hrv
# # wn_langs = ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'slv', 'spa', 'swe', 'tha', 'zsm'] # 100%: cmn, fin, hrv

# wn_langs = ['fra'] # 'eng', 'arb', 'cmn', 
# print(wn.synset('allspice.n.03').definition())
# print(wn.synsets('allspice', pos='n'))
# print(wn.synset('allspice.n.03').lemma_names('ita'))

# #WordNet definitions
# def wn_definition(df):
#     '''
#     Returns a dataframe with Wordnet definitions of the words in the wn column.

#         Parameters:
#             df (pandas dataframe): Dataframe with a wn column.

#         Returns:
#             df (pandas dataframe): Dataframe with Wordnet definitions of the words in the wn column.
#     '''
#     for index, row in df.iterrows():
#         if pd.notna(row['wn']):
#             wn_definition = str(wn.synset(row['wn']).definition())
#             df.at[index, "wn_definition"] = str(wn_definition)
#     return df

In [12]:
# if internet_is_on():
#     # Get a definition from WordNet
#     wn_definition(df)

## Translations (online)

In [13]:

# ### WordNet translations
# def wn_translate(df, lan):
#     '''
#     Returns a dataframe with Wordnet translations of the words in the wn column.

#         Parameters:
#             df (pandas dataframe): Dataframe with a wn column.
#             lan (str): Language code of the language to translate to.

#         Returns:
#             df (pandas dataframe): Dataframe with Wordnet translations of the words in the wn column.
#     '''
#     for index, row in df.iterrows():
#         if pd.notna(row['wn']):
#             translated_list = wn.synset(row['wn']).lemma_names(lan)
#             translated = ", ".join(str(x) for x in translated_list)
#             translated = re.sub("_", " ", translated)
#             df.at[index, f"wn_translation_{lan}"] = translated
#     return df



# ### Translator
# from googletrans import Translator
# translator = Translator()
# def translate(input, language):
#     '''
#     Returns a translation of a word or phrase into a language.
    
#             Parameters:
#                 input (str): Word or phrase to translate.
#                 language (str): Language code of the language to translate to.
    
#             Returns:
#                 translated (str): Translation of the word or phrase into the language.
#         '''
#     translated = translator.translate(input, dest=language)
#     return translated.text

   

# #### Translate with DeepL, using Google Translate
# # https://developers.google.com/admin-sdk/directory/v1/languages
# # dl_languages = {'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'assamese': 'as', 'aymara': 'ay', 'azerbaijani': 'az', 'bambara': 'bm', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bhojpuri': 'bho', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-CN', 'chinese (traditional)': 'zh-TW', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dhivehi': 'dv', 'dogri': 'doi', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'ewe': 'ee', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'guarani': 'gn', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'iw', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'ilocano': 'ilo', 'indonesian': 'id', 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'jw', 'kannada': 'kn', 'kazakh': 'kk', 'khmer': 'km', 'kinyarwanda': 'rw', 'konkani': 'gom', 'korean': 'ko', 'krio': 'kri', 'kurdish (kurmanji)': 'ku', 'kurdish (sorani)': 'ckb', 'kyrgyz': 'ky', 'lao': 'lo', 'latin': 'la', 'latvian': 'lv', 'lingala': 'ln', 'lithuanian': 'lt', 'luganda': 'lg', 'luxembourgish': 'lb', 'macedonian': 'mk', 'maithili': 'mai', 'malagasy': 'mg', 'malay': 'ms', 'malayalam': 'ml', 'maltese': 'mt', 'maori': 'mi', 'marathi': 'mr', 'meiteilon (manipuri)': 'mni-Mtei', 'mizo': 'lus', 'mongolian': 'mn', 'myanmar': 'my', 'nepali': 'ne', 'norwegian': 'no', 'odia (oriya)': 'or', 'oromo': 'om', 'pashto': 'ps', 'persian': 'fa', 'polish': 'pl', 'portuguese': 'pt', 'punjabi': 'pa', 'quechua': 'qu', 'romanian': 'ro', 'russian': 'ru', 'samoan': 'sm', 'sanskrit': 'sa', 'scots gaelic': 'gd', 'sepedi': 'nso', 'serbian': 'sr', 'sesotho': 'st', 'shona': 'sn', 'sindhi': 'sd', 'sinhala': 'si', 'slovak': 'sk', 'slovenian': 'sl', 'somali': 'so', 'spanish': 'es', 'sundanese': 'su', 'swahili': 'sw', 'swedish': 'sv', 'tajik': 'tg', 'tamil': 'ta', 'tatar': 'tt', 'telugu': 'te', 'thai': 'th', 'tigrinya': 'ti', 'tsonga': 'ts', 'turkish': 'tr', 'turkmen': 'tk', 'twi': 'ak', 'ukrainian': 'uk', 'urdu': 'ur', 'uyghur': 'ug', 'uzbek': 'uz', 'vietnamese': 'vi', 'welsh': 'cy', 'xhosa': 'xh', 'yiddish': 'yi', 'yoruba': 'yo', 'zulu': 'zu'}

# dl_languages = {'french': 'fr'} # 'hungarian': 'hu', 'english': 'en', 'arabic': 'ar', 'chinese': 'zh-TW',
# dl_language_list = list(dl_languages.values())

# from deep_translator import GoogleTranslator as dl
# # translated = dl(source='en', target='hu').translate("allspice") # api_key=openai
# # print(translated)

# def dl_translate(df, lang):
#     '''
#     Returns a dataframe with DeepL translations of the words in the English column.

#         Parameters:
#             df (pandas dataframe): Dataframe with an English column.
#             lang (str): Language code of the language to translate to.

#         Returns:
#             df (pandas dataframe): Dataframe with DeepL translations of the words in the English column.
#     '''    
#     for index, row in df.iterrows():
#         if pd.notna(row['English']):
#             translated = dl(source='en', target=lang).translate(row['English'])
#             df.at[index, f"dl_translation_{lang}"] = translated
#     return df

In [14]:

    
    # # Translate using WordNet
    # wn_langs = ['fra', 'ita']
    # for lan in wn_langs:
    #     wn_translate(df, lan)
    
    # # Translate using DeepL
    # dl_langs = ['fr', 'it']
    # for lang in dl_langs:
    #     dl_translate(df, lang)
        
    # # Create a new column called french and fill with NaN
    # df['french'] = np.nan
    # df['italian'] = np.nan
    
    # # Change "" to NaN in df
    # df = df.replace(r'^\s*$', np.nan, regex=True)
    
    # # Fill in 'french' column with values from 'wn_translation_fra' or 'dl_translation_fr' column where it is NaN
    # df['french'] = df['french'].fillna(df['wn_translation_fra'])
    # df['french'] = df['french'].fillna(df['dl_translation_fr'])
    
    # df['italian'] = df['italian'].fillna(df['wn_translation_ita'])
    # df['italian'] = df['italian'].fillna(df['dl_translation_it'])

# # Check
# df_translations = df.filter(regex='fr')
# df_translations

In [15]:
# Checking lat and lon for an item
print("Check coordinates:", df.loc[df['key'] == 'ginger', ['lat', 'lon']])

# List which items have NaN lat and lon values
print("List of items with no coordinates:", df.loc[(df['lat'].isnull()) & (df['lon'].isnull()), ['item', 'lat', 'lon']])

Check coordinates:      lat   lon
13  26.0  93.0
List of items with no coordinates: Empty DataFrame
Columns: [item, lat, lon]
Index: []


In [16]:
# Check if any coordinates overlap (identical lat and lon values in different rows)
print("List of items with identical coordinates:", df.loc[df.duplicated(subset=['lat', 'lon'], keep=False), ['item', 'lat', 'lon']])

# # Repel data points by adding a small random value to each coordinate
# df['lat'] = df['lat'] + np.random.uniform(-5, 5, len(df))
# df['lon'] = df['lon'] + np.random.uniform(-5, 5, len(df))

List of items with identical coordinates: Empty DataFrame
Columns: [item, lat, lon]
Index: []


In [17]:
# Subsetting categories (spices, herbs, incense)
# df_spices = df.loc[(df['id'] == "S")]
# print(df_spices.shape[0])
# df_herbs = df.loc[(df['id'] == "H")]
# print(df_herbs.shape[0])
# df_incense = df.loc[(df['id'] == "I")]
# print(df_incense.shape[0])

In [18]:
# Assign
df_items = df.copy()
df.head()

Unnamed: 0,include,v,done,id,powo,taxon_name,taxon_authors,taxon_syn,taxon_alt,group,...,homotypic_synonym,parent_plant_name_id,powo_id,hybrid_formula,reviewed,powo_url,native,introduced,regions,continents
0,in,82.0,yes,,https://powo.science.kew.org/taxon/196799-2,Pimenta dioica,(L.) Merr.,Pimenta officinalis Lindl.,,false peppers,...,,156124,196799-2,,Y,https://powo.science.kew.org/taxon/196799-2,"Mexico Gulf, Mexico Southwest, Mexico Southeas...","Society Is., Caroline Is., Hawaii, Florida, El...","Mexico, Central America, Caribbean","Northern America, Southern America"
1,in,79.0,,,,Pimpinella anisum,L.,,,,...,,2402470,846658-1,,N,https://powo.science.kew.org/taxon/846658-1,"Cyprus, Lebanon-Syria, Palestine, Turkey","Denmark, Norway, Sweden, Austria, Czechoslovak...",Western Asia,Asia-Temperate
2,in,64.0,,,,Ferula foetida,(Bunge) Regel,,Ferula assa-foetida L.; Ferula narthex; et al.,,...,,2808299,842277-1,,N,https://powo.science.kew.org/taxon/842277-1,"Kazakhstan, Kirgizstan, Turkmenistan, Tadzhiki...",,"Middle Asia, Western Asia, Indian Subcontinent","Asia-Temperate, Asia-Tropical"
3,in,48.0,,,,Carum carvi,L.,,,,...,,2701447,839677-1,,N,https://powo.science.kew.org/taxon/839677-1,"Denmark, Finland, Norway, Sweden, Austria, Bel...","Føroyar, Great Britain, Iceland, Ireland, Alge...","Northern Europe, Middle Europe, Southwestern E...","Europe, Asia-Temperate, Asia-Tropical"
4,in,72.0,,1.0,https://powo.science.kew.org/taxon/796556-1,Elettaria cardamomum,(L.) Maton,Amomum cardamomum L.,,cardamoms,...,,243049,796556-1,,Y,https://powo.science.kew.org/taxon/796556-1,India,"Réunion, Bangladesh, Cambodia, Thailand, Vietn...",Indian Subcontinent,Asia-Tropical


In [19]:
# Save for checking as xlsx
# df.to_excel("checking.xlsx", index = None, header=True)

## Images

In [20]:
# ##################
# ### Downloader ###
# ##################

# # Download images from Unsplash and Pexels to be curated later, query with underscores
# search = "pink pepper"
# dashed_search = re.sub(" ", "-", search)
# unsplash_downloader(dashed_search, path_downloaded_photos)
# pexels_downloader(search, path_downloaded_photos)

# # Move images to the right folder, regardless of extension
# move_dir(path_downloaded_photos, website_photos, "*.png")
# move_dir(path_downloaded_photos, website_photos, "*.jpg")
# move_dir(path_downloaded_photos, website_photos, "*.jpeg")

In [21]:
# Images df
# # Define the relative folder path
# folder_path = '../static/images/photos'

# # List of keys
# list_of_keys.sort()

# # Initialize item_info dictionary
# item_info = {}  # Dictionary to store item information

# # Initialize item counts, extensions, file names, and sources to empty lists for each item
# for item in list_of_keys:
#     item_info[item] = {'count': 0, 'extensions': [], 'file_names': [], 'sources': []}

# # Iterate through the files in the folder
# for filename in os.listdir(folder_path):
#     if os.path.isfile(os.path.join(folder_path, filename)):
#         # Extract the item name, extension, file name, and source from the file name
#         item_name = filename.split('-')[0]
#         extension = filename.split('.')[-1]
#         file_name = filename
#         source = filename.split('-')[2] if len(filename.split('-')) > 2 else ""
#         # Remove the file extension from the source
#         source = source.split('.')[0]
        
#         # Check if the item name is in the list_of_items
#         if item_name in list_of_keys:
#             item_info[item_name]['count'] += 1
#             item_info[item_name]['extensions'].append(extension)
#             item_info[item_name]['file_names'].append(file_name)
#             item_info[item_name]['sources'].append(source)

# # Create a Pandas DataFrame from the item_info dictionary
# data = {'key': [], 'count': [], 'source': [], 'extension': []}
# for item, info in item_info.items():
#     data['key'].append(item)
#     data['count'].append(info['count'])
#     data['source'].append(', '.join(info['sources']))
#     data['extension'].append(', '.join(info['extensions']))

# df = pd.DataFrame(data)

# # Fill in missing items with 0 image counts and empty sources, extensions
# for item in list_of_keys:
#     if item not in df['key'].values:
#         df = df.append({'key': item, 'count': 0, 'source': '', 'extension': ''}, ignore_index=True)

# # Reorder the DataFrame with columns 'item', 'count', 'source', 'extension'
# df = df[['key', 'count', 'source', 'extension']]

# # Sort the DataFrame by 'item'
# df = df.sort_values(by='key')

# # Reset the index of the DataFrame
# df = df.reset_index(drop=True)

# # Rename the columns
# df.rename(columns={'count': 'img_count', 'source': 'img_source', 'extension': 'img_extension'}, inplace=True)

# # Save
# df_images = df.copy()

# # Display the final DataFrame
# print(df)

# # Merge the two dataframes
# df_items = pd.merge(df_items, df_images, on='key', how='left')

In [22]:
# Create thumbnails for images
for photo in list_files("../static/images/photos"):
    create_thumbnail(photo)

## Names

In [23]:
# # Read and store content of an excel file 
# df = pd.read_excel(path_in+"names.xlsx")

# # Write the dataframe object into csv file
# df.to_csv (path_in+"names.csv", index = None, header=True)

# # Load in dataset of names
# df = pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

# # Select ones to include
# df = df.loc[df['include'] == 'yes'] # include ones to include

In [24]:
# # Change NaN to empty string
# df.fillna('', inplace=True)

# # Info
# print(df.shape[0], "names in total.")

# # Assign
# df_names = df.copy()

## Etymologies

In [25]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)

# Load in dataset
df_etymologies=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df_etymologies, df_etymologies[df_etymologies.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  df_list.append(df)

# Automatically extract IDs from the dataset
list_of_etymologies = []
for df in df_list:
  # print(df['item'].iloc[0])
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  item = str(df['item'].iloc[0])
  list_of_etymologies.append(item)

# Print the number if IDs and what are they
length = len(df_list)
print(length, "words in total")
print(list_of_etymologies)

#Create a defaultdict of spice-word etymologies
etymologies=defaultdict(list)
for i in range(length):
  etymologies[list_of_etymologies[i]]=df_list[i]

# Testing
print(etymologies['saffron'])

84 words in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']
     # include lang     item spice w


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



### Etymology box for LaTeX

In [26]:
# key = "tester"

# ################################################################################

# # The following code will create an etymology box environment for the key, to be used in LaTeX
# print("Started the generation of '" + key + "' as etymbox...")

# df_local = etymologies[key]
# # df_local.fillna('', inplace=True)

# # # Skipping those marked
# df_local = df_local[df_local['boxskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# # # Replace empty cells with NaNs
# # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

# # Create content and sources stage by stage
# content = ""
# source = ""
# sources = ""
# source_list = []
# nl = "\n"

# for index, row in df_local.iterrows():
#   stage = "< "
#   if pd.notna(row['complex']): # complex relationships
#     row['complex'] = re.sub("and from", "+", row['complex'])
#     stage += row['complex'] + " "
#   if pd.notna(row['language']): # language
#     stage += "\\textbf{" + row['language'] + "} "
#   if pd.notna(row['script']): # script
#     script = "{" + row['script'] + "} "
#     if row['language'] == 'Chinese':
#       script = "\\tc{" + row['script'] + "} "
#     stage += script
#   if pd.notna(row['term']): # term
#     stage += "\\textit{" + row['term'] + "} "
#   if pd.notna(row['IPA']): # IPA
#     stage += row['IPA'] + " "
#   if pd.notna(row['meaning']): # meaning
#     stage += "`" + row['meaning'] + "' "
#   if pd.notna(row['literal']): # literal meaning
#     stage += "[" + row['literal'] + "] "
#   stage = re.sub(' $', '', stage)
#   stage += ", "
#   if pd.notna(row['explanation']): # explanation
#     stage += row['explanation'] + " "
#   if pd.notna(row['remark']): # remark
#     stage += "(" + row['remark'] + ") "
#   stage = re.sub(',? ?$', '', stage)

#   if pd.notna(row['date']): # dates
#     stage += ", "
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#     stage += date
#     stage = re.sub(',? ?$', '', stage)
#   if pd.notna(row['cognates']): # cognates
#     stage += "; cf. cognates " + row['cognates'] + " "
#   if pd.notna(row['derivates']): # cognates
#     if pd.notna(row['cognates']):
#       stage = re.sub(' $', '', stage)
#       stage += "; " + row['derivates'] + " "
#     else:
#       stage = re.sub(' $', '', stage)
#       stage += "; cf. " + row['derivates'] + " "
#   stage = re.sub(',? ?$', '', stage)
#   # stage = re.sub('cf\..*?(cf\.)', '', stage)

# # Final touches
#   if row['doubt'] == 'yes':
#     stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#   if row['complex'] == '+':
#     stage = re.sub('<', '', stage)
#   if row['complex'] == 'or from':
#     stage = re.sub('<', '', stage)
#   content += stage + nl

# # Sources
#   source=""
#   if pd.notna(row['source zotero']):
#     source = row['source zotero']
#     print("1",source)
#     if '{' in source:
#       source = "s" + row['source zotero'].lower()
#       print(source)
#     else:
#       source = "{" + row['source zotero'].lower() + "}"
#       print(source)
#     if pd.notna(row['source page']):
#       source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       print("4",source)
#       if row['source page'].isalpha() == True:
#         source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         print("5",source)
#     source = "\\textcite" + source
#   print(source)
#   source_list.append(source)

# # clear duplicates from sources:
# print("SL1: ", source_list)
# # source_set = sorted(set(source_list), key=source_list.index)
# source_set = set(source_list)
# print("SS2: ", source_set)
# source_list2 = list(source_set)
# print("S3: ", source_list2)
# sources_unduplicated = '; '.join(source_list2)
# print("S4: ", sources_unduplicated)
# # test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i) # this method catches the first duplicate entries, and appends them to the list

# # The next stage is to print the duplicate entries, and the unique entries
# print("List of duplicates", duplist)
# print("Unique Item List", newlist) 
# if len(duplist) > 0:
#   # print("UNDUPL")
#   sources = sources_unduplicated
# else:
#   # print("ORI")
#   sources =  '; '.join(source_list)
# # print("S5: ", sources)

# sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\\footnote{" + sources + "}\n"

# content = re.sub("\n$", "", content)
# content = re.sub(r"^< ", "", content) # delete the first <
# content = re.sub(r"\n,", ",", content)
# content = re.sub(r" nan ", " ", content)
# content = re.sub("(<\.\n?)+$", "", content)

# content += sources

# env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
# env_end = r"\end{etymology}"

# box = env_begin + content + env_end
# box = re.sub(r"\u200e", "", box) #removes right to left mark

# # Save the spicebox as a standalone tex file

# filename = re.sub(" ", "_", key)
# filename = filename.lower()
# f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')
# f.write(box)
# f.close()
# print("Etymology-box '" + str(key) + "' as a tex file was created.")
# box


In [27]:
# def etymbox(key):
  
#   # The following code will create a etymology box environment for the key, to be used in LaTeX
#   print("Started the generation of '" + key + "' as etymbox...")

#   df_local = etymologies[key]
#   # df_local.fillna('', inplace=True)

#   # # Skipping those marked
#   df_local = df_local[df_local['boxskip'] != 'yes']
#   df_local.reset_index(inplace=True, drop=True)

#   # # Replace empty cells with NaNs
#   # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
#   # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

#   # Create content and sources stage by stage
#   content = ""
#   source = ""
#   sources = ""
#   source_list = []
#   nl = "\n"

#   for index, row in df_local.iterrows():
#     stage = "< "
#     if pd.notna(row['complex']): # complex relationships
#       row['complex'] = re.sub("and from", "+", row['complex'])
#       stage += row['complex'] + " "
#     if pd.notna(row['language']): # language
#       stage += "\\textbf{" + row['language'] + "} "
#     if pd.notna(row['script']): # script
#       script = "{" + row['script'] + "} "
#       if row['language'] == 'Chinese':
#         script = "\\tc{" + row['script'] + "} "
#       stage += script
#     if pd.notna(row['term']): # term
#       stage += "\\textit{" + row['term'] + "} "
#     if pd.notna(row['IPA']): # IPA
#       stage += row['IPA'] + " "
#     if pd.notna(row['meaning']): # meaning
#       stage += "`" + row['meaning'] + "' "
#     if pd.notna(row['literal']): # literal meaning
#       stage += "[" + row['literal'] + "] "
#     stage = re.sub(' $', '', stage)
#     stage += ", "
#     if pd.notna(row['explanation']): # explanation
#       stage += row['explanation'] + " "
#     if pd.notna(row['remark']): # remark
#       stage += "(" + row['remark'] + ") "
#     stage = re.sub(',? ?$', '', stage)

#     if pd.notna(row['date']): # dates
#       stage += ", "
#       row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#       row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#       if re.match('^-\d\d?$', row['date']): # if is a century BC
#         row['date'] = re.sub("-", "", row['date'])
#         date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#       elif re.match('^\d\d?$', row['date']): # if is a century AD
#         date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#       else:
#         date = row['date'] + " " # if it's a year
#       stage += date
#       stage = re.sub(',? ?$', '', stage)
#     if pd.notna(row['cognates']): # cognates
#       stage += "; cf. cognates " + row['cognates'] + " "
#     if pd.notna(row['derivates']): # cognates
#       if pd.notna(row['cognates']):
#         stage = re.sub(' $', '', stage)
#         stage += "; " + row['derivates'] + " "
#       else:
#         stage = re.sub(' $', '', stage)
#         stage += "; cf. " + row['derivates'] + " "
#     stage = re.sub(',? ?$', '', stage)
#     # stage = re.sub('cf\..*?(cf\.)', '', stage)

#   # Final touches
#     if row['doubt'] == 'yes':
#       stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#     if row['complex'] == '+':
#       stage = re.sub('<', '', stage)
#     if row['complex'] == 'or from':
#       stage = re.sub('<', '', stage)
#     content += stage + nl

#   # Sources
#     source=""
#     if pd.notna(row['source zotero']):
#       source = row['source zotero']
#       # print(source)
#       if '{' in source:
#         source = "s" + row['source zotero'].lower()
#         # print(source)
#       else:
#         source = "{" + row['source zotero'].lower() + "}"
#         # print(source)
#       if pd.notna(row['source page']):
#         source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         # print(source)
#         if row['source page'].isalpha() == True:
#           source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#           # print(source)
#       source = "\\textcite" + source
#     # print(source)
#     source_list.append(source)

#   # clear duplicates from sources:
#   # print("SL1: ", source_list)
#   # source_set = sorted(set(source_list), key=source_list.index)
#   source_set = set(source_list)
#   # print("SS2: ", source_set)
#   source_list2 = list(source_set)
#   # print("S3: ", source_list2)
#   sources_unduplicated = '; '.join(source_list2)
#   # print("S4: ", sources_unduplicated)
#   # test for duplicates
#   newlist = [] # empty list to hold unique elements from the list
#   duplist = [] # empty list to hold the duplicate elements from the list
#   for i in source_list:
#       if i not in newlist:
#           newlist.append(i)
#       else:
#           duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
#   # The next stage is to print the duplicate entries, and the unique entries
#   # print("List of duplicates", duplist)
#   # print("Unique Item List", newlist) 
#   if len(duplist) > 0:
#     # print("UNDUPL")
#     sources = sources_unduplicated
#   else:
#     # print("ORI")
#     sources =  '; '.join(source_list)
#   # print("S5: ", sources)

#   sources =  '; '.join(source_list)

#   # Cleaning
#   sources = re.sub("; $", "", sources)
#   sources = re.sub("^; ", "", sources)
#   sources = re.sub("(; )+", "; ", sources)
#   sources = "\\footnote{" + sources + "}\n"

#   content = re.sub("\n$", "", content)
#   content = re.sub(r"^< ", "", content) # delete the first <
#   content = re.sub(r"\n,", ",", content)
#   content = re.sub(r" nan ", " ", content)
#   content = re.sub("(<\.\n?)+$", "", content)

#   content += sources

#   env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
#   env_end = r"\end{etymology}"

#   box = env_begin + content + env_end
#   box = re.sub(r"\u200e", "", box) #removes right to left mark

#   # Save the spicebox as a standalone tex file
#   filename = re.sub(" ", "_", key)
#   filename = filename.lower()
#   f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')  
#   f.write(box)
#   f.close()
#   print("Etymology-box '" + str(key) + "' as a tex file was created.")

#   return box

# etymbox("tester")

### Etymology box for Markdown

In [28]:
def etymbox(e):

  # The following code will create a etymology box environment for the key, to be used in Markdown
  print("Started the generation of '" + e + "' as etymbox...")

  # Select word
  df_local = etymologies[e]
  # df_local.fillna('', inplace=True)

  # Skipping those marked
  df_local = df_local[df_local['skip'] != 'yes']
  df_local.reset_index(inplace=True, drop=True)

  # # Replace empty cells with NaNs
  # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
  # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

  content = ""
  
  # Iterate through a word's etymology dataframe (stage by stage = row by row)
  for index, row in df_local.iterrows():
    # Initialize stage, add an '<' if its not the first stage
    if index == 0:
      stage = ""
    else:
      stage = "< "
    # If there are complex relations, add them (e.g., partly, and, or)
    if pd.notna(row['complex']):
      stage += row['complex'] + " "
    # Add language (in bold)
    if pd.notna(row['language']):
      stage += "**" + row['language'] + "** "
    # Add the term with native script (if exists)
    if pd.notna(row['script']):
      stage += row['script'] + " "
    # Add the term with transcription (in italics)
    if pd.notna(row['term']):
      stage += "*" + row['term'] + "* "
    # Add /IPA/
    if pd.notna(row['IPA']):
      stage += "/" + row['IPA'] + "/ "
    # Add 'meaning', gloss
    if pd.notna(row['meaning']):
      stage += "'" + row['meaning'] + "' "
    # Add the [literal meaning] if there is one
    if pd.notna(row['gloss']):
      stage += "[" + row['gloss'] + "] "
    # Clear ending
    stage = re.sub(' $', '', stage)
    # Add explanation
    if pd.notna(row['explanation']):
      stage += ", " + row['explanation'] + " "
    # Add (remark)
    if pd.notna(row['remark']):
      stage += " (" + row['remark'] + ") "
    # Clear ending
    stage = re.sub(" +", " ", stage)
    stage = re.sub(",? ?$", "", stage)

    # Add date, if there is a date
    if pd.notna(row['date']):
      # If it's a year
      date = row['date']

    # Add century if there is no date
    if pd.notna(row['century']) and pd.isna(row['date']):
      
      # If it's a century BC
      if re.match('^-\d\d?\??$', row['century']):
        # Remove dash
        row['century'] = re.sub("-", "", row['century'])
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date BC
          # date = roman(int(row['century'])) + " BC?" # ROMAN NUMERALS
          date = row['century'] + " c. BC?" # ARAB NUMERALS
        else:
          # date = roman(int(row['century'])) + " BC" # ROMAN NUMERALS
          date = row['century'] + " c. BC" # ARAB NUMERALS

      # If it is a century AD
      elif re.match('^\d\d?\??$', row['century']):
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date AD
          # date = "AD " + roman(int(row['century'])) + "?" # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS
        else:
          # date = "AD " + roman(int(row['century'])) # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS

      # Add date to stage
      stage += ", " + date
      
    # Clear ending
    stage = re.sub(',? ?$', '', stage)

    # # If both cognates and derivates
    # if pd.notna(row['cognates']) and pd.notna(row['derivates']):
    #   stage += "; cf. cognates " + row['cognates'] + "; derivates " + row['derivates'] + " "
    # # If cognates only
    # if pd.notna(row['cognates']) and pd.isna(row['derivates']):
    #   stage += "; cf. cognates " + row['cognates'] + " "
    # # If derivates only
    # if pd.notna(row['derivates']) and pd.isna(row['cognates']):
    #   stage += "; cf. derivates " + row['derivates'] + " "
    
    # # Clear ending
    # stage = re.sub(';?,? ?$', '', stage)

    # # If stage is doubtful, use '<?'
    # if row['doubt'] == 'yes':
    #   stage = re.sub('<', '<\?', stage)
    # # If stage is "complex", remove '<'
    # if pd.notna(row['complex']):
    #   stage = re.sub('<', '', stage)

    # # Sources (at each stage) A
    # source = ""
    # # If there is source (zotero), add
    # if pd.notna(row['source']):
    #   source = '\"' + row['source'] + '\"'
    #   # If there is page, add
    #   if pd.notna(row['source page']):
    #     source = '\"' + str(row['source'].lower()) + '\" \"' + str(row['source page']) + '\"'
    # # Add the Hugo shortcode syntax 
    # source = r' {{< cite ' + source + r' >}}'
    # # Create content
    # content += stage + source + "\n"

    # # Sources (once in the end, removing duplicates) B
    # # If there is source (zotero), add
    # if pd.notna(row['source']):
    #   sources += row['source'] + ";"
    #   # If there is page, add
    #   if pd.notna(row['source page']):
    #     source_pages += str(row['source page']) + ";"
    #   else:
    #     source_pages += ";"
    # # Create content
    # content += stage + "\n"

    # Create content
    content += stage + "\n"
    
    # Sources C
    # If there is source (zotero), add
#     if pd.notna(row['sources']):
#       sources += row['sources'] + ";"
      
#   # Turn sources into a list divided by ';'
#   source_list = sources.split(';')
#   # Remove duplicates
#   source_list = list(dict.fromkeys(source_list))
#   # Convert list to string
#   sources = ';'.join(source_list)    

#   # Clean ending
#   sources = re.sub(';?$', '', sources)
# #   source_pages = re.sub(';?$', '', source_pages)

#   # Add the Hugo-cite shortcode syntax
#   sources = r' {{< cite "' + sources + r'" >}}'
# #   source = r' {{< cite "' + sources + r'" "' + source_pages + r'" >}}'
#   # Add source to content
#   content += sources

  # Cleaning
  box = content
  box = re.sub(r"\u200e", "", box) # Removes right-to-left mark

  # # Save the spicebox as a standalone markdown file (if ever needed)
  # filename = re.sub(" ", "_", key)
  # filename = filename.lower()
  # f = open(path_out_md + "{}.md".format("etymbox_" + filename), "w", encoding='utf-8')  
  # f.write(box)
  # f.close()
  # print("Etymology-box '" + str(key) + "' as a md file was created.")

  return box

etymbox("saffron")

Started the generation of 'saffron' as etymbox...


"**English** *saffron*\n< **French** *safran* 'id.'\n< **Medieval Latin** *saffrānum* 'id.'\n< **Arabic** زعفران *zaʿfarān* 'id.' (not connected with \\textit{ṣafrā'} feminine of \\textit{aṣfar} `yellow')\n"

In [29]:
# Creating a dictionary of etymologies
dictionary_of_etymologies = {}

# Loop
for e in list_of_etymologies:
    box = (etymbox(e))
    # box = r'{{% notice style="primary" title="Pirates" icon="skull-crossbones" %}}' + "\n" + text + "\n" + r"{{% /notice %}}" + "\n\n"
    dictionary_update = {e: box}
    dictionary_of_etymologies.update(dictionary_update)
print('Done.')


Started the generation of 'tester' as etymbox...
Started the generation of 'allspice' as etymbox...
Started the generation of 'fulful ifranji' as etymbox...
Started the generation of 'duoxiangguo' as etymbox...
Started the generation of 'pimento' as etymbox...
Started the generation of 'anise' as etymbox...
Started the generation of 'anisun' as etymbox...
Started the generation of 'huiqin' as etymbox...
Started the generation of 'asafoetida' as etymbox...
Started the generation of 'hing' as etymbox...
Started the generation of 'hiltit' as etymbox...
Started the generation of 'anjudan' as etymbox...
Started the generation of 'awei' as etymbox...
Started the generation of 'xingqu' as etymbox...
Started the generation of 'caraway' as etymbox...
Started the generation of 'karawiya' as etymbox...
Started the generation of 'geluzi' as etymbox...
Started the generation of 'cardamom' as etymbox...
Started the generation of 'amomum' as etymbox...
Started the generation of 'hal' as etymbox...
St

In [30]:
# Check
dictionary_of_etymologies['saffron']

"**English** *saffron*\n< **French** *safran* 'id.'\n< **Medieval Latin** *saffrānum* 'id.'\n< **Arabic** زعفران *zaʿfarān* 'id.' (not connected with \\textit{ṣafrā'} feminine of \\textit{aṣfar} `yellow')\n"

# Website generation

## Build webpages

In [31]:
def webpage(key):
    
    '''
    This cell generates web pages from the datasets and writes them out to markdown files.
    '''

    # Get dataframe of current item
    print(f"Working on {key}...")
    df_local = df_items.loc[df_items['key'] == key]

    # Set a copy
    df_local = df_local.copy()

    # Item
    item = df_local['item'].iloc[0]

    # Reset index
    df_local.reset_index(drop=True, inplace=True)

    # Definition (if exists)
    if 'wn_definition' in df_local:
        if pd.notna(df_local['wn_definition'][0]):
            definition = df_local['wn_definition'][0]
            definition = definition[0].upper() + definition[1:]
            definition = "WordNet definition: " + definition + "." if definition[-1] != "." else definition
    else:
        definition = ""

    print("Definition:", definition)

    # Description
    if pd.notna(df_local['description'][0]):
        # Initialize description from data
        description = str(df_local['description'][0])
        # Capitalize first letter
        description = description[0].upper() + description[1:]
        # Initialize aka
        aka = ", also known as " + str(df_local['common_names'].iloc[0]) if pd.notna(df_local['common_names'].iloc[0]) else ""
        # Add aka to description
        if aka != "":
            description += aka
        # Add a dot to the end of aka if not already
        description = description + "." if description != "" and description[-1] != "." else description

        # Add related items if there are any
        if pd.notna(df_local['related'][0]):
            related = str(df_local['related'].iloc[0])
            # turn related into a list that is separated by semicolons
            related = related.split("; ")
            # drop an element off the list, if it does not appear in the list of items
            related = [x for x in related if x in list_of_items]
            # change every element of the list to a link that has the form [x](../items/x)
            related = [f'[{x}](../items/{x})' for x in related]
            # in every element of the list, change whitespace to underscore if in (brackets)
            related = [re.sub(r'(?<=\()(.*?)(?=\))', lambda x: re.sub(r'\s', '_', x.group(0)), x) for x in related]        
            # join the elements of the list with a comma and a space
            related = ", ".join(related)
            # add text
        else:
            related = ""
        
        if related != "":
            related = " It is related to " + related + "." if related[-1] != "." else related
            
        description += related
            
        # Add "see also" itesm if there are any
        if pd.notna(df_local['also'][0]):
            also = str(df_local['also'].iloc[0])
            # turn related into a list that is separated by semicolons
            also = also.split("; ")
            # drop an element off the list, if it does not appear in the list of items
            also = [x for x in also if x in list_of_items]
            # change every element of the list to a link that has the form [x](../items/x)
            also = [f'[{x}](../items/{x})' for x in also]
            # in every element of the list, change whitespace to underscore if in (brackets)
            also = [re.sub(r'(?<=\()(.*?)(?=\))', lambda x: re.sub(r'\s', '_', x.group(0)), x) for x in also]        
            # join the elements of the list with a comma and a space
            also = ", ".join(also)
        else:
            also = ""
        
        if also != "":
            also = " See also " + also + "." if also[-1] != "." else also
            
        description += also            
                
        # Add a dot to the end of if not already
        description = description + "." if description != "" and description[-1] != "." else description
        preamble_description = description
        page_description = description + "\n\n"
        
    else:
        description = ""
        preamble_description = ""
        page_description = ""

    print("Description:", description)
        
    # Add Wikipedia link if exists
    if pd.notna(df_local['wikipedia'][0]):
        page_description += ' [<i class="fab fa-wikipedia-w"></i>](' + df_local['wikipedia'][0] + ")"

    # Extract categories and tags and groups (which will be treated as tags)
    if pd.notna(df_local['category'][0]):
        category = df_local['category'][0]
        category_list = category.split("; ") if ";" in category else [f'{category}']
    else:
        category = ""
        category_list = []

    if pd.notna(df_local['tag'][0]):
        tag = df_local['tag'][0]
        tag_list = tag.split("; ") if ";" in tag else [f'{tag}']
    else:
        tag = ""
        tag_list = []
        
    if pd.notna(df_local['group'][0]):
        group = df_local['group'][0]
        group_list = group.split(";") if ";" in group else [f'{group}']
    else:
        group_list = []
        
    print("Categories:", category_list)
    print("Tags:", tag_list)
    print("Groups:", group_list)

    # Combine tags and groups
    tag_and_group_list = tag_list + group_list

    # Assemble preamble
    preamble = f'+++\ntitle = "{item.title()}"\nauthor = "Gabor Parti"\ndate = "{str(date.today())}"\ndescription = "{preamble_description}"\nweight = 10\n# draft = "true"\n# hidden = "true"\nplotly = true\ncategories = {str(category_list)}\ntags = {str(tag_and_group_list)}\nbibFile = "static/bibliography/parti.json"\n+++\n\n'

    ###########################
    ######## The Spice ########
    ###########################

    # Define taxon and family
    taxon_name = str(df_local['taxon_name'][0])
    if pd.notna(df_local['taxon_authors'][0]):
        taxon_authors = str(df_local['taxon_authors'][0])
    else:
        taxon_authors = ""
    taxon = f"*{taxon_name}* {taxon_authors}"

    family = str(df_local['family'][0])
    family = f"*{family}*"

    category = re.sub('; ', ' and ', category)
    if pd.notna(df_local['tag'][0]):
        tag = re.sub('; ', ' and ', tag)
    else:
        tag = ""

    part = str(df_local['part'][0])

    geographic_area = str(df_local['geographic_area'][0])
    lifeform_description = str(df_local['lifeform_description'][0])
    climate_description = str(df_local['climate_description'][0])

    # Intro
    kingdom = str(df_local['kingdom'][0])

    if kingdom == "Plantae":
        intro = item.upper() + f" is a {tag} {category}, valued and cultivated for its {part}. It is yielded from the plant {taxon}, a(n) {lifeform_description} in the {family} family, growing in the {climate_description} biome, with the native range of {geographic_area}.[^powo_{item}]\n\n[^powo_{item}]: [Plants of the World Online](https://powo.science.kew.org)\n\n"
        
    if kingdom == "Animalia":
        intro = item.upper() + f" is a {tag} {category}, valued for its {part}. It is obtained from an animal, {taxon}, from the {family} family, living in the {climate_description} biome, with a native range of {geographic_area}."

    # # if pd.notna(df_local['heat'][0]):
    # #     intro = intro + "It is used for its " + re.sub('; ', ' and ', part) + ", primarily for " + str(df_local['major uses'][0]) + ". Its aroma is described as " + str(df_local['taste/smell'][0]) + ", with a heat index of " + str(df_local['heat'][0]) + ".[^ucla_medicinal_2002]" + "\n\n" + "[^ucla_medicinal_2002]: Medicinal Spices Exhibit. (2002). UCLA Biomedical Library: History & Special Collections. https://unitproj.library.ucla.edu/biomed/spice/index.cfm?spicefilename=taste.txt&itemsuppress=yes&displayswitch=0\n\n"

    # Add values from df_local['petruzzello][0] if it's value is 'yes'.
    citations = ""
    if pd.notna(df_local['powo_url'][0]):
        citations += "powo"
    if pd.notna(df_local['petruzzello'][0]):
        citations += ";petruzzello_2023_list"
    if pd.notna(df_local['vanwyk'][0]):
        citations += ";vanwyk_2014_culinary"
    if pd.notna(df_local['dalby'][0]):
        citations += ";dalby_2000_dangerous"
    if pd.notna(df_local['hill'][0]):
        citations += ";hill_2004_contemporary"
    if pd.notna(df_local['anderson'][0]):
        citations += ";anderson_2023_history"
        
    if citations != "":
        sources = r' {{< cite "' + citations + r'" >}}'
    else:
        sources = ""
        
    intro += " See more in " + sources + "\n\n"

    print(intro)

    # Overview 
    overview_head = "## Overview\n\n"

    # Add taxon
    df_local.loc[:, 'taxon'] = taxon

    # Prepare overview tables
    df_overview = df_local[['taxon', 'family', 'regions', 'continents', 'part']]

    # Add cultivation if available
    if pd.notna(df_local['cultivation'][0]):
        # Add cultivation column to df_overview
        df_overview = df_overview.assign(cultivation=df_local['cultivation'])
        
    # Set database link(s)
    if pd.notna(df_local['powo_url'][0]):
        botanical_database = "[POWO](" + df_local['powo_url'] + ")"
    else:
        botanical_database = ""
    if pd.notna(df_local['gbif'][0]):
        botanical_database += "; [GBIF](" + df_local['gbif'][0] + ")"
    if pd.notna(df_local['trop'][0]):
        botanical_database += "; [TROP](" + df_local['trop'][0] + ")"
    if pd.notna(df_local['eol'][0]):
        botanical_database += "; [EOL](" + df_local['eol'][0] + ")"
    df_local.loc[:, 'botanical database'] = botanical_database

    if pd.notna(df_local.loc[:, 'botanical database'][0]):
        #add cultivation column to df_overview
        df_overview = df_overview.assign(botanical_database=df_local['botanical database'])
        
    df_overview

    # Transpose table
    df_overview = df_overview.T
    # Reset index
    df_overview.reset_index(inplace=True)
    # Rename columns
    df_overview.columns = ['item', item]
    # Prepare data to create markdown table
    data = df_overview.to_dict(orient='records')
    # Create markdown table
    overview_mdt = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # Table
    overview = overview_head + overview_mdt + "\n\n"

    print(overview)
    
    # Photo (todo, make list of img ext and source now they are string)
    # if df_local['img_count'][0] > 0:
    #     img_src = re.sub(',.*', '', df_local['img_source'][0])
    #     img_src = re.sub('_', ' ', img_src)
        
    #     photo = "![" + df_local['item'][0] + "](/images/photos/" + df_local['item'][0] + "-1-" + re.sub(',.*', '', df_local['img_source'][0]) + "." + re.sub(',.*', '', df_local['img_extension'][0]) + "?width=14rem&classes=shadow" + ' "Photo: ' + img_src + '")\n\n'
    # else:
    #     photo = ""
    
    
    
    # Illustration # &classes=shadow
    if pd.notna(df_local['ill_source'].iloc[0]):
        illustration_alt = "Illustration of " + df_local['taxon_name'][0] + " from " + df_local['ill_source'][0]
        illustration = "![" + df_local['taxon_name'][0] + '](/images/illustrations/' + key + '.png?width=40rem "' + illustration_alt + '")' + '\n'
        illustration_source = df_local['ill_source'][0] + r"{{< cite -" + str(df_local['ill_key'][0]) + r" >}} " + str(df_local['ill_page'][0]) + r"."
        illustration = illustration + "\n>Illustration of " + df_local['taxon_name'][0] + " from " + illustration_source
        if pd.notna(df_local['ill_link'].iloc[0]):
            illustration = illustration + " [{{% icon image %}}](" + df_local['ill_link'].iloc[0] + ")"
    else:
        illustration = ""
    illustration = illustration + "\n\n"
    
    # Top display gallery
    if len(list_files(f"../static/images/photos/{key}")) > 1:
        display_gallery = '{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '" hover-effect="slideup" caption-effect="fade" caption-position="none" />}}' + "\n\n"
    else:
        display_gallery = ""
        
    # Gallery on the bottom of the page
    if os.path.isdir(f"../static/images/photos/{key}/gallery"):
        gallery =  '## Gallery\n\n{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '/gallery" hover-effect="slideup" caption-effect="fade" />}}' + "\n\n"
    else:
        gallery = ""



    # Quick names (predefined)
    if pd.notna(df_local['English'][0]):
        en = "**English:** " + df_local['English'][0] + " · "
    else:
        en = ""
    if pd.notna(df_local['Hungarian'][0]):
        hu = "**Hungarian:** " + df_local['Hungarian'][0] + " · "
    else:
        hu = ""
    if pd.notna(df_local['Arabic'][0]):
        ar = '**Arabic:** <span class="arabic-text" dir="rtl">' + df_local['Arabic'][0] + '</span>' + " · "
    else:
        ar = ""
    if pd.notna(df_local['Hindi'][0]):
        hi = '**Hindi:** <span class="devanagari-text">' + df_local['Hindi'][0] + '</span>' + " · "
    else:
        hi = ""
    if pd.notna(df_local['Chinese'][0]):
        zh = '**Chinese:** <span class="traditional-chinese-text">' + df_local['Chinese'][0] + '</span>' + " · "
    else:
        zh = ""
    
    quick_names = en + hu + ar + hi + zh
    
    # Quick names (additional)
    if 'french' in df_local:
        if pd.notna(df_local['french'][0]):
            fr = "**French:** " + df_local['french'][0] + " · "
            quick_names += fr
        
    if 'italian' in df_local:
        if pd.notna(df_local['italian'][0]):
            it = "**Italian:** " + df_local['italian'][0] + " · "
            quick_names += it
    
    # Remove last dot from quick names
    quick_names = re.sub(" · $", "", quick_names)
    
    if pd.notna(df_local['wiktionary'][0]):
        wiktionary = ' · [<i class="fab fa-wikipedia-w"></i>](' + df_local['wiktionary'][0] + ")"
    else:
        wiktionary = ""
        
    quick_names += wiktionary
    
    # Center it
    quick_names = '<center>\n\n' + quick_names + '\n\n</center>\n\n'



    # Distribution
    if pd.notna(df_local['native'][0]):
        distribution = "## Distribution\n\n"
        distribution = distribution + r'{{< load-plotly >}}' + '\n\n' + r'{{< plotly json="/aromatica/plotly/distributions/' + key + r'.json" weight="600" height="300" >}}' + '\n\n'
        distribution = distribution + f">Native and introduced habitats of {df_local['taxon_name'][0]}[^powo]\n\n[^powo]: {df_local['powo_url'][0]}\n\n"

        # Check if 'native areas' is not empty and not 'NA' before adding to the string
        regions = ""
        if pd.notna(df_local['native'][0]) and df_local['native'][0] != 'NA':
            regions += "**Native areas:** &ensp; &ensp; &ensp; " + df_local['native'][0] + "\n\n"
        # Check if 'introduced areas' is not empty and not 'NA' before adding to the string
        if pd.notna(df_local['introduced'][0]) and df_local['introduced'][0] != 'NA':
            regions += "**Introduced areas:** " + df_local['introduced'][0] + "\n\n"
        distribution += '<p style="text-align:left;">\n\n' + regions + '</p>\n\n'
    else :
        distribution = ""


    
    ##############################
    ######## Nomenclature ########
    ##############################

    # Dataframe of current item 
    # df_names_local = df_names.loc[df_names['item'] == item]

    # Reset index
    # df_names_local.reset_index(drop=True, inplace=True)

    # Names
    # names_head = "***\n\n## Nomenclature\n\n"
    # Heads
    # names_head_en = "### English\n\n"
    # names_head_ar = "### Arabic\n\n"
    # names_head_zh = "### Chinese\n\n"
    
    # # Language by language
    # language = "English"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['term', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_en = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    # language = "Arabic"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['script', 'term', 'literal', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_ar = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    # language = "Chinese"
    # df = df_names_local.loc[df_names_local['language'] == language]
    # df = df[['script', 'term', 'literal', 'source human']]
    # df = df.rename(columns={'source human': 'source'})
    # data = df.to_dict(orient='records')
    # names_mdt_zh = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # names = names_head + names_head_en + names_mdt_en + "\n\n" + names_head_ar + names_mdt_ar + "\n\n" + names_head_zh + names_mdt_zh + "\n\n"
    names = ""



    ###########################
    ####### Etymologies #######
    ###########################

    # wordlist = df_local['etymologies'][0].split("; ")
    # etymologies = ""
    # for word in wordlist:
    #     etymologies += dictionary_of_etymologies[word] + "\n\n"
    # etymologies = "## Etymologies\n\n" + etymologies
    etymologies = "" 
    
    
    
    ###########################
    ####### Manuscripts #######
    ###########################
    
    # Manuscripts (if any)
    # if a file exists in the folder "content/items/manuscripts/" with the name of the item, then add it to the page
    if os.path.isfile(f"../content/items/manuscripts/{key}_ms.md"):
        manuscript = "***\n\n{{% include \"content/items/manuscripts/" + key + "_ms.md\" %}}\n\n"
    else:
        manuscript = ""

    ######## Assemble page ########
    page = preamble + page_description + display_gallery + quick_names + overview + intro + illustration + distribution + names + etymologies + gallery
    
    # Bibliography - if page contains "{{< cite", then add bibliography
    if "{{< cite" in page:
        bibliography = "# Bibliography\n\n{{< bibliography cited >}}\n\n"
    else:
        bibliography = ""
            
    # Write page to file
    with open(path_out_md + key + '_gen.md', 'w', encoding='utf-8') as f:
            f.write(page)
            
    # Write bibliography to file
    with open(path_out_md + key + '_bib.md', 'w', encoding='utf-8') as f:
            f.write(bibliography)
    return

# Loop through all spices
for key in list_of_keys:
    webpage(key)
print("Done.")

Working on allspice...
Definition: 
Description: The dried unripe berries of a small Caribbean tree, also known as allspice; Jamaica pepper; pimento; myrtle pepper; newspice.
Categories: ['spice']
Tags: ['culinary', 'medicinal', 'perfumery', 'distillery']
Groups: ['false peppers']
ALLSPICE is a culinary and medicinal and perfumery and distillery spice, valued and cultivated for its unripe fruit; leaf; wood. It is yielded from the plant *Pimenta dioica* (L.) Merr., a(n) tree in the *Myrtaceae* family, growing in the seasonally dry tropical biome, with the native range of S. Mexico to C. America, Caribbean.[^powo_allspice]

[^powo_allspice]: [Plants of the World Online](https://powo.science.kew.org)

 See more in  {{< cite "powo;petruzzello_2023_list;vanwyk_2014_culinary;dalby_2000_dangerous;hill_2004_contemporary;anderson_2023_history" >}}


## Overview

|       item       |                                                                                      allspice                    

## Merge autogenerated files with manuscripts and bibliographies

In [32]:
def assemble_page(key):
    '''
    This function merges generated web pages with hand written parts (manuscripts), and page bibliographies into a final markdown file to show on a web page. Gen and bib are generated above, manuscripts are not. E.g., allspice_gen.md + allspice_ms.md + allspice_bib.md = allspice.md.
    '''
    filepath = website_md + 'manuscripts/' + key + '_ms.md'
    if os.path.isfile(filepath) == True:
        # Read generated files
        with open(path_out_md + key + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Read manuscript files    
        with open(website_md + 'manuscripts/' + key + '_ms.md', 'r', encoding='utf-8') as f:
            manuscript = f.read()
        # Read bib files
        with open(path_out_md + key + '_bib.md', 'r', encoding='utf-8') as f:
            bibliography = f.read()
        # Assemble    
        page = generated + "\n\n***\n\n" + manuscript + "\n\n" + bibliography
        # Write out page file
        with open(website_md + key + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    else:
        # Read generated files
        with open(path_out_md + key + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Read bib files    
        with open(path_out_md + key + '_bib.md', 'r', encoding='utf-8') as f:
            bibliography = f.read()
        # Assemble    
        page = generated + "\n\n" + bibliography
        # Write out page file
        with open(website_md + key + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    return

# Loop through all spices
for key in list_of_keys:
    assemble_page(key)
print("Done.")

Done.


# Maps

In [33]:
df = df_items.copy()

# Fill NaNs in the Arabic column with "Not available"
df.fillna({'Arabic': 'Not available'}, inplace=True)
df.fillna({'ar': ''}, inplace=True)

df.fillna({'Chinese': 'Not available'}, inplace=True)
df.fillna({'zh': ''}, inplace=True)

### Map settings

In [34]:
# Visual variables for map (dark mode)
font_size = 12
font_color = "#dddddd"
font_family = "Noto Sans"
marker_symbol= 'circle'
marker_size = 12
max_marker_size = 32
edge_color = transparent
edge_size = 1
opacity = 0.75
line_width = 4
water = "#202020"
grid_color = "#282828"
land = "#303030"
lines = "#383838"
copyright_color = "#404040"
background_color = transparent
legend_background_color = quarter_transparent
color_scheme = prism

# # Visual variables for map (light mode)
# font_size = 12
# font_color = "#000000"
# font_family = "Noto Serif"
# marker_symbol= 'circle'
# marker_size = 12
# max_marker_size = 32
# edge_color = transparent
# edge_size = 1
# opacity = 0.75
# line_width = 4
# water = "#ffffff"
# grid_color = "#f7f7f7"
# land = "#ffffff"
# lines = "#777777"
# copyright_color = "#f7f7f7"
# background_color = transparent
# legend_background_color = quarter_transparent
# color_scheme = prism

# Orthographic globe layout
ortho_traces = dict(
    textposition = 'top right', # middle left, bottom center, etc.
    textfont = dict(size=font_size, color=font_color, family=font_family),
    hovertemplate=
        "<b>%{customdata[0]}</b><br><br>" +
        "Species: <i>%{customdata[1]}</i><br>" +
        "Family: <i>%{customdata[2]}</i><br>" +
        "Region of origin: %{customdata[3]}<br>" +
        "Arabic: %{customdata[4]} <i>%{customdata[5]}</i><br>" +
        "Chinese: %{customdata[6]} <i>%{customdata[7]}</i><br>" +
        # "Spreadability: %{customdata[7]:.2f}<br>" +
        "<extra></extra>",
    marker = dict(
        symbol = marker_symbol,
        size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size
        )
    )
)

ortho_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', # orthographic, natural earth
        projection_scale = 1,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

# "Document size" for pdfs
document_size = dict(width = 600, height=600)

# Copyright
cr = dict(
    name="copyright",
    text="© Gábor Parti, 2024",
    font=dict(color=copyright_color, size=font_size-4, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [cr]) # to call

## Info
info = dict(
    name="info",
    text="Click on a material to navigate to its corresponding page!",
    font=dict(color=font_color, size=font_size+4, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0.05,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [info]) # to call

# Adding layout images
logo = dict(
    source="https://github.com/partigabor/aromatica/blob/main/static/images/favicon.svg",
    sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", 
    yanchor="bottom", 
)
# fig.add_layout_image(logo) # to call

In [35]:
# Layout for cloropleth maps (regions and distributions)
dist_traces = dict(
    hovertemplate=
        "Region: <i>%{customdata[0]}</i><br>" +
        "Code: <i>%{customdata[1]}</i><br>" +
        "<extra></extra>",
)

dist_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', #'natural earth',
        projection_scale = 2,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(xanchor="left", yanchor="bottom", 
                # x=0.1, y=0.1, # for natural earth
                x=0, y=0, # for orthographic
                bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

In [36]:
# Define the map layout with the Marble basemap
marble = go.Layout(
    margin=dict(l=0, r=0, t=0, b=0),
    hovermode="closest",
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        }
      ]
    # mapbox=dict(
    #     center=dict(lat=0, lon=0),
    #     style="open-street-map", 
    #     zoom=1,
    # ),
)

## World map by plant family

In [37]:
# Set size
df['size'] = 1

# Create figure data
data = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    # text='item',
    color='family',
    color_discrete_sequence=color_scheme,
    size_max = max_marker_size,
    size = 'size',
    opacity = opacity,
    hover_name='item',
    hover_data={'item':True, 'taxon_name':True, 'family':True, 'geographic_area':True, 'Arabic':True, 'ar':True, 'Chinese':True, 'zh':True, 'lon':False, 'lat':False, 'size':False}, #'spreadability':':.2f', 
    labels={"group": "category"}
    )

# Save figure data
fig = data

###################################################
# Interactive visualization (HTML/JSON) for the web

# Call the orthographic traces and layout settings from above
fig.update_traces(ortho_traces)
fig.update_layout(ortho_layout)
# fig.update_layout(title_text = "Title")
# fig.update_layout(basemap_visible=True)

# Add copyrigth
fig.update_layout(annotations=[cr])

# Show figure
fig.show()

# Write interactive visualization (HTML/JSON) for the web
filename = "home"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

###################################################
# Image (PNG/PDF) for documents

# Call figure data
# fig = data

# Call the orthographic traces and layout settings from above
# fig.update_traces(ortho_traces)
# fig.update_layout(ortho_layout)
# fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(document_size)

# Show figure
# fig.show()

fig.write_image(path_out_png + filename + ".png", scale=3)
fig.write_image(path_out_pdf + filename + ".pdf", engine="kaleido")

In [38]:
# fig = px.scatter_mapbox(df, lat="lat", lon="lon", color="family", color_discrete_sequence=color_scheme, zoom=1)

fig = go.Figure()

fig.add_trace(go.Scattermapbox(
        lat=df.lat,
        lon=df.lon,
        mode='markers+text',
        marker=go.scattermapbox.Marker(
            size=12,
            color='#88ae43',
            opacity=0.9
        ),
        text=df.item,
        hoverinfo='text'
    ))

fig.update_layout(marble)

fig.show()

# Write interactive visualization (HTML/JSON) for the web
filename = "home_marble"
fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

# Write image
# fig.write_image(path_out_png + filename + ".png", scale=3)

## World map with links

In [42]:
# Set size
df['size'] = 1
# Size by spreadability
# df['spreadability'] = df['spreadability'].astype(float)
# df['spreadability'] = df['spreadability'].round(2)
# df['size'] = df['spreadability'] + 2
max_marker_size = 12

# Add links
df['url'] = "https://partigabor.github.io/aromatica/items/" + df['key']

# For the website, with hyperlinks
data = px.scatter_geo(df, 
    lat='lat', lon='lon',
    text='item',
    color="family",
    color_discrete_sequence=color_scheme,
    opacity = opacity,
    size="size",
    size_max=max_marker_size,
    hover_name="item", 
    hover_data={'item':True, 'taxon_name':True, 'family':True, 'regions':True, 'Arabic':True, 'ar':True, 'Chinese':True, 'zh':True, 'lon':False, 'lat':False, 'size':False, 'url':False},
    # labels={"group": "category"},
    )

fig = data

# # Update layout
# fig.update_layout(
#         # title = 'Title',
#         geo_scope='world',
#         template = 'plotly_dark',
#     )

fig.update_traces(ortho_traces)
fig.update_layout(ortho_layout)

# Update to black background
fig.update_layout(
    paper_bgcolor='#101010',
)

fig.update_layout(annotations=[info, cr])
# fig.add_layout_image(logo)

# Write
filename = "output/distribution_map"
fig.write_json(filename + ".json", validate=True, pretty=True)
fig.write_html(filename + ".html")
fig.write_image(filename + ".pdf", engine="kaleido")
fig.write_image(filename + ".png", scale=3)

fig.show()

######################################################
# Get HTML representation of plotly.js and this figure
plot_div = plot(fig, output_type='div', include_plotlyjs=True)

# Get id of html div element that looks like
# <div id="301d22ab-bfba-4621-8f5d-dc4fd855bb33" ... >
res = re.search('<div id="([^"]*)"', plot_div)
div_id = res.groups()[0]

# Build JavaScript callback for handling clicks 
# and opening the URL in the trace's customdata 
js_callback = """
<script>
var plot_element = document.getElementById("{div_id}");
plot_element.on('plotly_click', function(data){{
    console.log(data);
    var point = data.points[0];
    if (point) {{
        console.log(point.customdata[11]);
        window.open(point.customdata[11]);
    }}
}})
</script>
""".format(div_id=div_id)

# Build HTML string
html_str = """
<html>
<body>
{plot_div}
{js_callback}
</body>
</html>
""".format(plot_div=plot_div, js_callback=js_callback)

# Write html with hyperlinks
with open(filename + ".html", 'w', encoding='utf-8') as f:
    f.write(html_str)

# Read back html and write it as json. Seems to work but does not open links when embedded...
def html_to_json(html_file):
    with open(html_file + '.html') as f:
        html = f.read()
    call_arg_str = re.findall(r'Plotly\.newPlot\((.*)\)', html[-2**16:])[0]
    call_args = json.loads(f'[{call_arg_str}]')
    plotly_json = {'data': call_args[1], 'layout': call_args[2]}  
    figure=plotly.io.from_json(json.dumps(plotly_json))
    figure.write_json(filename + ".json", validate=True, pretty=True)
    return

# html_to_json('distribution_map')

In [43]:
# Move files to the website folder
move_dir(path_out_json, website_json, "*.json")

In [44]:
# # Basic map to show some countries, not very good

# import plotly.express as px
# import geopandas as gpd

# # Load the world map data
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Filter the data to include only India, Sri Lanka, and Maldives
# target_countries = ['India', 'Sri Lanka', 'Maldives', 'Wales']
# filtered_world = world[world['name'].isin(target_countries)]

# # Create a choropleth map
# fig = px.choropleth(
#     filtered_world,
#     locations='iso_a3',
#     color='name',
#     color_discrete_sequence = color_scheme,
#     projection='orthographic',
#     title='Highlighted Countries: India, Sri Lanka, Maldives'
# )

# # Call the orthographic traces and layout settings from above
# fig.update_layout(ortho_layout)
# # fig.update_layout(title_text = "Title")

# # Show the map
# fig.show()


In [46]:
df = df_items.copy()

key = "saffron"

## Distribution maps

In [47]:
def habitat_map(key):
    '''
    This function generates a map of the distributions of a spice.
    '''

    print("Drawing habitat map of", key)
    
    # Load the geographic data (shapefile or GeoJSON) # https://github.com/tdwg/wgsrpd
    gdf = gpd.read_file("data\\resources\\geo\\level3.geojson")

    # Rename columns
    gdf.columns = ['name', 'code', 'code_l2', 'code_l1', 'geometry']

    # Get local df and reset its index
    df_local = df_items.loc[df_items['key'] == key]
    df_local.reset_index(drop=True, inplace=True)

    # Get list of native areas
    native = df_local.loc[0, 'native'].split(', ')

    # Get list of introduced areas, if it is not NA
    if pd.notna(df_local.loc[0, 'introduced']):
        introduced = df_local.loc[0, 'introduced'].split(', ')
    else:
        introduced = []

    # Filter data for native areas from gdf dataframe's LEVEL3_NAM column
    filtered_data_native = gdf[gdf['name'].isin(native)].copy()  # Ensure a copy is made
    filtered_data_native.loc[:, 'region'] = 'native'

    # Filter data for introduced areas from gdf dataframe's LEVEL3_NAM column
    filtered_data_introduced = gdf[gdf['name'].isin(introduced)].copy()  # Ensure a copy is made
    if not filtered_data_introduced.empty:
        filtered_data_introduced.loc[:, 'region'] = 'introduced'

    # Concatenate the filtered dataframes
    filtered_data = pd.concat([filtered_data_native, filtered_data_introduced])

    # ***

    # Create the choropleth map
    fig = px.choropleth(
        filtered_data,
        geojson = filtered_data.geometry,
        locations = filtered_data.index,
        color = 'region',
        color_discrete_sequence = ['#88ae43', '#6943ae'],
        hover_data = {'name': True, 'code': True},
        projection = 'orthographic'
    )

    # Call layout
    fig.update_traces(dist_traces)
    fig.update_layout(dist_layout)

    # Adjust map bounds
    # fig.update_geos(fitbounds='locations')

    # Get centroid values for the native areas of the item
    for index, row in df_local.iterrows():
        native_distribution = row['native'].split(', ')
        # Filter data for native distribution from gdf dataframe's LEVEL3_NAM column
        native_data = gdf[gdf['name'].isin(native_distribution)].copy() 
        # Calculate centroid data
        native_centroid = native_data.to_crs('+proj=cea').centroid.to_crs(native_data.crs)
        native_centroid_lon = native_centroid.x.iloc[0]
        native_centroid_lat = native_centroid.y.iloc[0]
    # Amend projection rotation with the native areas' centroid values
    fig.update_layout(geo=dict(projection_rotation = {'lat': native_centroid_lat, 'lon': native_centroid_lon, 'roll': 0}))

    # Show the map
    # fig.show()

    # Save
    # fig.write_html(path_out_html + key + ".html")
    fig.write_json(path_out_json + key + ".json", validate=True, pretty=True)
    # fig.write_image(path_out_png + key + ".png", scale=3)
    return

In [48]:
# Loop through all spices
for key in list_of_keys:
    # If item belongs to kingdom Plantae, draw a map
    if df_items.loc[df_items['key'] == key, 'kingdom'].iloc[0] == 'Plantae':
        habitat_map(key)
print("Done.")

# Move files to the website folder
move_dir(path_out_json, website_json + "/distributions", "*.json")

Drawing habitat map of allspice
Drawing habitat map of anise
Drawing habitat map of asafoetida
Drawing habitat map of caraway
Drawing habitat map of cardamom
Drawing habitat map of cassia
Drawing habitat map of cinnamon
Drawing habitat map of clove
Drawing habitat map of coriander
Drawing habitat map of cubeb
Drawing habitat map of cumin
Drawing habitat map of fennel
Drawing habitat map of fenugreek
Drawing habitat map of ginger
Drawing habitat map of long_pepper
Drawing habitat map of mace
Drawing habitat map of nutmeg
Drawing habitat map of pepper
Drawing habitat map of saffron
Drawing habitat map of star_anise
Drawing habitat map of turmeric
Drawing habitat map of vanilla
Done.


# End

In [None]:
# ...measure time
end_time = datetime.now()
print("All done at " + str(end_time) + ".")
print('Duration: {}'.format(end_time - start_time))

All done at 2024-03-07 13:07:20.961370.
Duration: 0:00:48.576324


# Notes

In [None]:
# print(gpd.datasets.available)
# import geodatasets
# geodatasets.data

# with open("data\\resources\\geo\\level3.geojson", 'r') as f:
#     geojson_data = json.load(f)

# geojson_data