# Encyclopaedia Aromatica pipeline

# Libraries

In [10]:
# Import dependencies
import os
import pandas as pd
import numpy as np
import re

from collections import defaultdict
from datetime import date, datetime, timedelta
from py_markdown_table.markdown_table import markdown_table

# pip install babelnet

from plotly.io import write_image, write_json
import plotly.express as px
import plotly.graph_objs as go
import geopandas as gpd

from palette import *
from scripts.unsplash import *
from scripts.pexels import *

# Paths

In [11]:
# Input directory: the source spreadsheets are read from (and their CSV
# copies written to) this folder.
path_in = "data/"
# Output directories, named after the format they are meant to hold.
path_out_tex = "output/tex/"
path_out_md = "output/md/"
path_out_html = "output/html/"
path_out_json = "output/json/"
path_out_png = "output/png/"
path_out_pdf = "output/pdf/"
path_downloaded_photos = "output/photos/"

# Target folders one level above the working directory.
# NOTE(review): presumably the website's source tree — confirm.
website_md = "../content/items/"
website_json = "../static/plotly/"
website_photos = "../static/images/photos/"

In [12]:
# import requests
 
# def fetch_wikidata(params):
#     url = 'https://www.wikidata.org/w/api.php'
#     try:
#         return requests.get(url, params=params)
#     except:
#         return 'There was an error'

# # What text to search for
# query = 'Elettaria cardamomum'
 
# # Which parameters to use
# params = {
#         'action': 'wbsearchentities',
#         'format': 'json',
#         'search': query,
#         'language': 'en'
#     }
 
# # Fetch API
# data = fetch_wikidata(params)
 
# #show response as JSON
# data = data.json()
# data

# Functions

In [13]:
################################################################################
# List all files in a folder, including subfolders
def list_files(dir):
    """Return the paths of all files under ``dir``, including subfolders.

    The tree is traversed once with ``os.walk``. Each returned path is the
    containing directory, exactly as ``os.walk`` reports it, joined with the
    file name, so paths are relative or absolute in the same way ``dir`` is.

    Args:
        dir: Root folder to scan. The name shadows the builtin but is kept
            so existing keyword callers keep working.

    Returns:
        A list of file paths in ``os.walk`` (top-down) order. The list is
        empty when ``dir`` contains no files or does not exist.
    """
    # A single walk already yields every directory with its own file list;
    # re-walking each sub-directory is unnecessary.
    return [
        os.path.join(subdir, file)
        for subdir, _, files in os.walk(dir)
        for file in files
    ]

################################################################################
# Move or copy files between folders
import os, shutil, pathlib, fnmatch

def move_dir(src: str, dst: str, pattern: str = '*'):
    """Move the entries of ``src`` whose names match ``pattern`` into ``dst``.

    Args:
        src: Folder whose direct entries are considered (not recursive).
        dst: Destination folder; created, with parents, when missing.
        pattern: ``fnmatch``-style name pattern, all entries by default.
    """
    # exist_ok makes this a no-op when the destination is already there
    pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    matching = fnmatch.filter(os.listdir(src), pattern)
    for name in matching:
        source_path = os.path.join(src, name)
        target_path = os.path.join(dst, name)
        shutil.move(source_path, target_path)

def copy_dir(src: str, dst: str, pattern: str = '*'):
    """Copy the entries of ``src`` whose names match ``pattern`` into ``dst``.

    Args:
        src: Folder whose direct entries are considered (not recursive).
        dst: Destination folder; created, with parents, when missing.
        pattern: ``fnmatch``-style name pattern, all entries by default.
    """
    # exist_ok makes this a no-op when the destination is already there
    pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    matching = fnmatch.filter(os.listdir(src), pattern)
    for name in matching:
        source_path = os.path.join(src, name)
        target_path = os.path.join(dst, name)
        shutil.copy(source_path, target_path)

################################################################################
# Roman numerals from Arabic numerals
def roman(num: int) -> str:
    """Render a non-negative integer as a Roman numeral.

    Each decimal digit is converted separately using a (one, five, ten)
    triple of symbols for its place. Once the letters I..M are exhausted,
    the sequence V, X, L, C, D, M is reused with a combining macron
    (U+0304) appended per repetition, e.g. 4000 -> "M" + "V\u0304".
    ``roman(0)`` is the empty string.
    """
    base = "VXLCDM"
    digits = [int(ch) for ch in reversed(str(num))]

    # symbols[0] is "I"; symbols[2k], [2k+1], [2k+2] are the one/five/ten
    # signs for the k-th decimal place (units first).
    symbols = ["I"]
    for k in range(2 * len(digits)):
        cycle, pos = divmod(k, len(base))
        symbols.append(base[pos] + "\u0304" * cycle)

    def period(p: int, ten: str, five: str, one: str) -> str:
        # Subtractive forms first, then additive ones.
        if p == 9:
            return one + ten
        if p == 4:
            return one + five
        if p >= 5:
            return five + one * (p - 5)
        return one * p

    parts = []
    for place, digit in enumerate(digits):
        low = 2 * place
        parts.append(period(digit, symbols[low + 2], symbols[low + 1], symbols[low]))
    # parts run from units upward; the numeral is written highest place first
    parts.reverse()
    return "".join(parts)
def century(year):
    """Return ``year // 100 + 1``, i.e. the ordinal century of ``year``.

    NOTE(review): exact multiples of 100 map to the following century
    (``century(1900) == 20``) — confirm this is the intended convention.
    """
    hundreds, _ = divmod(year, 100)
    return hundreds + 1

# print(roman(17)) to call function

################################################################################
# Get coordinates for a place
from geopy.geocoders import Nominatim
# Module-level geocoder client shared by the coordinate helpers below.
# NOTE(review): "MyApp" is a generic user agent; Nominatim's usage policy asks
# for an application-specific identifier — confirm and replace if needed.
geolocator = Nominatim(user_agent="MyApp")

def coordinates(place):
    """Geocode ``place`` and return it as a ``[latitude, longitude]`` list.

    NOTE(review): a failed lookup is not handled here; geopy documents that
    ``geocode`` returns ``None`` when nothing is found, in which case the
    attribute access below raises ``AttributeError``.
    """
    hit = geolocator.geocode(place)
    return [hit.latitude, hit.longitude]
# print(coordinates("Hong Kong"))

# Generate geo-coordinates from location in a df
def generate_coordinates(df):
    """Geocode each row's ``location`` into ``lat_gen`` / ``lon_gen`` columns.

    Only rows that already have both ``lat`` and ``lon`` are geocoded.
    NOTE(review): that condition is kept from the original; confirm it is not
    meant to be the opposite (geocode rows whose coordinates are missing).

    Rows with a missing ``location``, or whose lookup returns no result, are
    skipped and keep NaN in the generated columns instead of aborting the run.

    Args:
        df: DataFrame with ``lat``, ``lon`` and ``location`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for index, row in df.iterrows():
        if pd.isna(row['lat']) or pd.isna(row['lon']):
            continue
        place = row['location']
        if pd.isna(place):
            continue
        location = geolocator.geocode(place)
        # geocode() yields None when the place cannot be resolved
        if location is None:
            continue
        df.at[index, 'lat_gen'] = location.latitude
        df.at[index, 'lon_gen'] = location.longitude
    return df

# Generate coordinates based on native regions
def generate_centroid_coordinates(df):
    """Fill ``lat_gen`` / ``lon_gen`` from the centroid of a native region.

    The comma-separated ``native regions`` of each row are looked up in the
    ``LEVEL3_NAM`` column of the WGSRPD level-3 GeoJSON
    (https://github.com/tdwg/wgsrpd). Rows with no regions, or with none that
    match, are skipped and keep NaN.

    NOTE(review): as in the original, only the centroid of the FIRST matching
    region is used, not a centroid of all native regions combined — confirm
    that this is intended.

    Args:
        df: DataFrame with a ``native regions`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Built with os.path.join so the path also resolves on non-Windows systems
    # (the literal used to be backslash-separated).
    gdf = gpd.read_file(os.path.join("data", "resources", "geo", "level3.geojson"))
    for index, row in df.iterrows():
        regions = row['native regions']
        if pd.isna(regions):
            continue
        native_distribution = regions.split(', ')
        # Filter data for native distribution from gdf dataframe's LEVEL3_NAM column
        native_data = gdf[gdf['LEVEL3_NAM'].isin(native_distribution)].copy()
        if native_data.empty:
            continue
        # Calculate centroid data
        native_centroid = native_data.centroid
        df.at[index, 'lat_gen'] = native_centroid.y.iloc[0]
        df.at[index, 'lon_gen'] = native_centroid.x.iloc[0]
    return df

################################################################################
# Convert Chinese Text to Simplified if needed
import opencc
# OpenCC converters loaded from the 't2s.json' and 's2t.json' configurations
# (Traditional -> Simplified and Simplified -> Traditional respectively).
tcsc = opencc.OpenCC('t2s.json')
sctc = opencc.OpenCC('s2t.json')
# Smoke test: convert the same word in both directions
print(tcsc.convert('錫蘭肉桂'), sctc.convert('锡兰肉桂'))

# ################################################################################
# Transcribe Chinese into pinyin or jyutping
import pinyin
import jyutping
# Smoke test: transcribe one sample word with each library
py = pinyin.get('錫蘭肉桂')
jp = jyutping.get('錫蘭肉桂')
print(py)
# jp is joined with spaces for display (one element per syllable, judging by
# the printed output)
print(' '.join(jp))

################################################################################
# Hex to RGBA
from PIL import ImageColor as ic
def hex_to_rgba(hex):
    """Turn a colour string into an opaque ``"(r, g, b, 1.0)"`` tuple string.

    The colour is parsed with ``PIL.ImageColor.getrgb`` and the alpha value
    ``1.0`` is appended inside the closing parenthesis of the tuple's text.

    Args:
        hex: Colour specification accepted by ``ImageColor.getrgb``, e.g.
            ``"#2e3440"``. The name shadows the builtin but is kept for
            compatibility with keyword callers.

    Returns:
        The string form of the colour tuple with ``, 1.0`` appended.
    """
    rgb = str(ic.getrgb(hex))
    # Plain str.replace instead of re.sub('\)', ...): that pattern was an
    # invalid escape sequence in a non-raw string (a warning on recent Pythons).
    return rgb.replace(')', ', 1.0)')

# hex_to_rgba(nord0)

# ################################################################################
# # Convert PDFs
# from pdf2image import convert_from_path

# def convert_pdf_to_png(file):
#     name = str(file)
#     name = re.sub(".*(?=/)", "", name)
#     name = re.sub("\..*", "", name)
#     pages = convert_from_path(file, 0)
#     for page in pages:
#         page.save(path + name + ".png", 'PNG')

################################################################################

# Regex cheatsheet

# (?!) - negative lookahead
# (?=) - positive lookahead
# (?<=) - positive lookbehind
# (?<!) - negative lookbehind

# (?>) - atomic group

# ################################################################################
# Wordnets using the Open Multilingual WordNet (https://omwn.org/omw1.html) # 100%: cmn, fin, hrv
# wn_langs = ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'slv', 'spa', 'swe', 'tha', 'zsm'] # 100%: cmn, fin, hrv

import nltk
from nltk.corpus import wordnet as wn
# nltk.download("wordnet")
# nltk.download("omw-1.4")
# nltk.download("extended_omw") # if you want the wiktionary data

# Language codes for the WordNet-based translation step; only French is active,
# the others are commented out.
wn_langs = ['fra'] # 'eng', 'arb', 'cmn', 
# Smoke tests: definition of one synset, all noun synsets of a word, and the
# Italian lemma names of a synset (requires the nltk corpora listed above).
print(wn.synset('allspice.n.03').definition())
print(wn.synsets('allspice', pos='n'))
print(wn.synset('allspice.n.03').lemma_names('ita'))

def wn_define(df):
    """Add a ``wn_def`` column holding the WordNet definition of each row.

    Rows whose ``wn`` cell (a synset name) is missing are left untouched.
    The DataFrame is modified in place and also returned.
    """
    for index, row in df.iterrows():
        synset_name = row['wn']
        if pd.isna(synset_name):
            continue
        df.at[index, "wn_def"] = wn.synset(synset_name).definition()
    return df

################################################################################
# Translations

def wn_translate(df, lan):
    """Add a ``translated_wn_<lan>`` column with WordNet lemma names.

    For each row with a ``wn`` synset name, the synset's lemma names in
    language ``lan`` are joined with ", " and underscores are shown as
    spaces. Rows without a synset name are left untouched. The DataFrame is
    modified in place and also returned.
    """
    for index, row in df.iterrows():
        synset_name = row['wn']
        if pd.isna(synset_name):
            continue
        lemmas = wn.synset(synset_name).lemma_names(lan)
        joined = ", ".join(map(str, lemmas))
        df.at[index, f"translated_wn_{lan}"] = joined.replace("_", " ")
    return df

################################################################################
# Machine translation via the deep_translator package; any of its backends can be used (this example uses GoogleTranslator).
# Alternatives such as ChatGptTranslator, MicrosoftTranslator or DeeplTranslator need an API key.
# https://developers.google.com/admin-sdk/directory/v1/languages
# dl_languages = {'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'assamese': 'as', 'aymara': 'ay', 'azerbaijani': 'az', 'bambara': 'bm', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bhojpuri': 'bho', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-CN', 'chinese (traditional)': 'zh-TW', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dhivehi': 'dv', 'dogri': 'doi', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'ewe': 'ee', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'guarani': 'gn', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'iw', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'ilocano': 'ilo', 'indonesian': 'id', 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'jw', 'kannada': 'kn', 'kazakh': 'kk', 'khmer': 'km', 'kinyarwanda': 'rw', 'konkani': 'gom', 'korean': 'ko', 'krio': 'kri', 'kurdish (kurmanji)': 'ku', 'kurdish (sorani)': 'ckb', 'kyrgyz': 'ky', 'lao': 'lo', 'latin': 'la', 'latvian': 'lv', 'lingala': 'ln', 'lithuanian': 'lt', 'luganda': 'lg', 'luxembourgish': 'lb', 'macedonian': 'mk', 'maithili': 'mai', 'malagasy': 'mg', 'malay': 'ms', 'malayalam': 'ml', 'maltese': 'mt', 'maori': 'mi', 'marathi': 'mr', 'meiteilon (manipuri)': 'mni-Mtei', 'mizo': 'lus', 'mongolian': 'mn', 'myanmar': 'my', 'nepali': 'ne', 'norwegian': 'no', 'odia (oriya)': 'or', 'oromo': 'om', 'pashto': 'ps', 'persian': 'fa', 'polish': 'pl', 'portuguese': 'pt', 'punjabi': 'pa', 'quechua': 'qu', 'romanian': 'ro', 'russian': 'ru', 'samoan': 'sm', 'sanskrit': 'sa', 'scots gaelic': 'gd', 'sepedi': 'nso', 'serbian': 'sr', 'sesotho': 'st', 'shona': 'sn', 'sindhi': 'sd', 'sinhala': 'si', 'slovak': 'sk', 'slovenian': 'sl', 'somali': 'so', 'spanish': 'es', 
'sundanese': 'su', 'swahili': 'sw', 'swedish': 'sv', 'tajik': 'tg', 'tamil': 'ta', 'tatar': 'tt', 'telugu': 'te', 'thai': 'th', 'tigrinya': 'ti', 'tsonga': 'ts', 'turkish': 'tr', 'turkmen': 'tk', 'twi': 'ak', 'ukrainian': 'uk', 'urdu': 'ur', 'uyghur': 'ug', 'uzbek': 'uz', 'vietnamese': 'vi', 'welsh': 'cy', 'xhosa': 'xh', 'yiddish': 'yi', 'yoruba': 'yo', 'zulu': 'zu'}

# from deep_translator import GoogleTranslator as dl
# translated = dl(source='en', target='hu').translate("allspice") # api_key=openai
# print(translated)

# dl_languages = {'french': 'fr'} # 'hungarian': 'hu', 'english': 'en', 'arabic': 'ar', 'chinese': 'zh-TW',
# dl_language_list = list(dl_languages.values())

# def translate(df, lang):
#     for index, row in df.iterrows():
#         if pd.notna(row['English']):
#             translated = dl(source='en', target=lang).translate(row['English'])
#             df.at[index, f"translated_dl_{lang}"] = translated
#     return df

锡兰肉桂 錫蘭肉桂
xílánròugùi
sek3 laan4 juk6 gwai3
ground dried berrylike fruit of a West Indian allspice tree; suggesting combined flavors of cinnamon and nutmeg and cloves
[Synset('allspice.n.01'), Synset('allspice.n.02'), Synset('allspice.n.03')]
['pepe_della_Giamaica', 'pimento']


In [14]:
# Start timer: record when the pipeline run began (naive local time)
start_time = datetime.now()

# Data

## Spices

In [15]:
# Read the spice spreadsheet from the input folder
df = pd.read_excel(path_in+"spices.xlsx")

# Write a CSV copy next to it (no index column, header row kept)
df.to_csv (path_in+"spices.csv", index = None, header=True)

# Reload the dataset of spices from that CSV copy as a dataframe
df = pd.read_csv(path_in+'spices.csv', header=[0], delimiter=',', encoding="utf-8")

# Keep only the rows flagged "in" in the 'include' column
df = df.loc[(df['include'] == "in")]

# Info: sorted list of item names and their number
list_of_items = df['item'].tolist()
list_of_items.sort()
print(len(list_of_items), "spices in total.")
print(list_of_items)

# Add keys based on item: lowercase, spaces replaced with underscores
df['key'] = df['item'].str.lower().str.replace(" ", "_")
list_of_keys = df['key'].tolist()
print(list_of_keys)

# Add links
# NOTE(review): unlike 'key', the URL keeps the item's original capitalisation
# (e.g. "Sichuan pepper" -> ".../Sichuan_pepper") — confirm this matches the site.
df['url'] = "https://partigabor.github.io/aromatica/items/" + df['item'].str.replace(" ", "_")

# Add counts of distribution: number of commas + 1 in the comma-separated lists
# (a missing list stays NaN rather than becoming 0)
df['no. of native regions'] = df['native regions'].str.count(',') + 1
df['no. of introduced regions'] = df['introduced regions'].str.count(',') + 1

# Location coordinates
# generate_coordinates(df) # Generate geo-coordinates from location
# generate_centroid_coordinates(df) # Generate geo-coordinates by finding the centroid of the native regions

# Get a definition from wordnet
# wn_define(df)

# # Translate the names to other languages using OMW
# for lang in wn_langs:
#     wn_translate(df, lang)

# # Machine translate the names to other languages
# for lang in dl_language_list:
#     translate(df, lang)

# # Check
# df_translations = df.filter(regex='translated')
# df_translations

# Keep an independent copy as df_items ('df' is reused by the following cells)
df_items = df.copy()
df.head()


24 spices in total.
['Sichuan pepper', 'allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'long pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star anise', 'turmeric', 'vanilla']
['allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill', 'fennel', 'fenugreek', 'ginger', 'long_pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'sichuan_pepper', 'star_anise', 'turmeric', 'vanilla']


Unnamed: 0,include,values,id,1,2,3,4,5,6,group,...,Hi translit,Hi literal,Hi alt,Indonesian,Malay,Persian,key,url,no. of native regions,no. of introduced regions
0,in,81,S,M,P,D,,1.0,1.0,,...,gandhadravya?,,,,,,allspice,https://partigabor.github.io/aromatica/items/a...,13,11.0
1,in,72,S,A,P,A,,1.0,1.0,,...,moti saunf,fat fennel,,adas manis,,بادیان رومی، انیسون,anise,https://partigabor.github.io/aromatica/items/a...,4,42.0
2,in,72,S,A,F,F,,1.0,1.0,,...,hīng,,,,,,asafoetida,https://partigabor.github.io/aromatica/items/a...,8,
3,in,61,S,A,C,C,,1.0,1.0,,...,,,,,jintan,,caraway,https://partigabor.github.io/aromatica/items/c...,67,57.0
4,in,82,S,Z,E,C,,1.0,1.0,cardamoms,...,,,,,,,cardamom,https://partigabor.github.io/aromatica/items/c...,2,7.0


In [16]:
# Subsetting categories (spices, herbs, incense)
# df_spices = df.loc[(df['id'] == "S")]
# print(df_spices.shape[0])
# df_herbs = df.loc[(df['id'] == "H")]
# print(df_herbs.shape[0])
# df_incense = df.loc[(df['id'] == "I")]
# print(df_incense.shape[0])

## Images

In [17]:
# # Download images from Unsplash and pexels to be curated later
# item = "black pepper"
# dashed_key = re.sub(" ", "-", item)
# unsplash_downloader(dashed_key, path_downloaded_photos)
# pexels_downloader(item, path_downloaded_photos)

# # # Move images to the right folder
# # move_dir(path_downloaded_photos + dashed_key, website_photos + dashed_key, "*.jpg")

In [18]:
# Folder holding the curated photos, relative to the working directory
folder_path = '../static/images/photos'

# Work on the keys in alphabetical order (sorts the shared list in place)
list_of_keys.sort()

# Per-key record of the photos found: how many, plus the extension, file name
# and source of each one
item_info = {
    key: {'count': 0, 'extensions': [], 'file_names': [], 'sources': []}
    for key in list_of_keys
}

# Scan the files directly inside the folder (sub-folders are ignored)
for filename in os.listdir(folder_path):
    if not os.path.isfile(os.path.join(folder_path, filename)):
        continue
    # The text before the first '-' is the item key, the third '-'-separated
    # field is the source
    parts = filename.split('-')
    item_name = parts[0]
    if item_name not in item_info:
        continue
    # Cut the source at its first '.', which drops the file extension
    source = parts[2].split('.')[0] if len(parts) > 2 else ""
    info = item_info[item_name]
    info['count'] += 1
    info['extensions'].append(filename.split('.')[-1])
    info['file_names'].append(filename)
    info['sources'].append(source)

# One row per key. Every key is pre-seeded in item_info, so keys without photos
# already appear with a count of 0 and empty strings; no back-filling needed.
df = pd.DataFrame({
    'key': list(item_info),
    'img_count': [info['count'] for info in item_info.values()],
    'img_source': [', '.join(info['sources']) for info in item_info.values()],
    'img_extension': [', '.join(info['extensions']) for info in item_info.values()],
})

# Sort by key and renumber the rows
df = df.sort_values(by='key').reset_index(drop=True)

# Save
df_images = df.copy()

# Display the final DataFrame
print(df)

# Merge the image summary into the items. Image columns left over from an
# earlier run of this cell are dropped first, so re-running it does not
# produce duplicated *_x / *_y columns.
image_columns = [column for column in df_images.columns if column != 'key']
df_items = df_items.drop(columns=image_columns, errors='ignore')
df_items = pd.merge(df_items, df_images, on='key', how='left')

               key  img_count  \
0         allspice          0   
1            anise          0   
2       asafoetida          0   
3          caraway          0   
4         cardamom          0   
5           cassia          0   
6            chile          0   
7         cinnamon          0   
8            clove          0   
9        coriander          0   
10           cumin          1   
11            dill          0   
12          fennel          0   
13       fenugreek          0   
14          ginger          0   
15     long_pepper          0   
16            mace          0   
17          nutmeg          0   
18          pepper          6   
19         saffron          0   
20  sichuan_pepper          0   
21      star_anise          0   
22        turmeric          1   
23         vanilla          0   

                                           img_source  \
0                                                       
1                                                       
2  

## Names

In [19]:
# Read the names spreadsheet from the input folder
df = pd.read_excel(path_in+"names.xlsx")

# Write a CSV copy next to it (no index column, header row kept)
df.to_csv (path_in+"names.csv", index = None, header=True)

# Reload the dataset of names from that CSV copy
df = pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

# Keep only the rows flagged 'yes' in the 'include' column
# (the spices sheet uses the flag "in" instead)
df = df.loc[df['include'] == 'yes'] # keep flagged rows only

In [20]:
# Change NaN to empty string.
# The result is rebound rather than filled with inplace=True: filling float
# columns of the filtered frame in place with '' is what triggered the
# "Setting an item of incompatible dtype" warning.
df = df.fillna('')

# Info
print(df.shape[0], "names in total.")

# Keep an independent copy as df_names ('df' is reused by the following cells)
df_names = df.copy()

360 names in total.



Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



## Etymologies

In [21]:
# Read the etymologies spreadsheet from the input folder
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write a CSV copy next to it (no index column, header row kept)
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)

# Reload the dataset from that CSV copy
df_etymologies = pd.read_csv(path_in+'etymologies.csv', header=[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found. The frame is sliced by
# position rather than passed to np.split, which is deprecated for DataFrames
# (it is the source of the 'DataFrame.swapaxes' warning). read_csv yields a
# default 0..n-1 index, so the labels of the empty rows are also positions.
blank_rows = df_etymologies.index[df_etymologies.isna().all(axis=1)].tolist()
bounds = [0] + blank_rows + [len(df_etymologies)]
df_list_with_na = [
    df_etymologies.iloc[start:stop] for start, stop in zip(bounds, bounds[1:])
]

# Drop the separator rows and renumber each chunk; chunks that end up empty
# (e.g. between two consecutive blank rows) are skipped.
df_list = []
for df in df_list_with_na:
    df = df.dropna(how='all').reset_index(drop=True)
    if not df.empty:
        df_list.append(df)

# The 'item' value in the first row of each chunk identifies the word
list_of_etymologies = [str(chunk['item'].iloc[0]) for chunk in df_list]

# Print the number of IDs and what they are
length = len(df_list)
print(length, "words in total")
print(list_of_etymologies)

# Create a defaultdict of spice-word etymologies: word -> its dataframe
# (a word that is looked up but missing yields an empty list)
etymologies = defaultdict(list)
for word, chunk in zip(list_of_etymologies, df_list):
    etymologies[word] = chunk

# Testing
# print(etymologies['saffron'])

# Testing
# print(etymologies['saffron'])

84 words in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']



'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



## Etymology box for LaTeX

In [22]:
# key = "tester"

# ################################################################################

# # The following code will create an etymology box environment for the key, to be used in LaTeX
# print("Started the generation of '" + key + "' as etymbox...")

# df_local = etymologies[key]
# # df_local.fillna('', inplace=True)

# # # Skipping those marked
# df_local = df_local[df_local['boxskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# # # Replace empty cells with NaNs
# # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

# # Create content and sources stage by stage
# content = ""
# source = ""
# sources = ""
# source_list = []
# nl = "\n"

# for index, row in df_local.iterrows():
#   stage = "< "
#   if pd.notna(row['complex']): # complex relationships
#     row['complex'] = re.sub("and from", "+", row['complex'])
#     stage += row['complex'] + " "
#   if pd.notna(row['language']): # language
#     stage += "\\textbf{" + row['language'] + "} "
#   if pd.notna(row['script']): # script
#     script = "{" + row['script'] + "} "
#     if row['language'] == 'Chinese':
#       script = "\\tc{" + row['script'] + "} "
#     stage += script
#   if pd.notna(row['term']): # term
#     stage += "\\textit{" + row['term'] + "} "
#   if pd.notna(row['IPA']): # IPA
#     stage += row['IPA'] + " "
#   if pd.notna(row['meaning']): # meaning
#     stage += "`" + row['meaning'] + "' "
#   if pd.notna(row['literal']): # literal meaning
#     stage += "[" + row['literal'] + "] "
#   stage = re.sub(' $', '', stage)
#   stage += ", "
#   if pd.notna(row['explanation']): # explanation
#     stage += row['explanation'] + " "
#   if pd.notna(row['remark']): # remark
#     stage += "(" + row['remark'] + ") "
#   stage = re.sub(',? ?$', '', stage)

#   if pd.notna(row['date']): # dates
#     stage += ", "
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#     stage += date
#     stage = re.sub(',? ?$', '', stage)
#   if pd.notna(row['cognates']): # cognates
#     stage += "; cf. cognates " + row['cognates'] + " "
#   if pd.notna(row['derivates']): # cognates
#     if pd.notna(row['cognates']):
#       stage = re.sub(' $', '', stage)
#       stage += "; " + row['derivates'] + " "
#     else:
#       stage = re.sub(' $', '', stage)
#       stage += "; cf. " + row['derivates'] + " "
#   stage = re.sub(',? ?$', '', stage)
#   # stage = re.sub('cf\..*?(cf\.)', '', stage)

# # Final touches
#   if row['doubt'] == 'yes':
#     stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#   if row['complex'] == '+':
#     stage = re.sub('<', '', stage)
#   if row['complex'] == 'or from':
#     stage = re.sub('<', '', stage)
#   content += stage + nl

# # Sources
#   source=""
#   if pd.notna(row['source zotero']):
#     source = row['source zotero']
#     print("1",source)
#     if '{' in source:
#       source = "s" + row['source zotero'].lower()
#       print(source)
#     else:
#       source = "{" + row['source zotero'].lower() + "}"
#       print(source)
#     if pd.notna(row['source page']):
#       source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       print("4",source)
#       if row['source page'].isalpha() == True:
#         source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         print("5",source)
#     source = "\\textcite" + source
#   print(source)
#   source_list.append(source)

# # clear duplicates from sources:
# print("SL1: ", source_list)
# # source_set = sorted(set(source_list), key=source_list.index)
# source_set = set(source_list)
# print("SS2: ", source_set)
# source_list2 = list(source_set)
# print("S3: ", source_list2)
# sources_unduplicated = '; '.join(source_list2)
# print("S4: ", sources_unduplicated)
# # test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i) # this method catches the first duplicate entries, and appends them to the list

# # The next stage is to print the duplicate entries, and the unique entries
# print("List of duplicates", duplist)
# print("Unique Item List", newlist) 
# if len(duplist) > 0:
#   # print("UNDUPL")
#   sources = sources_unduplicated
# else:
#   # print("ORI")
#   sources =  '; '.join(source_list)
# # print("S5: ", sources)

# sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\\footnote{" + sources + "}\n"

# content = re.sub("\n$", "", content)
# content = re.sub(r"^< ", "", content) # delete the first <
# content = re.sub(r"\n,", ",", content)
# content = re.sub(r" nan ", " ", content)
# content = re.sub("(<\.\n?)+$", "", content)

# content += sources

# env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
# env_end = r"\end{etymology}"

# box = env_begin + content + env_end
# box = re.sub(r"\u200e", "", box) #removes right to left mark

# # Save the spicebox as a standalone tex file

# filename = re.sub(" ", "_", key)
# filename = filename.lower()
# f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')
# f.write(box)
# f.close()
# print("Etymology-box '" + str(key) + "' as a tex file was created.")
# box


In [23]:
# def etymbox(key):
  
#   # The following code will create a etymology box environment for the key, to be used in LaTeX
#   print("Started the generation of '" + key + "' as etymbox...")

#   df_local = etymologies[key]
#   # df_local.fillna('', inplace=True)

#   # # Skipping those marked
#   df_local = df_local[df_local['boxskip'] != 'yes']
#   df_local.reset_index(inplace=True, drop=True)

#   # # Replace empty cells with NaNs
#   # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
#   # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

#   # Create content and sources stage by stage
#   content = ""
#   source = ""
#   sources = ""
#   source_list = []
#   nl = "\n"

#   for index, row in df_local.iterrows():
#     stage = "< "
#     if pd.notna(row['complex']): # complex relationships
#       row['complex'] = re.sub("and from", "+", row['complex'])
#       stage += row['complex'] + " "
#     if pd.notna(row['language']): # language
#       stage += "\\textbf{" + row['language'] + "} "
#     if pd.notna(row['script']): # script
#       script = "{" + row['script'] + "} "
#       if row['language'] == 'Chinese':
#         script = "\\tc{" + row['script'] + "} "
#       stage += script
#     if pd.notna(row['term']): # term
#       stage += "\\textit{" + row['term'] + "} "
#     if pd.notna(row['IPA']): # IPA
#       stage += row['IPA'] + " "
#     if pd.notna(row['meaning']): # meaning
#       stage += "`" + row['meaning'] + "' "
#     if pd.notna(row['literal']): # literal meaning
#       stage += "[" + row['literal'] + "] "
#     stage = re.sub(' $', '', stage)
#     stage += ", "
#     if pd.notna(row['explanation']): # explanation
#       stage += row['explanation'] + " "
#     if pd.notna(row['remark']): # remark
#       stage += "(" + row['remark'] + ") "
#     stage = re.sub(',? ?$', '', stage)

#     if pd.notna(row['date']): # dates
#       stage += ", "
#       row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#       row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#       if re.match('^-\d\d?$', row['date']): # if is a century BC
#         row['date'] = re.sub("-", "", row['date'])
#         date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#       elif re.match('^\d\d?$', row['date']): # if is a century AD
#         date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#       else:
#         date = row['date'] + " " # if it's a year
#       stage += date
#       stage = re.sub(',? ?$', '', stage)
#     if pd.notna(row['cognates']): # cognates
#       stage += "; cf. cognates " + row['cognates'] + " "
#     if pd.notna(row['derivates']): # cognates
#       if pd.notna(row['cognates']):
#         stage = re.sub(' $', '', stage)
#         stage += "; " + row['derivates'] + " "
#       else:
#         stage = re.sub(' $', '', stage)
#         stage += "; cf. " + row['derivates'] + " "
#     stage = re.sub(',? ?$', '', stage)
#     # stage = re.sub('cf\..*?(cf\.)', '', stage)

#   # Final touches
#     if row['doubt'] == 'yes':
#       stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#     if row['complex'] == '+':
#       stage = re.sub('<', '', stage)
#     if row['complex'] == 'or from':
#       stage = re.sub('<', '', stage)
#     content += stage + nl

#   # Sources
#     source=""
#     if pd.notna(row['source zotero']):
#       source = row['source zotero']
#       # print(source)
#       if '{' in source:
#         source = "s" + row['source zotero'].lower()
#         # print(source)
#       else:
#         source = "{" + row['source zotero'].lower() + "}"
#         # print(source)
#       if pd.notna(row['source page']):
#         source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         # print(source)
#         if row['source page'].isalpha() == True:
#           source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#           # print(source)
#       source = "\\textcite" + source
#     # print(source)
#     source_list.append(source)

#   # clear duplicates from sources:
#   # print("SL1: ", source_list)
#   # source_set = sorted(set(source_list), key=source_list.index)
#   source_set = set(source_list)
#   # print("SS2: ", source_set)
#   source_list2 = list(source_set)
#   # print("S3: ", source_list2)
#   sources_unduplicated = '; '.join(source_list2)
#   # print("S4: ", sources_unduplicated)
#   # test for duplicates
#   newlist = [] # empty list to hold unique elements from the list
#   duplist = [] # empty list to hold the duplicate elements from the list
#   for i in source_list:
#       if i not in newlist:
#           newlist.append(i)
#       else:
#           duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
#   # The next stage is to print the duplicate entries, and the unique entries
#   # print("List of duplicates", duplist)
#   # print("Unique Item List", newlist) 
#   if len(duplist) > 0:
#     # print("UNDUPL")
#     sources = sources_unduplicated
#   else:
#     # print("ORI")
#     sources =  '; '.join(source_list)
#   # print("S5: ", sources)

#   sources =  '; '.join(source_list)

#   # Cleaning
#   sources = re.sub("; $", "", sources)
#   sources = re.sub("^; ", "", sources)
#   sources = re.sub("(; )+", "; ", sources)
#   sources = "\\footnote{" + sources + "}\n"

#   content = re.sub("\n$", "", content)
#   content = re.sub(r"^< ", "", content) # delete the first <
#   content = re.sub(r"\n,", ",", content)
#   content = re.sub(r" nan ", " ", content)
#   content = re.sub("(<\.\n?)+$", "", content)

#   content += sources

#   env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['item'].iloc[0] + "}" + nl 
#   env_end = r"\end{etymology}"

#   box = env_begin + content + env_end
#   box = re.sub(r"\u200e", "", box) #removes right to left mark

#   # Save the spicebox as a standalone tex file
#   filename = re.sub(" ", "_", key)
#   filename = filename.lower()
#   f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')  
#   f.write(box)
#   f.close()
#   print("Etymology-box '" + str(key) + "' as a tex file was created.")

#   return box

# etymbox("tester")

## Etymology box for Markdown

In [24]:
def _etym_cell_text(value):
    """Return a cell value as stripped text, dropping the '.0' of whole-number floats."""
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def _etym_century_label(century):
    """Turn a century code into display text; return None if the code is not understood.

    A code is one or two digits, prefixed with '-' for BC and optionally
    suffixed with '?' when the dating is uncertain, e.g. '12' -> 'AD 12 c.',
    '12?' -> 'AD 12 c.?', '-9' -> '9 c. BC', '-9?' -> '9 c. BC?'.
    """
    match = re.fullmatch(r"(-?)(\d\d?)(\??)", _etym_cell_text(century))
    if match is None:
        return None
    sign, number, doubt = match.groups()
    if sign:
        return number + " c. BC" + doubt  # ARAB NUMERALS
    return "AD " + number + " c." + doubt  # ARAB NUMERALS


def etymbox(key):
    """Build the Markdown etymology box of one word.

    Reads the dataframe stored under ``key`` in the global ``etymologies``
    mapping; every row is one stage of the etymology. Rows whose 'skip' cell
    is 'yes' are ignored. The columns read are: skip, complex, language,
    script, term, IPA, meaning, literal, explanation, remark, date, century,
    cognates, derivates, doubt, source and 'source page'.

    Each stage becomes one line; every stage after the first is introduced by
    '<' (or by an escaped '<?' when its 'doubt' cell is 'yes'; the '<' is
    dropped when the stage has a 'complex' relation). A single Hugo ``cite``
    shortcode listing all sources and their pages (';'-separated, in row
    order, duplicates kept so that keys and pages stay aligned) closes the box.

    Parameters:
        key: the word whose etymology box is generated.

    Returns:
        The Markdown text of the box.

    Raises:
        KeyError: if ``key`` is not in ``etymologies`` or a column is missing.
    """
    print("Started the generation of '" + key + "' as etymbox...")

    # Select the word and drop the stages marked to be skipped
    df_local = etymologies[key]
    df_local = df_local[df_local['skip'] != 'yes'].reset_index(drop=True)

    lines = []
    source_keys = []
    source_pages = []

    # Iterate through the etymology stage by stage (= row by row)
    for index, row in df_local.iterrows():
        # Every stage but the first starts with the derivation marker
        stage = "" if index == 0 else "< "
        # Complex relations (e.g., partly, and, or)
        if pd.notna(row['complex']):
            stage += row['complex'] + " "
        # Language (in bold)
        if pd.notna(row['language']):
            stage += "**" + row['language'] + "** "
        # The term in native script
        if pd.notna(row['script']):
            stage += row['script'] + " "
        # The transcribed term (in italics)
        if pd.notna(row['term']):
            stage += "*" + row['term'] + "* "
        # /IPA/
        if pd.notna(row['IPA']):
            stage += "/" + row['IPA'] + "/ "
        # 'meaning', gloss
        if pd.notna(row['meaning']):
            stage += "'" + row['meaning'] + "' "
        # [literal meaning]
        if pd.notna(row['literal']):
            stage += "[" + row['literal'] + "] "
        # Clear ending
        stage = re.sub(r" $", "", stage)
        # Explanation
        if pd.notna(row['explanation']):
            stage += ", " + row['explanation'] + " "
        # (remark)
        if pd.notna(row['remark']):
            stage += " (" + row['remark'] + ") "
        # Clear ending
        stage = re.sub(r" +", " ", stage)
        stage = re.sub(r",? ?$", "", stage)

        # Dating: an explicit date wins, otherwise fall back to the century.
        # The text is computed per row, so nothing leaks from an earlier stage.
        when = None
        if pd.notna(row['date']):
            when = _etym_cell_text(row['date'])
        elif pd.notna(row['century']):
            when = _etym_century_label(row['century'])
            if when is None:
                print("Warning: unrecognised century '" + str(row['century'])
                      + "' in '" + key + "', dating omitted.")
        if when:
            stage += ", " + when

        # Clear ending
        stage = re.sub(r",? ?$", "", stage)

        # If both cognates and derivates
        if pd.notna(row['cognates']) and pd.notna(row['derivates']):
            stage += "; cf. cognates " + row['cognates'] + "; derivates " + row['derivates'] + " "
        # If cognates only
        if pd.notna(row['cognates']) and pd.isna(row['derivates']):
            stage += "; cf. cognates " + row['cognates'] + " "
        # If derivates only
        if pd.notna(row['derivates']) and pd.isna(row['cognates']):
            stage += "; cf. derivates " + row['derivates'] + " "

        # Clear ending
        stage = re.sub(r";?,? ?$", "", stage)

        # Adjust only the leading marker, so a '<' inside the text of the
        # stage is left untouched. At this point a non-first stage still
        # starts with the '<' it was initialised with.
        if index != 0:
            # Doubtful stage: '<\?' (the backslash escapes the '?' in Markdown)
            marker = "<\\?" if row['doubt'] == 'yes' else "<"
            # "Complex" stage: no '<'
            if pd.notna(row['complex']):
                marker = marker.replace("<", "")
            stage = marker + stage[1:]

        # Sources are collected here and cited once at the end; a missing
        # page still gets an (empty) slot to keep keys and pages aligned
        if pd.notna(row['source']):
            source_keys.append(row['source'])
            if pd.notna(row['source page']):
                source_pages.append(str(row['source page']))
            else:
                source_pages.append("")

        lines.append(stage)

    content = "".join(line + "\n" for line in lines)
    # Add the Hugo-cite shortcode syntax
    content += ' {{< cite "' + ";".join(source_keys) + '" "' + ";".join(source_pages) + '" >}}'

    # Remove left-to-right marks (U+200E)
    return content.replace("\u200e", "")

# Smoke test: render the etymology box of the 'tester' entry and display it as the cell output
etymbox("tester")

Started the generation of 'tester' as etymbox...


'**Language A** тест *test* /tɛst/ \'meaning1\' [literal1], explanation1 (remark1); cf. cognates cognates1; derivates derivates1\n< **Language B** тестер *tester* /ˈtɛstə/ \'meaning2\' [literal2], explanation2, AD 12 c.; cf. cognates cognates2\n< **Language C** тестинг *testing* /ˈtɛstɪŋ/ \'meaning3\' [literal3] (remark3), 9 c. BC; cf. derivates derivates3\n< **Language D** тесте *teste* /ˈaltə/ \'meaning4\' [literal4], explanation4 (remark4); cf. cognates cognates4; derivates derivates4\n {{< cite "oed;wehr_dictionary_1976;wehr_dictionary_1976;liddell_greekenglish_1940;wehr_dictionary_1976;lewis_latin_1879;liddell_greekenglish_1940" "1;2-3;4;5;6;;7" >}}'

In [25]:
# Render every etymology once and keep the Markdown boxes in a lookup table
# (word -> box), so pages can pull them in by key
dictionary_of_etymologies = {}
for key in list_of_etymologies:
    box = etymbox(key)
    # To wrap the box in a Hugo notice shortcode instead, see:
    # box = r'{{% notice style="primary" title="Pirates" icon="skull-crossbones" %}}' + "\n" + text + "\n" + r"{{% /notice %}}" + "\n\n"
    dictionary_of_etymologies[key] = box
print('Done.')


Started the generation of 'tester' as etymbox...
Started the generation of 'allspice' as etymbox...
Started the generation of 'fulful ifranji' as etymbox...
Started the generation of 'duoxiangguo' as etymbox...
Started the generation of 'pimento' as etymbox...
Started the generation of 'anise' as etymbox...
Started the generation of 'anisun' as etymbox...
Started the generation of 'huiqin' as etymbox...
Started the generation of 'asafoetida' as etymbox...
Started the generation of 'hing' as etymbox...
Started the generation of 'hiltit' as etymbox...
Started the generation of 'anjudan' as etymbox...
Started the generation of 'awei' as etymbox...
Started the generation of 'xingqu' as etymbox...
Started the generation of 'caraway' as etymbox...
Started the generation of 'karawiya' as etymbox...
Started the generation of 'geluzi' as etymbox...
Started the generation of 'cardamom' as etymbox...
Started the generation of 'amomum' as etymbox...
Started the generation of 'hal' as etymbox...
St

In [26]:
# Check: display the generated box of one entry to eyeball the result
dictionary_of_etymologies['allspice']

'**English** *allspice*, from *all* + *spice*; after the flavor profile that resembles the combined aroma of cloves, nutmeg, cinnamon, and black pepper\n {{< cite "oed" "allspice" >}}'

# Website Generation

## Create a Spice Page

In [27]:
# Photo by <a href="https://unsplash.com/@veruschkade?utm_content=creditCopyText&utm_medium=referral&utm_source=unsplash">Vera De</a> on <a href="https://unsplash.com/photos/red-and-brown-fur-on-gray-concrete-floor-hTE438DvDgg?utm_content=creditCopyText&utm_medium=referral&utm_source=unsplash">Unsplash</a>

In [28]:
def _split_labels(value):
    '''Split a ";"-separated cell into a list of stripped, non-empty labels.'''
    return [label.strip() for label in value.split(";") if label.strip()]


def spicepage(item):
    '''
    Generate the web page of one spice from the spice dataset and write it out as markdown.

    Reads the row of `item` from the global `df_items` and writes two files
    into `path_out_md`: `<key>_gen.md` (the generated page) and `<key>_bib.md`
    (the bibliography section), where `<key>` is the lowercased item name with
    spaces replaced by underscores. Note that it also switches pandas into
    copy-on-write mode globally.

    Parameters:
        item: name of the spice, as it appears in the 'item' column of `df_items`.

    Returns:
        None
    '''

    pd.options.mode.copy_on_write = True # to avoid SettingWithCopyWarning, https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

    # Get dataframe of current item
    print("Working on", item)
    df_local = df_items.loc[df_items['item'] == item]

    # Reset index, so that the item's row can be addressed with label 0
    df_local.reset_index(drop=True, inplace=True)

    # Generate key (used in file names and image/plot paths)
    key = re.sub(" ", "_", item).lower()

    # Description: one sentence with the optional clauses attached, followed
    # by an optional "See also" sentence. The sentence is closed exactly once,
    # whichever clauses are present.
    description = df_local['description'][0]
    description = description[0].upper() + description[1:]
    summary = description
    if pd.notna(df_local['en alt'].iloc[0]):
        summary += ", also known as " + str(df_local['en alt'].iloc[0])
    if pd.notna(df_local['related'].iloc[0]):
        summary += ", related to " + str(df_local['related'].iloc[0])
    summary += "."
    if pd.notna(df_local['see also'].iloc[0]):
        summary += " See also " + str(df_local['see also'].iloc[0]) + "."
    preamble_description = summary
    page_description = ">" + summary + "\n\n"

    # Extract categories and tags and groups (which will be treated as tags);
    # all three are ';'-separated and surrounding whitespace is not significant
    category = df_local['category'][0]
    category_list = _split_labels(category)
    tag = df_local['tag'][0]
    tag_list = _split_labels(tag)
    if pd.notna(df_local['group'][0]):
        tag_list = tag_list + _split_labels(df_local['group'][0])

    # Assemble preamble (TOML front matter); backslashes and double quotes
    # would otherwise end the basic string of the description early
    toml_description = preamble_description.replace('\\', '\\\\').replace('"', '\\"')
    preamble = f'+++\ntitle = "{item.title()}"\nauthor = "Gabor Parti"\ndate = "{str(date.today())}"\ndescription = "{toml_description}"\nweight = 10\n# draft = "true"\n# hidden = "true"\nplotly = true\ncategories = {str(category_list)}\ntags = {str(tag_list)}\nbibFile = "static/files/bibliography.json"\n+++\n\n'

    ###########################
    ######## The Spice ########
    ###########################

    # Overview
    overview_head = "## Overview\n\n"
    # Merge species name
    df_local['species name'] = "*" + df_local['species'] + "* " + df_local['species by']
    # Set link
    df_local['botanical database'] = "[POWO](" + df_local['powo'] + ")"
    # Prepare overview tables
    df_overview = df_local[['species name', 'family', 'part used', 'macroarea', 'region of origin', 'cultivation', 'color', 'botanical database']]
    # Transpose table: one row per property
    df_overview = df_overview.T
    # Reset index
    df_overview.reset_index(inplace=True)
    # Rename columns
    df_overview.columns = ['item', item]
    # Prepare data to create markdown table
    data = df_overview.to_dict(orient='records')
    # Create markdown table
    overview_mdt = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # Table
    overview = overview_head + overview_mdt + "\n\n"

    # Intro
    category = re.sub('; ', ' and ', category)
    tag = re.sub('; ', ' and ', tag)
    part = re.sub('; ', ' and ', df_local['part used'][0])
    intro = description + ". " + item.title() + " (" + str(df_local['species name'][0]) + ")" + " is a " + tag + " " + category + " from the *" + str(df_local['family'][0]) + "* family,[^powo] originating in the region(s) of " + str(df_local['region of origin'][0]) + ". It is used for its " + part + ", primarily for " + str(df_local['major uses'][0]) + ". Its aroma is described as " + str(df_local['taste/smell'][0]) + ", with a heat index of " + str(df_local['heat'][0]) + ".[^ucla_medicinal_2002]" + "\n\n"

    # Create references (footnotes used in the intro and the distribution caption)
    references = "[^powo]: POWO. (2022). Plants of the World Online (Botanical Database). Facilitated by the Royal Botanic Gardens, Kew. http://www.plantsoftheworldonline.org/\n[^ucla_medicinal_2002]: Medicinal Spices Exhibit. (2002). UCLA Biomedical Library: History & Special Collections. https://unitproj.library.ucla.edu/biomed/spice/index.cfm?spicefilename=taste.txt&itemsuppress=yes&displayswitch=0\n\n"

    # TODO: single lead photo built from 'img_count', 'img_source' and 'img_extension' (currently disabled)

    # Illustration # &classes=shadow
    if pd.notna(df_local['ill source'].iloc[0]):
        illustration_alt = "Illustration of " + df_local['species'][0] + " from " + df_local['ill source'][0]
        illustration = "![" + df_local['species name'][0] + '](/images/illustrations/' + key + '.png?width=40rem "' + illustration_alt + '")' + '\n'
        illustration_source = df_local['ill source'][0] + r"{{< cite -" + str(df_local['ill key'][0]) + r" >}} " + str(df_local['ill page'][0]) + r"."
        illustration = illustration + "\n>Illustration of " + df_local['species'][0] + " from " + illustration_source
        if pd.notna(df_local['ill link'].iloc[0]):
            illustration = illustration + " [{{% icon image %}}](" + df_local['ill link'].iloc[0] + ")"
    else:
        illustration = ""
    illustration = illustration + "\n\n"

    # Top display gallery, only if the item's photo folder holds more than one file
    if len(list_files(website_photos + key)) > 1:
        display_gallery = '{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '" hover-effect="slideup" caption-effect="fade" caption-position="none" />}}' + "\n\n"
    else:
        display_gallery = ""

    # Gallery on the bottom of the page, only if the item has a gallery folder
    if os.path.isdir(website_photos + key + "/gallery"):
        gallery =  '## Gallery\n\n{{< load-photoswipe >}}\n\n{{< gallery dir="/images/photos/' + key + '/gallery" hover-effect="slideup" caption-effect="fade" />}}' + "\n\n"
    else:
        gallery = ""

    # Quick names (predefined languages); joined with a separator, so that
    # no separator is left dangling when the last name is missing
    quick_parts = []
    if pd.notna(df_local['English'][0]):
        quick_parts.append("**English:** " + df_local['English'][0])
    if pd.notna(df_local['Hungarian'][0]):
        quick_parts.append("**Hungarian:** " + df_local['Hungarian'][0])
    if pd.notna(df_local['Arabic'][0]):
        quick_parts.append('**Arabic:** <span class="arabic-text" dir="rtl">' + df_local['Arabic'][0] + '</span>')
    if pd.notna(df_local['Chinese'][0]):
        quick_parts.append('**Chinese:** <span class="chinese-text">' + df_local['Chinese'][0] + '</span>')
    if pd.notna(df_local['French'][0]):
        quick_parts.append("**French:** " + df_local['French'][0] + ". ")
    # TODO: fall back to a machine translation (WN, DL or GT) when there is no French name
    quick_names = " · ".join(quick_parts)

    quick_names = '<p style="text-align:center;">\n\n' + quick_names + '\n\n</p>\n\n'

    # Distribution
    # NOTE(review): the shortcode is passed weight="600"; this looks like it was meant to be width - confirm against the plotly shortcode
    distribution = "## Distribution\n\n"
    distribution = distribution + r'{{< load-plotly >}}' + '\n\n' + r'{{< plotly json="/aromatica/plotly/distributions/dist_' + key + r'.json" weight="600" height="300" >}}' + '\n\n'
    distribution = distribution + ">Native and introduced habitats of " + str(df_local['species name'][0]) + "[^powo]\n\n"
    # Check if 'native regions' is not empty and not 'NA' before adding to the string
    regions = ""
    if pd.notna(df_local['native regions'][0]) and df_local['native regions'][0] != 'NA':
        regions += "**Native regions:** &ensp; &ensp; &ensp; " + df_local['native regions'][0] + "\n\n"
    # Check if 'introduced regions' is not empty and not 'NA' before adding to the string
    if pd.notna(df_local['introduced regions'][0]) and df_local['introduced regions'][0] != 'NA':
        regions += "**Introduced regions:** " + df_local['introduced regions'][0] + "\n\n"
    distribution += '<p style="text-align:left;">\n\n' + regions + '</p>\n\n'

    ##################################
    ######## The Nomenclature ########
    ##################################

    # TODO: per-language name tables (English, Arabic, Chinese) are currently disabled
    names = ""

    ################################
    ####### The Etymologies #######
    ################################

    # TODO: etymology boxes and etymology maps of the item's words are currently disabled
    etymology_section = ""

    # Bibliography
    bibliography = "# Bibliography\n\n{{< bibliography cited >}}\n\n"

    # Write markdown file
    with open(path_out_md + key + '_bib.md', 'w', encoding='utf-8') as f:
        f.write(bibliography)

    ######## Assemble page ########
    page = preamble + display_gallery + page_description + quick_names + overview + intro + illustration + distribution + names + etymology_section + gallery + references

    # Write markdown file
    with open(path_out_md + key + '_gen.md', 'w', encoding='utf-8') as f:
        f.write(page)
    return

In [29]:
# Loop through all spices and generate the <key>_gen.md and <key>_bib.md files of each
for key in list_of_items:
    spicepage(key)
print("Done.")

Working on Sichuan pepper
Working on allspice
Working on anise
Working on asafoetida
Working on caraway
Working on cardamom
Working on cassia
Working on chile
Working on cinnamon
Working on clove
Working on coriander
Working on cumin
Working on dill


Working on fennel
Working on fenugreek
Working on ginger
Working on long pepper
Working on mace
Working on nutmeg
Working on pepper
Working on saffron
Working on star anise
Working on turmeric
Working on vanilla
Done.


## Merge autogenerated files with manuscripts

In [30]:
def build_page(key):
    '''
    Merge a generated web page with its hand written part (manuscript), if there is one, and its page bibliography into the final markdown file shown on the website. Gen and bib are generated above, manuscripts are not. E.g., allspice_gen.md + allspice_ms.md + allspice_bib.md = allspice.md.

    Parameters:
        key: name of the item; spaces are replaced by underscores to form the file name.

    Returns:
        None. Writes `<filename>.md` into `website_md`.

    Raises:
        FileNotFoundError: if the generated page or bibliography of the item is missing.
    '''
    def read_text(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    filename = re.sub(" ", "_", key)
    # The generated files are written with a lowercased key, so they must be
    # looked up the same way to be found on case-sensitive file systems
    generated_name = filename.lower()

    # Read generated file
    parts = [read_text(os.path.join(path_out_md, generated_name + '_gen.md'))]

    # Read manuscript file, if there is one (in either spelling of the name)
    for candidate in dict.fromkeys([filename, generated_name]):
        manuscript_path = os.path.join(website_md, 'manuscripts', candidate + '_ms.md')
        if os.path.isfile(manuscript_path):
            parts.append(read_text(manuscript_path))
            break

    # Read bib file
    parts.append(read_text(os.path.join(path_out_md, generated_name + '_bib.md')))

    # Assemble
    page = "\n\n".join(parts)

    # Write out page file (the name keeps the case of the key, as before)
    with open(os.path.join(website_md, filename + '.md'), 'w', encoding='utf-8') as f:
        f.write(page)
    return

In [31]:
# Assemble the final website page of every item
for key in list_of_items:
    build_page(key)

In [32]:
# ...measure time: report when the run finished and how long it took
# (start_time is set in an earlier cell, outside this section)
end_time = datetime.now()
print("All done at " + str(end_time) + ".")
print('Duration: {}'.format(end_time - start_time))

All done at 2023-11-01 15:43:22.196978.
Duration: 0:00:04.618657


# Maps

In [33]:
# Work on a copy, so that columns added for plotting do not end up in df_items
df = df_items.copy()

In [34]:
# # Basic example with Plotly Go
# # Create figure data
# fig = go.Figure(data=go.Scattergeo(
#         lon = df['lon'],
#         lat = df['lat'],
#         text = df['item'],
#         mode = 'markers',
#         # marker_color = df['cnt'],
#         ))

# # Update layout
# fig.update_layout(
#         title = 'Title',
#         geo_scope='world',
#         template = 'plotly_dark'
#     )

# # Show figure
# fig.show()

### Map settings

In [35]:
# Visual variables for map (dark mode), shared by the trace and layout settings below
# NOTE(review): transparent, quarter_transparent and prism are not defined in this cell;
# presumably they come from `from palette import *` - confirm

# Text
font_size = 14
font_color = "white"
font_family = "Sans-Serif"
# Markers
marker_symbol= 'circle'
marker_size = 14
max_marker_size = 32
edge_color = transparent
edge_size = 1
opacity = 0.7
line_width = 4
# Map colors (shades of dark grey)
water = "#202020"
grid_color = "#282828"
land = "#303030"
lines = "#383838"
copyright_color = "#404040"
# Backgrounds and color sequence
background_color = transparent
legend_background_color = quarter_transparent
color_scheme = prism

In [36]:
# Orthographic globe layout

# Trace settings (labels, hover box, markers), to be applied with fig.update_traces(ortho_traces)
# NOTE(review): the customdata indices in the hover template presumably follow the
# order of the hover_data columns of the plotting call - confirm when changing either
ortho_traces = dict(
    textposition = 'top right', # middle left, bottom center, etc.
    textfont = dict(size=font_size, color=font_color, family=font_family),
    hovertemplate=
        "<b>%{text}</b><br><br>" +
        "Species: <i>%{customdata[0]}</i><br>" +
        "Family: <i>%{customdata[1]}</i><br>" +
        "Region of origin: %{customdata[2]}<br>" +
        "Arabic: %{customdata[3]} <i>%{customdata[4]}</i><br>" +
        "Chinese: %{customdata[5]} <i>%{customdata[6]}</i><br>" +
        # "Spreadability: %{customdata[7]:.2f}<br>" +
        "<extra></extra>",
    marker = dict(
        symbol = marker_symbol,
        size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size
        )
    )
)

# Layout settings (globe, legend, title, margins), to be applied with fig.update_layout(ortho_layout)
ortho_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', # orthographic, natural earth
        projection_scale = 1,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

# "Document size" for pdfs
document_size = dict(width = 600, height=600)

# Copyright annotation, centered on the bottom edge of the figure
cr = dict(
    name="copyright",
    text="© Gábor Parti, 2023",
    font=dict(color=copyright_color, size=8, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0,
    # xanchor="right",
    # yanchor="bottom",
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [cr]) # to call

## Info annotation, placed slightly above the copyright
info = dict(
    name="info",
    text="Click on a material to navigate to its corresponding page!",
    font=dict(color=font_color, size=font_size, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0.05,
    # xanchor="right",
    # yanchor="bottom",
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [info]) # to call

# Adding layout images: logo anchored to the bottom right corner
logo = dict(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", 
    yanchor="bottom", 
)
# fig.add_layout_image(logo) # to call

In [37]:
# Layout for choropleth maps (regions and distributions)
# NOTE: although named after the natural earth projection, the projection is
# currently set to orthographic (zoomed in with projection_scale = 2); the
# natural earth alternatives are kept in the trailing comments

# Trace settings (hover box), to be applied with fig.update_traces(ne_traces)
ne_traces = dict(
    hovertemplate=
        "Region: <i>%{customdata[0]}</i><br>" +
        "Code: <i>%{customdata[1]}</i><br>" +
        "<extra></extra>",
)

# Layout settings (globe, legend, title, margins), to be applied with fig.update_layout(ne_layout)
ne_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', #'natural earth',
        projection_scale = 2,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines,
        showframe=True, framewidth = 1, framecolor = lines,
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land,
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=True, subunitwidth = 0, subunitcolor = lines,
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(xanchor="left", yanchor="bottom", 
                # x=0.1, y=0.1, # for natural earth
                x=0, y=0, # for orthographic
                bgcolor=legend_background_color,
                font=dict(color=font_color, size=font_size, family=font_family),
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white",
                    font_size=font_size,
                    font_family=font_family),
    )

## Plot

In [38]:
# Set size: the same value for every item, so that all markers are drawn equally large
df['size'] = 1

# Create figure data: one marker per item, colored by botanical family
data = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='item',
    color='family',
    color_discrete_sequence=color_scheme,
    size_max = max_marker_size,
    size = 'size',
    opacity = opacity,
    hover_name='item',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'ar transliteration':True, 'Chinese':True, 'pinyin':True, 'lon':False, 'lat':False, 'size':False}, #'spreadability':':.2f', 
    labels={"group": "category"}
    )

# Save figure data (fig and data are the same object, this does not copy the figure)
fig = data

###################################################
# Interactive visualization (HTML/JSON) for the web

# Call the orthographic traces and layout settings from above
fig.update_traces(ortho_traces)
fig.update_layout(ortho_layout)
# fig.update_layout(title_text = "Title")
# fig.update_layout(basemap_visible=True)

# Add copyright
fig.update_layout(annotations=[cr])

# Show figure
fig.show()

# Write interactive visualization (HTML/JSON) for the web
filename = "home"
# fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

###################################################
# Image (PNG/PDF) for documents (currently disabled)

# Call figure data
# fig = data

# Call the orthographic traces and layout settings from above
# fig.update_traces(ortho_traces)
# fig.update_layout(ortho_layout)
# fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(document_size)

# Show figure
# fig.show()

# filename = "home"
# fig.write_image(path_out_png + filename + ".png", scale=3)
# fig.write_image(filename + ".pdf", engine="kaleido")


In [39]:
# Move the generated plotly JSON files from the output folder to the website folder
# (move_dir is defined earlier in the notebook, outside this section)
move_dir(path_out_json, website_json, "*.json")

# Other maps

In [40]:
# # Basic map to show some countries, not very good

# import plotly.express as px
# import geopandas as gpd

# # Load the world map data
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Filter the data to include only India, Sri Lanka, and Maldives
# target_countries = ['India', 'Sri Lanka', 'Maldives', 'Wales']
# filtered_world = world[world['name'].isin(target_countries)]

# # Create a choropleth map
# fig = px.choropleth(
#     filtered_world,
#     locations='iso_a3',
#     color='name',
#     color_discrete_sequence = color_scheme,
#     projection='orthographic',
#     title='Highlighted Countries: India, Sri Lanka, Maldives'
# )

# # Call the orthographic traces and layout settings from above
# fig.update_layout(ortho_layout)
# # fig.update_layout(title_text = "Title")

# # Show the map
# fig.show()


In [41]:
# Start again from a fresh copy of the items
df = df_items.copy()

# Example item to try the map functions below with
key = "saffron"

## Habitat maps

In [42]:
def _split_regions(value):
    '''
    Turn a comma-separated cell of region names into a list of names.
    A missing (NA) cell gives an empty list.
    '''
    if pd.isna(value):
        return []
    return value.split(', ')


def habitat_map(key):
    '''
    This function generates a map of the distributions of a spice and saves it as Plotly JSON.

    The first row of `df_items` whose 'item' column equals `key` supplies two
    comma-separated lists of region names ('native regions' and 'introduced regions').
    Regions of the level 3 GeoJSON whose name appears in either list are drawn as a
    choropleth coloured by status. The figure is written to
    path_out_json + "dist_<key>.json", where <key> is lowercased and has spaces
    replaced by underscores.

    Parameters:
        key: value to look up in the 'item' column of `df_items`.

    Returns:
        None. The result is the JSON file written to disk.

    Raises:
        KeyError: if no row of `df_items` matches `key`.
    '''

    # Load the geographic data (shapefile or GeoJSON) # https://github.com/tdwg/wgsrpd
    # Forward slashes keep the path valid on Windows, Linux and macOS alike
    gdf = gpd.read_file("data/resources/geo/level3.geojson")

    # Rename columns
    gdf.columns = ['name', 'code', 'code_l2', 'code_l1', 'geometry']

    # Get the rows for this item as a new, re-indexed frame
    # (no in-place edit of a slice of df_items)
    df_local = df_items.loc[df_items['item'] == key].reset_index(drop=True)
    if df_local.empty:
        raise KeyError(f"No item named {key!r} in df_items")

    # Get lists of native and introduced regions; a missing cell gives an empty list
    native_regions = _split_regions(df_local.loc[0, 'native regions'])
    introduced_regions = _split_regions(df_local.loc[0, 'introduced regions'])

    # Filter gdf by its 'name' column and tag each selection with its status.
    # assign() returns a new frame and also works when the selection is empty.
    filtered_data_native = gdf[gdf['name'].isin(native_regions)].assign(region='native')
    filtered_data_introduced = gdf[gdf['name'].isin(introduced_regions)].assign(region='introduced')

    # Concatenate the filtered dataframes
    filtered_data = pd.concat([filtered_data_native, filtered_data_introduced])

    # Create the choropleth map
    # The colour map pins each status to its colour, so a map that only has
    # introduced regions is not drawn in the colour meant for native ones
    fig = px.choropleth(
        filtered_data,
        geojson = filtered_data.geometry,
        locations = filtered_data.index,
        color = 'region',
        color_discrete_map = {'native': '#88ae43', 'introduced': '#6943ae'},
        hover_data = {'name': True, 'code': True},
        projection = 'natural earth'
    )

    # Call layout
    fig.update_traces(ne_traces)
    fig.update_layout(ne_layout)

    # Adjust map bounds
    # fig.update_geos(fitbounds='locations')

    # Amend projection rotation with the centroid of the first matched native region.
    # Skipped when no native region name matched the GeoJSON, so the layout's own
    # rotation is kept instead of failing on an empty selection.
    if not filtered_data_native.empty:
        native_centroid = filtered_data_native.centroid
        native_centroid_lon = native_centroid.x.iloc[0]
        native_centroid_lat = native_centroid.y.iloc[0]
        fig.update_layout(geo=dict(projection_rotation = {'lat': native_centroid_lat, 'lon': native_centroid_lon, 'roll': 0}))

    # Show the map
    # fig.show()

    # Save
    filename = "dist_" + key.replace(" ", "_").lower()
    # fig.write_html(path_out_html + filename + ".html")
    fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)
    # fig.write_image(path_out_png + filename + ".png", scale=3)
    return

In [43]:
# # Loop through all spices
# for key in list_of_items:
#     print(key)
#     habitat_map(key)
# print("Done.")

# # Move files to the website folder
# move_dir(path_out_json, website_json + "/distributions", "*.json")

# Notes

In [44]:
# print(gpd.datasets.available)
# import geodatasets
# geodatasets.data

# with open("data\\resources\\geo\\level3.geojson", 'r') as f:
#     geojson_data = json.load(f)

# geojson_data