# Generate Website

# Libraries

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import re

from collections import defaultdict
from datetime import date, datetime, timedelta
from py_markdown_table.markdown_table import markdown_table
import nltk
from nltk.corpus import wordnet as wn
# nltk.download("wordnet")
# nltk.download("omw-1.4")
# nltk.download("extended_omw") # if you want the wiktionary data

# pip install babelnet

from plotly.io import write_image, write_json
import plotly.express as px
import plotly.graph_objs as go
from palette import *

# Paths

In [2]:
path_in = "data/"
path_out_tex = "output/tex/"
path_out_md = "output/md/"
path_out_json = "output/json/"
path_out_html = "output/html/"

# # Old
# path_in = "data/"
# path_out_html = "output/html/"
# path_out_json = "output/json/"
# path_out_md = "output/md/"
# path_out_pdf = "output/pdf/"
# path_out_png = "output/png/"
# path_out_tex = "output/tex/"
# path_out_draft = "website/drafts/"

# destination_pdf = "thesis/imgs/plots/"
# destination_tex = "thesis/envs/"
# destination_html = "website/static/plotly/"
# destination_json = "website/static/plotly/"

website_md = "../content/materials/"
website_json = "../static/plotly/"

# Functions

In [3]:
################################################################################
# List all files in a folder, including subfolders
def list_files(dir):                                                                                                  
    r = []                                                                                                            
    subdirs = [x[0] for x in os.walk(dir)]                                                                            
    for subdir in subdirs:                                                                                            
        files = os.walk(subdir).__next__()[2]                                                                             
        if (len(files) > 0):                                                                                          
            for file in files:                                                                                        
                r.append(os.path.join(subdir, file))                                                                         
    return r

################################################################################
# Move or copy files between folders
import os, shutil, pathlib, fnmatch

def move_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.move(os.path.join(src, f), os.path.join(dst, f))

def copy_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.copy(os.path.join(src, f), os.path.join(dst, f))

################################################################################
# Roman numerals from Arabic numerals
def roman(num: int) -> str:

    chlist = "VXLCDM"
    rev = [int(ch) for ch in reversed(str(num))]
    chlist = ["I"] + [chlist[i % len(chlist)] + "\u0304" * (i // len(chlist))
                    for i in range(0, len(rev) * 2)]

    def period(p: int, ten: str, five: str, one: str) -> str:
        if p == 9:
            return one + ten
        elif p >= 5:
            return five + one * (p - 5)
        elif p == 4:
            return one + five
        else:
            return one * p

    return "".join(reversed([period(rev[i], chlist[i * 2 + 2], chlist[i * 2 + 1], chlist[i * 2])
                            for i in range(0, len(rev))]))

def century(year):
    return (year) // 100 + 1 

# print(roman(17))

################################################################################
# Get coordinates for a place
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="MyApp")

def coordinates(place):
    location = geolocator.geocode(place)
    lat, lon = location.latitude, location.longitude
    coord = [lat, lon]
    return coord
# print(coordinates("Hong Kong"))

# Generate geo-coordinates from location in a df
def generate_coordinates(df):
    for index, row in df.iterrows():
        if pd.notna(row['location']):
            location = geolocator.geocode(row['location'])
            df.at[index, 'lat_gen'] = location.latitude
            df.at[index, 'lon_gen'] = location.longitude
    return df

################################################################################
# Convert Chinese Text to Simplified if needed
import opencc
tcsc = opencc.OpenCC('t2s.json')
sctc = opencc.OpenCC('s2t.json')
print(tcsc.convert('錫蘭肉桂'), sctc.convert('锡兰肉桂'))

# ################################################################################
# Transcribe Chinese into pinyin or jyutping
import pinyin
import jyutping
py = pinyin.get('錫蘭肉桂')
jp = jyutping.get('錫蘭肉桂')
print(py)
print(' '.join(jp))

# ################################################################################
# Wordnets using the Open Multilingual WordNet (https://omwn.org/)
wn_langs = ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'slv', 'spa', 'swe', 'tha', 'zsm'] # 100%: cmn, fin, hrv
# print(wn.synset('allspice.n.03').definition())
# print(wn.synsets('allspice', pos='n'))
# print(wn.synset('allspice.n.03').lemma_names('ita'))

def wn_define(df):
    for index, row in df.iterrows():
        if pd.notna(row['wn']):
            wn_definition = wn.synset(row['wn']).definition()
            df.at[index, f"wn_def"] = wn_definition
    return df

def wn_translate(df, lan):
    for index, row in df.iterrows():
        if pd.notna(row['wn']):
            translated_list = wn.synset(row['wn']).lemma_names(lan)
            df.at[index, f"wn_{lan}"] = str(translated_list)
    return df

################################################################################
# Translate with DeepL, use any translator you like, in this example GoogleTranslator
# ChatGptTranslator, MicrosoftTranslator, DeeplTranslator; need API
from deep_translator import GoogleTranslator as dl
translated = dl(source='en', target='hu').translate("allspice") # api_key=openai
print(translated)

def translate(df, lg):
    for index, row in df.iterrows():
        if pd.notna(row['en']):
            translated = dl(source='en', target=lg).translate(row['en'])
            df.at[index, f"{lg}_gen"] = translated
    return df

################################################################################
# Hex to RGBA
from PIL import ImageColor as ic
def hex_to_rgba(hex):
    rgb = str(ic.getrgb(hex))
    rgba = re.sub('\)', ', 1.0)', rgb)
    return rgba

# hex_to_rgba(nord0)

# ################################################################################
# # Convert PDFs
# from pdf2image import convert_from_path

# def convert_pdf_to_png(file):
#     name = str(file)
#     name = re.sub(".*(?=/)", "", name)
#     name = re.sub("\..*", "", name)
#     pages = convert_from_path(file, 0)
#     for page in pages:
#         page.save(path + name + ".png", 'PNG')

################################################################################

# Regex cheatsheet

# (?!) - negative lookahead
# (?=) - positive lookahead
# (?<=) - positive lookbehind
# (?<!) - negative lookbehind

# (?>) - atomic group


锡兰肉桂 錫蘭肉桂
xílánròugùi
sek3 laan4 juk6 gwai3
vegyesfűszer


In [4]:
# import requests
 
# def fetch_wikidata(params):
#     url = 'https://www.wikidata.org/w/api.php'
#     try:
#         return requests.get(url, params=params)
#     except:
#         return 'There was and error'


# # What text to search for
# query = 'Elettaria cardamomum'
 
# # Which parameters to use
# params = {
#         'action': 'wbsearchentities',
#         'format': 'json',
#         'search': query,
#         'language': 'en'
#     }
 
# # Fetch API
# data = fetch_wikidata(params)
 
# #show response as JSON
# data = data.json()
# data

In [5]:
# Start timer
start_time = datetime.now()

# Data

## Spices

In [6]:
# Read and store content of an excel file 
df = pd.read_excel(path_in+"spices.xlsx")

# Write the dataframe object into csv file
df.to_csv (path_in+"spices.csv", index = None, header=True)

# Load in dataset of spices as a dataframe
df = pd.read_csv(path_in+'spices.csv', header=[0], delimiter=',', encoding="utf-8")

# Select ones to include
df = df.loc[(df['include'] == "in")]

# Info
list_of_spices = df['id'].tolist()
list_of_spices.sort()
print(len(list_of_spices), "spices in total.\n", list_of_spices)

# Add links
df['url'] = "https://partigabor.github.io/aromatica/materials/" + df['id'].str.replace(" ", "_")

# Add counts of regions
df['no. of native regions'] = df['native regions'].str.count(',') + 1
df['no. of introduced regions'] = df['introduced regions'].str.count(',') + 1




# Generate geo-coordinates from location
# generate_coordinates(df)

# Get a definition from wordnet
# wn_define(df)

# Translate the names to other languages using OMWN
# wn_translate(df, 'fra')

# Machine ranslate the names to other languages # https://developers.google.com/admin-sdk/directory/v1/languages
# translate(df, 'hi')

# Assign
df_spices = df.copy()
df.head()

24 spices in total.
 ['Sichuan pepper', 'allspice', 'anise', 'asafoetida', 'caraway', 'cardamom', 'cassia', 'chile', 'cinnamon', 'clove', 'coriander', 'cumin', 'dill seeds', 'fennel seeds', 'fenugreek', 'ginger', 'long pepper', 'mace', 'nutmeg', 'pepper', 'saffron', 'star anise', 'turmeric', 'vanilla']


Unnamed: 0,include,key,1,2,3,4,5,6,group,id,...,EOL,FOC,Hindi,Hi translit,Hi literal,Hi alt,Indonesian,Malay,Persian,url
0,in,S0001,M,P,D,,1.0,1.0,,allspice,...,,,गंधद्रव्य?,gandhadravya?,,,,,,https://partigabor.github.io/aromatica/materia...
1,in,S0002,A,P,A,,1.0,1.0,,anise,...,,,मोटी सौंफ़,moti saunf,fat fennel,,adas manis,,بادیان رومی، انیسون,https://partigabor.github.io/aromatica/materia...
2,in,S0003,A,F,F,,1.0,1.0,,asafoetida,...,,,हींग,hīng,,,,,,https://partigabor.github.io/aromatica/materia...
3,in,S0004,A,C,C,,1.0,1.0,,caraway,...,,,,,,,,jintan,,https://partigabor.github.io/aromatica/materia...
4,in,S0005,Z,E,C,,1.0,1.0,cardamoms,cardamom,...,https://eol.org/pages/1120064,,,,,,,,,https://partigabor.github.io/aromatica/materia...


## Names

In [7]:
# Read and store content of an excel file 
df = pd.read_excel(path_in+"names.xlsx")

# Write the dataframe object into csv file
df.to_csv (path_in+"names.csv", index = None, header=True)

# Load in dataset of names
df = pd.read_csv(path_in+'names.csv', header =[0], delimiter=',', encoding="utf-8")

# Select ones to include
df = df.loc[df['include'] == 'yes'] # include ones to include


In [8]:

# Change NaN to empty string
df.fillna('', inplace=True)

# Info
print(df.shape[0], "names in total.")

# Assign
df_names = df.copy()


360 names in total.


## Etymologies

In [9]:
# Read and store content of an excel file 
read_file = pd.read_excel(path_in+"etymologies.xlsx")

# Write the dataframe object into csv file
read_file.to_csv (path_in+"etymologies.csv", index = None, header=True)

# Load in dataset
df_etymologies=pd.read_csv(path_in+'etymologies.csv', header =[0], delimiter=',', encoding="utf-8")

# Split the dataset wherever an empty row is found
df_list_with_na = np.split(df_etymologies, df_etymologies[df_etymologies.isnull().all(1)].index)

# Drop NAs and reset the index
df_list = []
for df in df_list_with_na:
  df = df.dropna(how='all')
  df.reset_index(inplace=True, drop=True)
  df_list.append(df)

# Automatically extract IDs from the dataset
list_of_etymologies = []
for df in df_list:
  # print(df['id'].iloc[0])
  df = df.dropna(how='all')
  df.reset_index(drop=True)
  id = str(df['id'].iloc[0])
  list_of_etymologies.append(id)

# Print the number if IDs and what are they
length = len(df_list)
print(length, "words in total")
print(list_of_etymologies)


#Create a defaultdict of spice-word etymologies
etymologies=defaultdict(list)
for i in range(length):
  etymologies[list_of_etymologies[i]]=df_list[i]

# Testing
# print(etymologies['saffron'])


84 words in total
['tester', 'allspice', 'fulful ifranji', 'duoxiangguo', 'pimento', 'anise', 'anisun', 'huiqin', 'asafoetida', 'hing', 'hiltit', 'anjudan', 'awei', 'xingqu', 'caraway', 'karawiya', 'geluzi', 'cardamom', 'amomum', 'hal', 'qaqulla', 'doukou', 'cassia', 'salikha', 'rougui', 'cinnamon', 'darsini', 'qirfa', 'chile', 'fulful harr', 'lajiao', 'paprika', 'clove', 'qaranful', 'dingxiang', 'coriander', 'kuzbura', 'yansui', 'husui', 'cumin', 'kammun', 'ziran', 'dill', 'shibitt', 'shiluo', 'fennel', 'shamar', 'huixiang', 'fenugreek', 'hulba', 'huluba', 'ginger', 'zanjabil', 'jiang', 'long pepper', 'darfilfil', 'biba', 'mace', 'basbas', 'roudoukoupi', 'nutmeg', 'jawz al-tib', 'roudoukou', 'pepper', 'fulful', 'hujiao', 'bors', 'saffron', 'zafaran', 'fanhonghua', 'Sichuan pepper', 'fagara', 'fulful sitshuwan', 'huajiao', 'star anise', 'yansun najmi', 'bajiaohuixiang', 'badian', 'turmeric', 'kurkum', 'jianghuang', 'vanilla', 'faniliya', 'xiangcao']


## Etymology box for LaTeX

In [10]:
# key = "tester"

# ################################################################################

# # The following code will create an etymology box environment for the key, to be used in LaTeX
# print("Started the generation of '" + key + "' as etymbox...")

# df_local = etymologies[key]
# # df_local.fillna('', inplace=True)

# # # Skipping those marked
# df_local = df_local[df_local['boxskip'] != 'yes']
# df_local.reset_index(inplace=True, drop=True)

# # # Replace empty cells with NaNs
# # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

# # Create content and sources stage by stage
# content = ""
# source = ""
# sources = ""
# source_list = []
# nl = "\n"

# for index, row in df_local.iterrows():
#   stage = "< "
#   if pd.notna(row['complex']): # complex relationships
#     row['complex'] = re.sub("and from", "+", row['complex'])
#     stage += row['complex'] + " "
#   if pd.notna(row['language']): # language
#     stage += "\\textbf{" + row['language'] + "} "
#   if pd.notna(row['script']): # script
#     script = "{" + row['script'] + "} "
#     if row['language'] == 'Chinese':
#       script = "\\tc{" + row['script'] + "} "
#     stage += script
#   if pd.notna(row['term']): # term
#     stage += "\\textit{" + row['term'] + "} "
#   if pd.notna(row['IPA']): # IPA
#     stage += row['IPA'] + " "
#   if pd.notna(row['meaning']): # meaning
#     stage += "`" + row['meaning'] + "' "
#   if pd.notna(row['literal']): # literal meaning
#     stage += "[" + row['literal'] + "] "
#   stage = re.sub(' $', '', stage)
#   stage += ", "
#   if pd.notna(row['explanation']): # explanation
#     stage += row['explanation'] + " "
#   if pd.notna(row['remark']): # remark
#     stage += "(" + row['remark'] + ") "
#   stage = re.sub(',? ?$', '', stage)

#   if pd.notna(row['date']): # dates
#     stage += ", "
#     row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#     row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#     if re.match('^-\d\d?$', row['date']): # if is a century BC
#       row['date'] = re.sub("-", "", row['date'])
#       date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#     elif re.match('^\d\d?$', row['date']): # if is a century AD
#       date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#       # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#     else:
#       date = row['date'] + " " # if it's a year
#     stage += date
#     stage = re.sub(',? ?$', '', stage)
#   if pd.notna(row['cognates']): # cognates
#     stage += "; cf. cognates " + row['cognates'] + " "
#   if pd.notna(row['derivates']): # cognates
#     if pd.notna(row['cognates']):
#       stage = re.sub(' $', '', stage)
#       stage += "; " + row['derivates'] + " "
#     else:
#       stage = re.sub(' $', '', stage)
#       stage += "; cf. " + row['derivates'] + " "
#   stage = re.sub(',? ?$', '', stage)
#   # stage = re.sub('cf\..*?(cf\.)', '', stage)

# # Final touches
#   if row['doubt'] == 'yes':
#     stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#   if row['complex'] == '+':
#     stage = re.sub('<', '', stage)
#   if row['complex'] == 'or from':
#     stage = re.sub('<', '', stage)
#   content += stage + nl

# # Sources
#   source=""
#   if pd.notna(row['source zotero']):
#     source = row['source zotero']
#     print("1",source)
#     if '{' in source:
#       source = "s" + row['source zotero'].lower()
#       print(source)
#     else:
#       source = "{" + row['source zotero'].lower() + "}"
#       print(source)
#     if pd.notna(row['source page']):
#       source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#       print("4",source)
#       if row['source page'].isalpha() == True:
#         source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         print("5",source)
#     source = "\\textcite" + source
#   print(source)
#   source_list.append(source)

# # clear duplicates from sources:
# print("SL1: ", source_list)
# # source_set = sorted(set(source_list), key=source_list.index)
# source_set = set(source_list)
# print("SS2: ", source_set)
# source_list2 = list(source_set)
# print("S3: ", source_list2)
# sources_unduplicated = '; '.join(source_list2)
# print("S4: ", sources_unduplicated)
# # test for duplicates
# newlist = [] # empty list to hold unique elements from the list
# duplist = [] # empty list to hold the duplicate elements from the list
# for i in source_list:
#     if i not in newlist:
#         newlist.append(i)
#     else:
#         duplist.append(i) # this method catches the first duplicate entries, and appends them to the list

# # The next stage is to print the duplicate entries, and the unique entries
# print("List of duplicates", duplist)
# print("Unique Item List", newlist) 
# if len(duplist) > 0:
#   # print("UNDUPL")
#   sources = sources_unduplicated
# else:
#   # print("ORI")
#   sources =  '; '.join(source_list)
# # print("S5: ", sources)

# sources =  '; '.join(source_list)

# # Cleaning
# sources = re.sub("; $", "", sources)
# sources = re.sub("^; ", "", sources)
# sources = re.sub("(; )+", "; ", sources)
# sources = "\\footnote{" + sources + "}\n"

# content = re.sub("\n$", "", content)
# content = re.sub(r"^< ", "", content) # delete the first <
# content = re.sub(r"\n,", ",", content)
# content = re.sub(r" nan ", " ", content)
# content = re.sub("(<\.\n?)+$", "", content)

# content += sources

# env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['id'].iloc[0] + "}" + nl 
# env_end = r"\end{etymology}"

# box = env_begin + content + env_end
# box = re.sub(r"\u200e", "", box) #removes right to left mark

# # Save the spicebox as a standalone tex file

# filename = re.sub(" ", "_", key)
# filename = filename.lower()
# f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')
# f.write(box)
# f.close()
# print("Etymology-box '" + str(key) + "' as a tex file was created.")
# box


In [11]:
# def etymbox(key):
  
#   # The following code will create a etymology box environment for the key, to be used in LaTeX
#   print("Started the generation of '" + key + "' as etymbox...")

#   df_local = etymologies[key]
#   # df_local.fillna('', inplace=True)

#   # # Skipping those marked
#   df_local = df_local[df_local['boxskip'] != 'yes']
#   df_local.reset_index(inplace=True, drop=True)

#   # # Replace empty cells with NaNs
#   # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
#   # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

#   # Create content and sources stage by stage
#   content = ""
#   source = ""
#   sources = ""
#   source_list = []
#   nl = "\n"

#   for index, row in df_local.iterrows():
#     stage = "< "
#     if pd.notna(row['complex']): # complex relationships
#       row['complex'] = re.sub("and from", "+", row['complex'])
#       stage += row['complex'] + " "
#     if pd.notna(row['language']): # language
#       stage += "\\textbf{" + row['language'] + "} "
#     if pd.notna(row['script']): # script
#       script = "{" + row['script'] + "} "
#       if row['language'] == 'Chinese':
#         script = "\\tc{" + row['script'] + "} "
#       stage += script
#     if pd.notna(row['term']): # term
#       stage += "\\textit{" + row['term'] + "} "
#     if pd.notna(row['IPA']): # IPA
#       stage += row['IPA'] + " "
#     if pd.notna(row['meaning']): # meaning
#       stage += "`" + row['meaning'] + "' "
#     if pd.notna(row['literal']): # literal meaning
#       stage += "[" + row['literal'] + "] "
#     stage = re.sub(' $', '', stage)
#     stage += ", "
#     if pd.notna(row['explanation']): # explanation
#       stage += row['explanation'] + " "
#     if pd.notna(row['remark']): # remark
#       stage += "(" + row['remark'] + ") "
#     stage = re.sub(',? ?$', '', stage)

#     if pd.notna(row['date']): # dates
#       stage += ", "
#       row['date'] = re.sub('a(?=\d)', 'a. ', row['date'])
#       row['date'] = re.sub('c(?=\d)', 'ca. ', row['date'])
#       if re.match('^-\d\d?$', row['date']): # if is a century BC
#         row['date'] = re.sub("-", "", row['date'])
#         date = "\\nth{" + row['date'] + "} c. \BC{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \BC{}" # ROMAN NUMERALS
#       elif re.match('^\d\d?$', row['date']): # if is a century AD
#         date = "\\nth{" + row['date'] + "} c. \AD{}" # ARAB NUMERALS
#         # date = " " + roman(row['date']) + " \AD{}" # ROMAN NUMERALS
#       else:
#         date = row['date'] + " " # if it's a year
#       stage += date
#       stage = re.sub(',? ?$', '', stage)
#     if pd.notna(row['cognates']): # cognates
#       stage += "; cf. cognates " + row['cognates'] + " "
#     if pd.notna(row['derivates']): # cognates
#       if pd.notna(row['cognates']):
#         stage = re.sub(' $', '', stage)
#         stage += "; " + row['derivates'] + " "
#       else:
#         stage = re.sub(' $', '', stage)
#         stage += "; cf. " + row['derivates'] + " "
#     stage = re.sub(',? ?$', '', stage)
#     # stage = re.sub('cf\..*?(cf\.)', '', stage)

#   # Final touches
#     if row['doubt'] == 'yes':
#       stage = re.sub(r'<', '<\\\\textss{?}', stage) # ???
#     if row['complex'] == '+':
#       stage = re.sub('<', '', stage)
#     if row['complex'] == 'or from':
#       stage = re.sub('<', '', stage)
#     content += stage + nl

#   # Sources
#     source=""
#     if pd.notna(row['source zotero']):
#       source = row['source zotero']
#       # print(source)
#       if '{' in source:
#         source = "s" + row['source zotero'].lower()
#         # print(source)
#       else:
#         source = "{" + row['source zotero'].lower() + "}"
#         # print(source)
#       if pd.notna(row['source page']):
#         source = "[" + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#         # print(source)
#         if row['source page'].isalpha() == True:
#           source = "[s.v. " + str(row['source page']) + "]{" + row['source zotero'].lower() + "}"
#           # print(source)
#       source = "\\textcite" + source
#     # print(source)
#     source_list.append(source)

#   # clear duplicates from sources:
#   # print("SL1: ", source_list)
#   # source_set = sorted(set(source_list), key=source_list.index)
#   source_set = set(source_list)
#   # print("SS2: ", source_set)
#   source_list2 = list(source_set)
#   # print("S3: ", source_list2)
#   sources_unduplicated = '; '.join(source_list2)
#   # print("S4: ", sources_unduplicated)
#   # test for duplicates
#   newlist = [] # empty list to hold unique elements from the list
#   duplist = [] # empty list to hold the duplicate elements from the list
#   for i in source_list:
#       if i not in newlist:
#           newlist.append(i)
#       else:
#           duplist.append(i) # this method catches the first duplicate entries, and appends them to the list
#   # The next stage is to print the duplicate entries, and the unique entries
#   # print("List of duplicates", duplist)
#   # print("Unique Item List", newlist) 
#   if len(duplist) > 0:
#     # print("UNDUPL")
#     sources = sources_unduplicated
#   else:
#     # print("ORI")
#     sources =  '; '.join(source_list)
#   # print("S5: ", sources)

#   sources =  '; '.join(source_list)

#   # Cleaning
#   sources = re.sub("; $", "", sources)
#   sources = re.sub("^; ", "", sources)
#   sources = re.sub("(; )+", "; ", sources)
#   sources = "\\footnote{" + sources + "}\n"

#   content = re.sub("\n$", "", content)
#   content = re.sub(r"^< ", "", content) # delete the first <
#   content = re.sub(r"\n,", ",", content)
#   content = re.sub(r" nan ", " ", content)
#   content = re.sub("(<\.\n?)+$", "", content)

#   content += sources

#   env_begin = r"\begin{etymology}" + "\label{ety:" + df_local['id'].iloc[0] + "}" + nl 
#   env_end = r"\end{etymology}"

#   box = env_begin + content + env_end
#   box = re.sub(r"\u200e", "", box) #removes right to left mark

#   # Save the spicebox as a standalone tex file
#   filename = re.sub(" ", "_", key)
#   filename = filename.lower()
#   f = open(path_out_tex + "{}.tex".format("etymbox_" + filename), "w", encoding='utf-8')  
#   f.write(box)
#   f.close()
#   print("Etymology-box '" + str(key) + "' as a tex file was created.")

#   return box

# etymbox("tester")

## Etymology box for Markdown

In [12]:
def etymbox(key):

  # The following code will create a etymology box environment for the key, to be used in Markdown
  print("Started the generation of '" + key + "' as etymbox...")

  # Select word
  df_local = etymologies[key]
  # df_local.fillna('', inplace=True)

  # Skipping those marked
  df_local = df_local[df_local['skip'] != 'yes']
  df_local.reset_index(inplace=True, drop=True)

  # # Replace empty cells with NaNs
  # df_local.replace(r'^\s*$', np.nan, regex=True, inplace=True)
  # df_local.replace(r'^nan$', np.nan, regex=True, inplace=True)

  # Initialize
  content = ""
  sources = ""
  source_pages = ""

  # Iterate through a word's etymology dataframe (stage by stage = row by row)
  for index, row in df_local.iterrows():
    # Initialize stage, add an '<' if its not the first stage
    if index == 0:
      stage = ""
    else:
      stage = "< "
    # If there are complex relations, add them (e.g., partly, and, or)
    if pd.notna(row['complex']):
      stage += row['complex'] + " "
    # Add language (in bold)
    if pd.notna(row['language']):
      stage += "**" + row['language'] + "** "
    # Add the term with native script (if exists)
    if pd.notna(row['script']):
      stage += row['script'] + " "
    # Add the term with transcription (in italics)
    if pd.notna(row['term']):
      stage += "*" + row['term'] + "* "
    # Add /IPA/
    if pd.notna(row['IPA']):
      stage += "/" + row['IPA'] + "/ "
    # Add 'meaning', gloss
    if pd.notna(row['meaning']):
      stage += "'" + row['meaning'] + "' "
    # Add the [literal meaning] if there is one
    if pd.notna(row['literal']):
      stage += "[" + row['literal'] + "] "
    # Clear ending
    stage = re.sub(' $', '', stage)
    # Add explanation
    if pd.notna(row['explanation']):
      stage += ", " + row['explanation'] + " "
    # Add (remark)
    if pd.notna(row['remark']):
      stage += " (" + row['remark'] + ") "
    # Clear ending
    stage = re.sub(" +", " ", stage)
    stage = re.sub(",? ?$", "", stage)

    # Add date, if there is a date
    if pd.notna(row['date']):
      # If it's a year
      date = row['date']

    # Add century if there is no date
    if pd.notna(row['century']) and pd.isna(row['date']):
      
      # If it's a century BC
      if re.match('^-\d\d?\??$', row['century']):
        # Remove dash
        row['century'] = re.sub("-", "", row['century'])
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date BC
          # date = roman(int(row['century'])) + " BC?" # ROMAN NUMERALS
          date = row['century'] + " c. BC?" # ARAB NUMERALS
        else:
          # date = roman(int(row['century'])) + " BC" # ROMAN NUMERALS
          date = row['century'] + " c. BC" # ARAB NUMERALS

      # If it is a century AD
      elif re.match('^\d\d?\??$', row['century']):
        # If there is question mark
        if re.match('\d\?', row['century']):
          # Remove the question mark
          row['century'] = re.sub("\?", "", row['century'])
          # Date AD
          # date = "AD " + roman(int(row['century'])) + "?" # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS
        else:
          # date = "AD " + roman(int(row['century'])) # ROMAN NUMERALS
          date = "AD " + row['century'] + " c." # ARAB NUMERALS

      # Add date to stage
      stage += ", " + date
      
    # Clear ending
    stage = re.sub(',? ?$', '', stage)

    # If both cognates and derivates
    if pd.notna(row['cognates']) and pd.notna(row['derivates']):
      stage += "; cf. cognates " + row['cognates'] + "; derivates " + row['derivates'] + " "
    # If cognates only
    if pd.notna(row['cognates']) and pd.isna(row['derivates']):
      stage += "; cf. cognates " + row['cognates'] + " "
    # If derivates only
    if pd.notna(row['derivates']) and pd.isna(row['cognates']):
      stage += "; cf. derivates " + row['derivates'] + " "
    
    # Clear ending
    stage = re.sub(';?,? ?$', '', stage)

    # If stage is doubtful, use '<?'
    if row['doubt'] == 'yes':
      stage = re.sub('<', '<\?', stage)
    # If stage is "complex", remove '<'
    if pd.notna(row['complex']):
      stage = re.sub('<', '', stage)

    # # Sources (at each stage) A
    # source = ""
    # # If there is source (zotero), add
    # if pd.notna(row['source']):
    #   source = '\"' + row['source'] + '\"'
    #   # If there is page, add
    #   if pd.notna(row['source page']):
    #     source = '\"' + str(row['source'].lower()) + '\" \"' + str(row['source page']) + '\"'
    # # Add the Hugo shortcode syntax 
    # source = r' {{< cite ' + source + r' >}}'
    # # Create content
    # content += stage + source + "\n"

    # Sources (once in the end, removing duplicates) B
    # If there is source (zotero), add
    if pd.notna(row['source']):
      sources += row['source'] + ";"
      # If there is page, add
      if pd.notna(row['source page']):
        source_pages += str(row['source page']) + ";"
      else:
        source_pages += ";"
    # Create content
    content += stage + "\n"

  # If using version B of sources
  # Clean ending
  sources = re.sub(';?$', '', sources)
  source_pages = re.sub(';?$', '', source_pages)
  # Add the Hugo-cite shortcode syntax 
  source = r' {{< cite "' + sources + r'" "' + source_pages + r'" >}}'
  # Add source to content
  content += source



  # Cleaning
  box = content
  box = re.sub(r"\u200e", "", box) # Removes right-to-left mark

  # # Save the spicebox as a standalone markdown file (if ever needed)
  # filename = re.sub(" ", "_", key)
  # filename = filename.lower()
  # f = open(path_out_md + "{}.md".format("etymbox_" + filename), "w", encoding='utf-8')  
  # f.write(box)
  # f.close()
  # print("Etymology-box '" + str(key) + "' as a md file was created.")

  return box

etymbox("tester")

Started the generation of 'tester' as etymbox...


'**Language A** тест *test* /tɛst/ \'meaning1\' [literal1], explanation1 (remark1); cf. cognates cognates1; derivates derivates1\n< **Language B** тестер *tester* /ˈtɛstə/ \'meaning2\' [literal2], explanation2, AD 12 c.; cf. cognates cognates2\n< **Language C** тестинг *testing* /ˈtɛstɪŋ/ \'meaning3\' [literal3] (remark3), 9 c. BC; cf. derivates derivates3\n< **Language D** тесте *teste* /ˈaltə/ \'meaning4\' [literal4], explanation4 (remark4); cf. cognates cognates4; derivates derivates4\n {{< cite "oed;wehr_dictionary_1976;wehr_dictionary_1976;liddell_greekenglish_1940;wehr_dictionary_1976;lewis_latin_1879;liddell_greekenglish_1940" "1;2-3;4;5;6;;7" >}}'

In [13]:
# Creating a dictionary of etymologies
dictionary_of_etymologies = {}
# Loop
for key in list_of_etymologies:
    box = (etymbox(key))
    # box = r'{{% notice style="primary" title="Pirates" icon="skull-crossbones" %}}' + "\n" + text + "\n" + r"{{% /notice %}}" + "\n\n"
    dictionary_update = {key: box}
    dictionary_of_etymologies.update(dictionary_update)
print('Done.')


Started the generation of 'tester' as etymbox...
Started the generation of 'allspice' as etymbox...
Started the generation of 'fulful ifranji' as etymbox...
Started the generation of 'duoxiangguo' as etymbox...
Started the generation of 'pimento' as etymbox...
Started the generation of 'anise' as etymbox...
Started the generation of 'anisun' as etymbox...
Started the generation of 'huiqin' as etymbox...
Started the generation of 'asafoetida' as etymbox...
Started the generation of 'hing' as etymbox...
Started the generation of 'hiltit' as etymbox...
Started the generation of 'anjudan' as etymbox...
Started the generation of 'awei' as etymbox...
Started the generation of 'xingqu' as etymbox...
Started the generation of 'caraway' as etymbox...
Started the generation of 'karawiya' as etymbox...
Started the generation of 'geluzi' as etymbox...
Started the generation of 'cardamom' as etymbox...
Started the generation of 'amomum' as etymbox...
Started the generation of 'hal' as etymbox...
St

In [14]:
# Check
dictionary_of_etymologies['allspice']

'**English** *allspice*, from *all* + *spice*; after the flavor profile that resembles the combined aroma of cloves, nutmeg, cinnamon, and black pepper\n {{< cite "oed" "allspice" >}}'

# Website Generation

## Create a Spice Page

In [15]:
def spicepage(key):
    '''
    This cell generates website pages from the spice datasets and writes them out to a markdown file.
    '''

    pd.options.mode.copy_on_write = True # to avoid SettingWithCopyWarning, https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
    
    # Dataframe of current item
    print("Working on", key)
    df_local = df_spices.loc[df_spices['id'] == key]

    # Reset index
    df_local.reset_index(drop=True, inplace=True)

    ###########################################################################
    # The Spice ###############################################################
    ###########################################################################
    
    # Create description
    description = df_local['description'][0].capitalize()

    # Create "related"
    if pd.notna(df_local['related'].iloc[0]):
        related = ", related to " + str(df_local['related'].iloc[0])
    else:
        related = ""

    # Create "also known as..."
    if pd.notna(df_local['en alt'].iloc[0]):
        aka = ", also known as " + str(df_local['en alt'].iloc[0])
    else:
        aka = ""

    description = description + aka + related + "."

    # Extract categories
    category = df_local['category'][0]
    if ";" in df_local['category'][0]:
        category_list = category.split("; ")
    else:
        category_list = "['" + category + "']"

    # Extract tags
    tag = df_local['tag'][0]
    if ";" in df_local['tag'][0]:
        tag_list = tag.split(";")
    else:
        tag_list = "['" + tag + "']"

    # Preamble (US timezone can make pages not appear in "future timezones")
    preamble = '+++\ntitle = "' + key.title() + '"\nauthor = "Gabor Parti"\ndate = "' + str(date.today()) + '"\ndescription = "' + description + '"\nweight = 10\n# draft = "true"\n# hidden = "true"\n# plotly = true\ncategories = ' + str(category_list)  + "\ntags = " + str(tag_list) + '\nbibFile = "static/files/bibliography.json"\n+++\n\n'

    # Illustration
    illustration_source = 'Illustration of *' + df_local['species'][0] + '* ' + df_local['species by'][0] + " from " + df_local['ill source'][0] + ", " + str(df_local['ill page'][0]) + "."

    illustration = r'![' + 'Illustration of *' + df_local['species'][0] + '* ' + df_local['species by'][0] + '](/images/illustrations/' + re.sub(" ", "_", key) + '.png?width=25vw "' + illustration_source + '")' + '\n'

    illustration = illustration + "\n>" + illustration_source + "\n\n"

    # Overview 
    overview_head = "## Overview\n\n"
    # Merge species name
    df_local['species name'] = "*" + df_local['species'] + "* " + df_local['species by']
    # Set link
    df_local['botanical database'] = "[POWO](" + df_local['powo'] + ")"
    # Prepare overview tables
    df_overview = df_local[['species name', 'family', 'part used', 'macroarea', 'region of origin', 'cultivation', 'color', 'botanical database']]
    # Transpose table
    df_overview = df_overview.T
    # Reset index
    df_overview.reset_index(inplace=True)
    # Rename columns
    df_overview.columns = ['id', key]
    # Prepare data to create markdown table
    data = df_overview.to_dict(orient='records')
    # Create markdown table
    overview_mdt = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    # Table
    overview = overview_head + overview_mdt + "\n\n"

    # Intro
    category = re.sub('; ', ' and ', category)
    tag = re.sub('; ', ' and ', tag)
    part = df_local['part used'][0]
    part = re.sub('; ', ' and ', part)
    intro = key.title() + " (" + str(df_local['species name'][0]) + ")" + " is a " + tag + " " + category + " from the *" + str(df_local['family'][0]) + "* family,[^powo] originating in the region(s) of " + str(df_local['region of origin'][0]) + ".[^van_wyk_culinary_2014] " + "It is used for its " + part + ", primarily for " + str(df_local['major uses'][0]) + ". Its aroma is described as " + str(df_local['taste/smell'][0]) + ", with a heat index of " + str(df_local['heat'][0]) + ".[^ucla_medicinal_2002]" + "\n\n"
    
    # Create references
    references = "[^powo]: POWO. (2022). Plants of the World Online (Botanical Database). Facilitated by the Royal Botanic Gardens, Kew. http://www.plantsoftheworldonline.org/\n[^van_wyk_culinary_2014]: van Wyk, B.-E. (2014). Culinary Herbs and Spices of the World. University of Chicago Press, joint publication with the Royal Botanic Gardens, Kew. https://doi.org/10.7208/chicago/9780226091839.001.0001\n[^ucla_medicinal_2002]: Medicinal Spices Exhibit. (2002). UCLA Biomedical Library: History & Special Collections. https://unitproj.library.ucla.edu/biomed/spice/index.cfm?spicefilename=taste.txt&itemsuppress=yes&displayswitch=0\n\n"
    # references = ""

    # # Quick names
    # if type(df_local['wn_fra'][0]) == str:
    #     fra = df_local['wn_fra'][0]
    #     fra = re.sub("[\[\]]", "", fra)
    #     fra = re.sub("\_", " ", fra)
    #     df_local['French'] = fra
    # else:
    #     df_local['French'] = ""

    df_quick_names = df_local[['Hungarian', 'Arabic', 'Chinese', ]] # 'French' 
    # df_names = df_names.T
    # df_names.reset_index(inplace=True)
    # df_names.columns = ['language', 'name(s)']
    data = df_quick_names.to_dict(orient='records')
    quick_names_mdt = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    quick_names = quick_names_mdt + "\n\n"

    ############################################
    ######## The Nomenclature ########

    # Dataframe of current item 
    df_names_local = df_names.loc[df_names['id'] == key]
    # Reset index
    df_names_local.reset_index(drop=True, inplace=True)
    # Names
    names_head = "***\n\n## Nomenclature\n\n"
    # Heads
    names_head_en = "### English\n\n"
    names_head_ar = "### Arabic\n\n"
    names_head_zh = "### Chinese\n\n"
    
    # Language by language
    language = "English"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['term', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_en = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    language = "Arabic"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['script', 'term', 'literal', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_ar = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()

    language = "Chinese"
    df = df_names_local.loc[df_names_local['language'] == language]
    df = df[['script', 'term', 'literal', 'source human']]
    df = df.rename(columns={'source human': 'source'})
    data = df.to_dict(orient='records')
    names_mdt_zh = markdown_table(data).set_params(row_sep = 'markdown', quote = False).get_markdown()
    names = names_head + names_head_en + names_mdt_en + "\n\n" + names_head_ar + names_mdt_ar + "\n\n" + names_head_zh + names_mdt_zh + "\n\n"
    names = ""

    ######## Extract etymologies ########
    wordlist = df_local['words'][0].split("; ")
    etymologies = ""
    for word in wordlist:
        etymologies += dictionary_of_etymologies[word]
    etymologies = "## Etymologies\n\n" + etymologies
    etymologies = ""

    # Plotly file
    jsons = ""
    # key_ = re.sub(" ", "_", key)
    # jsons = r'{{< load-plotly >}}' + '\n' + r'{{< plotly json="/plotly/diffusion_name_' + key_ + r'.json" height="300px" >}}' + '\n\n'

    # Plotly files for all words (some missing)
    # jsons = ""
    # for word in wordlist:
    #     word_ = re.sub(" ", "_", word)
    #     json = r'{{< load-plotly >}}' + '\n' + r'{{< plotly json="/plotly/diffusion_name_' + word_ + r'.json" height="300px" >}}' + '\n\n'
    #     jsons += json
    # jsons = "## Etymology maps\n\n" + jsons
    
    # # Manuscripts (handwritten spice pages embedded into the generated one)
    # for key in list_of_manuscripts:
    # filename = re.sub(" ", "_", key)
    # with open(path + filename + ".md", 'r', encoding="utf8") as md:
    #     lines = md.readlines()
    #     text = "".join(lines)
    # text = re.sub('\n',' ',text)

    ######## Assemble page ########
    page = preamble + illustration + intro + quick_names + overview + etymologies + jsons + names + references
    
    # Write markdown file
    filename = re.sub(" ", "_", key)
    with open(path_out_md + filename + '_gen.md', 'w', encoding='utf-8') as f:
        f.write(page)
    return

In [16]:
# Loop through all spices
for key in list_of_spices:
    spicepage(key)
print("Done.")

Working on Sichuan pepper
Working on allspice
Working on anise
Working on asafoetida
Working on caraway
Working on cardamom
Working on cassia
Working on chile
Working on cinnamon
Working on clove
Working on coriander
Working on cumin
Working on dill seeds


TypeError: can only concatenate str (not "float") to str

In [None]:
# # Move files to the website folder
# move_dir(path_out_md, website_md, "*.md")

## Merge autogenerated files with manuscripts

In [None]:
def build_page(key):
    filename = re.sub(" ", "_", key)
    filepath = website_md + '/manuscripts/' + filename + '_ms.md'
    if os.path.isfile(filepath) == True:
        # Read generated files
        with open(path_out_md + filename + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Read manuscript files    
        with open(website_md + '/manuscripts/' + filename + '_ms.md', 'r', encoding='utf-8') as f:
            manuscript = f.read()
        # Assemple    
        page = generated + "***\n\n" + manuscript
        # Write out page file
        with open(website_md + filename + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    else:
        # Read generated files
        with open(path_out_md + filename + '_gen.md', 'r', encoding='utf-8') as f:
            generated = f.read()
        # Assemple    
        page = generated
        # Write out page file
        with open(website_md + filename + '.md', 'w', encoding='utf-8') as f:
            f.write(page)
    return


In [None]:
for key in list_of_spices:
    build_page(key)

In [17]:
# ...measure time
end_time = datetime.now()
print("All done at " + str(end_time) + ".")
print('Duration: {}'.format(end_time - start_time))

All done at 2023-09-27 14:45:06.155522.
Duration: 0:00:03.518861


# Maps

In [18]:
df = df_spices.copy()

In [19]:
# # Basic example with Plotly Go
# # Create figure data
# fig = go.Figure(data=go.Scattergeo(
#         lon = df['lon'],
#         lat = df['lat'],
#         text = df['id'],
#         mode = 'markers',
#         # marker_color = df['cnt'],
#         ))

# # Update layout
# fig.update_layout(
#         title = 'Title',
#         geo_scope='world',
#         template = 'plotly_dark'
#     )

# # Show figure
# fig.show()

### Settings for fancy maps

In [20]:
# Visual variables for map (dark mode)

font_size = 14
font_color = "white"
font_family = "Sans-Serif"
marker_symbol= 'circle'
marker_size = 14
max_marker_size = 32
edge_color = transparent
edge_size = 1
opacity = 0.7
line_width = 4
water = nord0
grid_color = nord1
land = nord2
lines = nord2
copyright_color = nord3
background_color = transparent
legend_background_color = quarter_transparent
color_scheme = prism

In [21]:
# Orthographic globe layout
ortho_traces = dict(
    textposition = 'top right', # middle left, bottom center, etc.
    textfont = dict(size=font_size, color=font_color, family=font_family),
    hovertemplate=
        "<b>%{text}</b><br><br>" +
        "Species: <i>%{customdata[0]}</i><br>" +
        "Family: <i>%{customdata[1]}</i><br>" +
        "Region of origin: %{customdata[2]}<br>" +
        "Arabic: %{customdata[3]} <i>%{customdata[4]}</i><br>" +
        "Chinese: %{customdata[5]} <i>%{customdata[6]}</i><br>" +
        # "Spreadability: %{customdata[7]:.2f}<br>" +
        "<extra></extra>",
    marker = dict(
        symbol = marker_symbol,
        size = marker_size,
        line = dict(
            color=edge_color,
            width=edge_size
        )
    )
)

ortho_layout = go.Layout(
    paper_bgcolor=background_color,
    plot_bgcolor=background_color,
    geo = dict(
        resolution=110, # 50 is large; 110 is small
        scope='world', # 'world', 'asia'
        projection_type = 'orthographic', # orthographic, natural earth
        projection_scale = 1,
        projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0},
        bgcolor=background_color,
        showcoastlines=True, coastlinewidth = 1, coastlinecolor = lines,
        showcountries=False, countrywidth = 1, countrycolor = lines, 
        showframe=True, framewidth = 1, framecolor = lines, 
        showlakes=True, lakecolor = water,
        showland=True, landcolor = land, 
        showocean=True, oceancolor = water,
        showrivers=True, riverwidth = 1, rivercolor = water,
        showsubunits=False, subunitwidth = 1, subunitcolor = lines, 
        lonaxis = dict(showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color),
        lataxis = dict (showgrid = True, gridwidth = 0.5, dtick = 10, gridcolor=grid_color)),
    showlegend = True,
    legend=dict(x=0, y=0, xanchor="left", yanchor="bottom", bgcolor=legend_background_color,  
                font=dict(color=font_color, size=font_size, family=font_family), 
                title_font=dict(color=font_color, size=font_size+2, family=font_family),
                traceorder = 'normal', orientation="v"),
    title=dict(x=0.5, y=0.99, xanchor='center', yanchor='top', text='',
               font=dict(color=font_color, size=font_size+6, family=font_family)),
    margin={"r":0,"t":0,"l":0,"b":0},
    hoverlabel=dict(#bgcolor="white", 
                    font_size=font_size, 
                    font_family=font_family),
    )

# "Document size" for pdfs
document_size = dict(width = 600, height=600)

# Copyright
cr = dict(
    name="copyright",
    text="© Gábor Parti, 2023",
    font=dict(color=copyright_color, size=8, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [cr]) # to call

## Info
info = dict(
    name="info",
    text="Click on a material to navigate to its corresponding page!",
    font=dict(color=font_color, size=font_size, family=font_family),
    opacity=0.9,
    xref="paper",
    yref="paper",
    x=0.5,
    y=0.05,
    # xanchor="right", 
    # yanchor="bottom", 
    # align="center",
    showarrow=False,
)
# fig.update_layout(annotations = [info]) # to call

# Adding layout images
logo = dict(
    source="https://upload.wikimedia.org/wikipedia/en/thumb/9/9e/PolyU_Logo_with_wordmark.svg/1024px-PolyU_Logo_with_wordmark.svg.png",
    sizex=0.15, sizey=0.15,
    x=1, y=0, 
    xanchor="right", 
    yanchor="bottom", 
)
# fig.add_layout_image(logo) # to call

## Plot

In [22]:
# Set size
df['size'] = 1

# Create figure data
data = px.scatter_geo(df,
    lat='lat', 
    lon='lon',
    text='id',
    color='family',
    color_discrete_sequence=color_scheme,
    size_max = max_marker_size,
    size = 'size',
    opacity = opacity,
    hover_name='id',
    hover_data={'species':True, 'family':True, 'region of origin':True, 'Arabic':True, 'ar transliteration':True, 'Chinese':True, 'pinyin':True, 'lon':False, 'lat':False, 'size':False}, #'spreadability':':.2f', 
    labels={"group": "category"}
    )

# Save figure data
fig = data

###################################################
# Interactive visualization (HTML/JSON) for the web

# Call the orthographic traces and layout settings from above
fig.update_traces(ortho_traces)
fig.update_layout(ortho_layout)
# fig.update_layout(title_text = "Title")

# Add copyrigth
fig.update_layout(annotations=[cr])

# Show figure
fig.show()

# Write interactive visualization (HTML/JSON) for the web
filename = "test"
# fig.write_html(path_out_html + filename + ".html")
fig.write_json(path_out_json + filename + ".json", validate=True, pretty=True)

###################################################
# Image (PNG/PDF) for documents

# Call figure data
# fig = data

# Call the orthographic traces and layout settings from above
# fig.update_traces(ortho_traces)
# fig.update_layout(ortho_layout)
# fig.update_layout(geo=dict(projection_rotation = {'lat': 20, 'lon': 80, 'roll': 0}))
# fig.update_layout(document_size)

# Show figure
# fig.show()

# filename = "test"
# fig.write_image(path_out_png + filename + ".png", scale=3)
# fig.write_image(filename + ".pdf", engine="kaleido")


In [23]:
# Move files to the website folder
move_dir(path_out_json, website_json, "*.json")

# Note

In [24]:
df_airports = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
df_airports.head()


Unnamed: 0,iata,airport,city,state,country,lat,long,cnt
0,ORD,Chicago O'Hare International,Chicago,IL,USA,41.979595,-87.904464,25129
1,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944,21925
2,DFW,Dallas-Fort Worth International,Dallas-Fort Worth,TX,USA,32.895951,-97.0372,20662
3,PHX,Phoenix Sky Harbor International,Phoenix,AZ,USA,33.434167,-112.008056,17290
4,DEN,Denver Intl,Denver,CO,USA,39.858408,-104.667002,13781


In [31]:
df_flight_paths = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_aa_flight_paths.csv')
df_flight_paths.head()
df_flight_paths

Unnamed: 0,start_lat,start_lon,end_lat,end_lon,airline,airport1,airport2,cnt
0,32.895951,-97.037200,35.040222,-106.609194,AA,DFW,ABQ,444
1,41.979595,-87.904464,30.194533,-97.669872,AA,ORD,AUS,166
2,32.895951,-97.037200,41.938874,-72.683228,AA,DFW,BDL,162
3,18.439417,-66.001833,41.938874,-72.683228,AA,SJU,BDL,56
4,32.895951,-97.037200,33.562943,-86.753550,AA,DFW,BHM,168
...,...,...,...,...,...,...,...,...
173,25.793250,-80.290556,27.975472,-82.533250,AA,MIA,TPA,298
174,41.979595,-87.904464,27.975472,-82.533250,AA,ORD,TPA,150
175,18.439417,-66.001833,27.975472,-82.533250,AA,SJU,TPA,56
176,32.895951,-97.037200,36.198372,-95.888242,AA,DFW,TUL,390


In [35]:
fig = go.Figure()

fig.add_trace(go.Scattergeo(
    locationmode = 'USA-states',
    lon = df_airports['long'],
    lat = df_airports['lat'],
    hoverinfo = 'text',
    text = df_airports['airport'],
    mode = 'markers',
    marker = dict(
        size = 3,
        color = 'rgb(255, 0, 0)',
        line = dict(
            width = 3,
            color = 'rgba(68, 68, 68, 0)'
        )
    )))

# Initialize empty lists to contain latitudes and longitudes
lons = []
lats = []

# 
lons = np.empty(3 * len(df_flight_paths))
lons[::3] = df_flight_paths['start_lon']
lons[1::3] = df_flight_paths['end_lon']
lons[2::3] = None
lats = np.empty(3 * len(df_flight_paths))
lats[::3] = df_flight_paths['start_lat']
lats[1::3] = df_flight_paths['end_lat']
lats[2::3] = None


In [36]:
lons

array([ -97.0372    , -106.6091944 ,           nan,  -87.90446417,
        -97.66987194,           nan,  -97.0372    ,  -72.68322833,
                 nan,  -66.00183333,  -72.68322833,           nan,
        -97.0372    ,  -86.75354972,           nan,  -80.29055556,
        -86.67818222,           nan,  -97.0372    ,  -71.00517917,
                 nan,  -80.29055556,  -71.00517917,           nan,
        -87.90446417,  -71.00517917,           nan,  -66.00183333,
        -71.00517917,           nan,  -64.97336111,  -71.00517917,
                 nan,  -80.29055556,  -76.66819833,           nan,
        -66.00183333,  -76.66819833,           nan,  -97.0372    ,
        -80.94312583,           nan,  -97.0372    , -104.70025   ,
                 nan,  -97.0372    ,  -84.219375  ,           nan,
        -97.0372    ,  -77.03772222,           nan,  -80.29055556,
       -104.6670019 ,           nan,  -87.90446417, -104.6670019 ,
                 nan,  -84.42694444,  -97.0372    ,           

In [37]:
fig.add_trace(
    go.Scattergeo(
        locationmode = 'USA-states',
        lon = lons,
        lat = lats,
        mode = 'lines',
        line = dict(width = 1,color = 'red'),
        opacity = 0.5
    )
)

fig.update_layout(
    title_text = 'Feb. 2011 American Airline flight paths<br>(Hover for airport names)',
    showlegend = False,
    geo = go.layout.Geo(
        scope = 'north america',
        projection_type = 'azimuthal equal area',
        showland = True,
        landcolor = 'rgb(243, 243, 243)',
        countrycolor = 'rgb(204, 204, 204)',
    ),
    height=700,
)

fig.show()


In [38]:
import plotly.express as px
df = px.data.gapminder().query("year == 2007")
df


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
11,Afghanistan,Asia,2007,43.828,31889923,974.580338,AFG,4
23,Albania,Europe,2007,76.423,3600523,5937.029526,ALB,8
35,Algeria,Africa,2007,72.301,33333216,6223.367465,DZA,12
47,Angola,Africa,2007,42.731,12420476,4797.231267,AGO,24
59,Argentina,Americas,2007,75.320,40301927,12779.379640,ARG,32
...,...,...,...,...,...,...,...,...
1655,Vietnam,Asia,2007,74.249,85262356,2441.576404,VNM,704
1667,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798,PSE,275
1679,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906,YEM,887
1691,Zambia,Africa,2007,42.384,11746035,1271.211593,ZMB,894


In [39]:
fig = px.line_geo(df, locations="iso_alpha",
                  color="continent", # "continent" is one of the columns of gapminder
                  projection="orthographic")
fig.show()

In [40]:
# A plotly figure with cloropleth map to show certain regions of the world, e.g, Brasil, Argentina, and Chile.

import plotly.graph_objects as go
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

fig = go.Figure(data=go.Choropleth(
    locations = df['CODE'],
    z = df['GDP (BILLIONS)'],
    text = df['COUNTRY'],
    colorscale = 'Blues',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_tickprefix = '$',
    colorbar_title = 'GDP<br>Billions US$',
))

fig.update_layout(
    title_text='2014 Global GDP<br>Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
        CIA World Factbook</a>',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),

    annotations = [dict(
        x=0.55,
        y=0.1,
        xref='paper',
        yref='paper',
        text='Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
            CIA World Factbook</a>',
        showarrow = False
    )]
)

fig.show()
