## Import

Load data.

In [14]:
import os
import pandas as pd
from datetime import date
from utils import *

save = True
delim = "|"
timestamp = date.today().strftime("%Y%m%d")
in_path = os.path.join("input", "pleco_20230529.csv")
out_path = os.path.join("output", f"pleco_{timestamp}.csv")
audio_dir = os.path.join("output", "audio")

cols = ["Simp", "Trad", "Meaning", "Simp (Color)", "Trad (Color)", "Pinyin (Color)", "Sound", "Silhouette"]
df = pd.read_csv(in_path, dtype = str).rename(columns = {"Pinyin": "Pinyin (Color)"})
df = df[[col for col in cols if col in df.columns]]
df_edited = df.copy()

print("Shape:", df.shape)
df.head()

Shape: (109, 4)


Unnamed: 0,Simp,Trad,Meaning,Pinyin (Color)
0,苍白,蒼白,pale<br>pallid,cāngbái
1,细瘦,細瘦,thin<br>scrawny,xìshòu
2,裂缝,裂縫,rift<br>crack<br>fissure<br>estrangement of af...,lièfèng
3,黏性,黏性,stickiness<br>viscosity,niánxìng
4,嗅,嗅,to smell<br>to sniff,xiù


# Basic Formatting

In [15]:
df_edited["Silhouette"] = df["Simp"].apply(get_silhouette)
df_edited = df_edited[[col for col in cols if col in df_edited.columns]]

print("\nView table:")
df_edited


View table:


Unnamed: 0,Simp,Trad,Meaning,Pinyin (Color),Silhouette
0,苍白,蒼白,pale<br>pallid,cāngbái,_ _
1,细瘦,細瘦,thin<br>scrawny,xìshòu,_ _
2,裂缝,裂縫,rift<br>crack<br>fissure<br>estrangement of af...,lièfèng,_ _
3,黏性,黏性,stickiness<br>viscosity,niánxìng,_ _
4,嗅,嗅,to smell<br>to sniff,xiù,_
...,...,...,...,...,...
104,投,投,throw<br>fling<br>hurl,tóu,_
105,遮蔽,遮蔽,to hide from view<br>to screen<br>to obstruct,zhēbì,_ _
106,乌鸦,烏鴉,crow,wūyā,_ _
107,塔楼,塔樓,tower<br>turret,tǎlóu,_ _


## Coloring

Convert characters to pinyin. Then add color.

In [16]:
pin_split = (
    df_edited["Pinyin (Color)"]
        .apply(PINYIN_RE.split)
        .apply(delim.join)
        .apply(full_split)
)
simp_split = df_edited["Simp"].apply(delim.join).apply(full_split, args = delim)
trad_split = df_edited["Trad"].apply(delim.join).apply(full_split, args = delim)

df_edited["Simp (Color)"] = pd.concat([pin_split, simp_split], axis = 1).apply(lambda row: add_color_hanzi(row[0], row[1]), axis = 1)
df_edited["Trad (Color)"] = pd.concat([pin_split, trad_split], axis = 1).apply(lambda row: add_color_hanzi(row[0], row[1]), axis = 1)
df_edited["Pinyin (Color)"] = pin_split.apply(simplify_split).apply(tag_split)

df_edited = df_edited[[col for col in cols if col in df_edited.columns]]
df_edited.head()

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette
0,苍白,蒼白,pale<br>pallid,<span class = 'tone1'>苍</span><span class = 't...,<span class = 'tone1'>蒼</span><span class = 't...,<span class = 'tone1'>cāng</span><span class =...,_ _
1,细瘦,細瘦,thin<br>scrawny,<span class = 'tone4'>细瘦</span>,<span class = 'tone4'>細瘦</span>,<span class = 'tone4'>xìshòu</span>,_ _
2,裂缝,裂縫,rift<br>crack<br>fissure<br>estrangement of af...,<span class = 'tone4'>裂缝</span>,<span class = 'tone4'>裂縫</span>,<span class = 'tone4'>lièfèng</span>,_ _
3,黏性,黏性,stickiness<br>viscosity,<span class = 'tone2'>黏</span><span class = 't...,<span class = 'tone2'>黏</span><span class = 't...,<span class = 'tone2'>nián</span><span class =...,_ _
4,嗅,嗅,to smell<br>to sniff,<span class = 'tone4'>嗅</span>,<span class = 'tone4'>嗅</span>,<span class = 'tone4'>xiù</span>,_


In [10]:
df_edited[df_edited["Simp (Color)"] == "(color error)"]

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette


# Sound

In [17]:
df_edited["Sound"] = df_edited["Simp"].apply(lambda s : save_audio(s, audio_dir, save))

errors = df_edited["Sound"][df_edited["Sound"].apply(lambda s : s.startswith("FAILED"))]
print("Errors:\n", errors)
df_edited.head()

Errors:
 Series([], Name: Sound, dtype: object)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edited["Sound"] = df_edited["Simp"].apply(lambda s : save_audio(s, audio_dir, save))


Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette,Sound
0,苍白,蒼白,pale<br>pallid,<span class = 'tone1'>苍</span><span class = 't...,<span class = 'tone1'>蒼</span><span class = 't...,<span class = 'tone1'>cāng</span><span class =...,_ _,[sound:苍白.mp3]
1,细瘦,細瘦,thin<br>scrawny,<span class = 'tone4'>细瘦</span>,<span class = 'tone4'>細瘦</span>,<span class = 'tone4'>xìshòu</span>,_ _,[sound:细瘦.mp3]
2,裂缝,裂縫,rift<br>crack<br>fissure<br>estrangement of af...,<span class = 'tone4'>裂缝</span>,<span class = 'tone4'>裂縫</span>,<span class = 'tone4'>lièfèng</span>,_ _,[sound:裂缝.mp3]
3,黏性,黏性,stickiness<br>viscosity,<span class = 'tone2'>黏</span><span class = 't...,<span class = 'tone2'>黏</span><span class = 't...,<span class = 'tone2'>nián</span><span class =...,_ _,[sound:黏性.mp3]
4,嗅,嗅,to smell<br>to sniff,<span class = 'tone4'>嗅</span>,<span class = 'tone4'>嗅</span>,<span class = 'tone4'>xiù</span>,_,[sound:嗅.mp3]


# Save

In [18]:
if save:
    df_edited.to_csv(out_path, index = False)