## Import

Set the run configuration, then load the vocabulary list from the input CSV.

In [1]:
import os
import pandas as pd
from datetime import date
from utils import *
from unicodedata import normalize

# --- Run settings ---
# `save` gates the final CSV export and is also forwarded to save_audio.
save = True
# Separator inserted between pieces before they are handed to full_split.
delim = "|"
timestamp = date.today().strftime("%Y%m%d")

# --- Locations ---
in_path = os.path.join("input", "single_character_20230705.csv")
out_path = os.path.join("output", f"manual_input_{timestamp}.csv")
audio_dir = os.path.join("output", "audio")

# --- Column layout ---
# `cols` is the canonical column order; every later cell re-selects against it.
cols = ["Simp", "Trad", "Meaning", "Simp (Color)", "Trad (Color)", "Pinyin (Color)", "Sound", "Silhouette"]
# Names applied positionally to the CSV's columns, whatever its own header says.
source_header = ["Simp", "Trad", "Pinyin (Color)", "Meaning"]

# --- Load ---
df = pd.read_csv(in_path, dtype = str)
df.columns = source_header
# Keep only the columns that exist so far, arranged in canonical order.
df = df.loc[:, [name for name in cols if name in df.columns]]
# Working copy; `df` itself stays as the untouched reference.
df_edited = df.copy()

print("Shape:", df.shape)
df.head()

Shape: (37, 4)


Unnamed: 0,Simp,Trad,Meaning,Pinyin (Color)
0,硬木,硬木,hardwood,yìng mù
1,遥控器,遙控器,remote control,yáo kòng qì
2,乐福鞋,樂福鞋,loafer shoes (loanword),lè fú xié
3,落地灯,落地燈,floor lamp,luò dì dēng
4,单人沙发,單人沙發,(upholstered) armchair,dān rén shā fā


## Basic Formatting

In [2]:
# Replace every "; " separator in the meaning with an HTML line break.
meaning_text = df["Meaning"].astype(str)
df_edited["Meaning"] = meaning_text.str.replace("; ", "<br>", regex = False)

# Derive the silhouette string from the simplified form via get_silhouette.
simp_text = df["Simp"].astype(str)
df_edited["Silhouette"] = simp_text.map(get_silhouette)

# Re-apply the canonical column order, keeping only columns present so far.
ordered_cols = [name for name in cols if name in df_edited.columns]
df_edited = df_edited[ordered_cols]

print("\nView table:")
df_edited.head()


View table:


Unnamed: 0,Simp,Trad,Meaning,Pinyin (Color),Silhouette
0,硬木,硬木,hardwood,yìng mù,_ _
1,遥控器,遙控器,remote control,yáo kòng qì,_ _ _
2,乐福鞋,樂福鞋,loafer shoes (loanword),lè fú xié,_ _ _
3,落地灯,落地燈,floor lamp,luò dì dēng,_ _ _
4,单人沙发,單人沙發,(upholstered) armchair,dān rén shā fā,_ _ _ _


## Coloring

Split the pinyin and the characters into matching pieces, then wrap them in `<span>` tags whose class encodes the tone.

In [3]:
# Split the pinyin into pieces.
# The source is the untouched `df`, not `df_edited`: this cell overwrites
# df_edited["Pinyin (Color)"] with HTML below, so reading it back on a re-run
# would split markup instead of pinyin. Reading from `df` keeps the cell idempotent.
pin_split = (
    df["Pinyin (Color)"]
        # NFC composes letter + combining tone mark into single code points
        # (presumably so PINYIN_RE sees one character per toned vowel - TODO confirm).
        .apply(lambda x : normalize("NFC", x))
        .apply(PINYIN_RE.split)
        .apply(delim.join)
        # NOTE(review): no delimiter is passed here, so this relies on
        # full_split's default - confirm it matches `delim`.
        .apply(full_split)
)

# Split the hanzi one character at a time.
# `args` must be a tuple: a bare string is unpacked character by character,
# which only happens to work while `delim` is a single character.
simp_split = df_edited["Simp"].apply(delim.join).apply(full_split, args = (delim,))
trad_split = df_edited["Trad"].apply(delim.join).apply(full_split, args = (delim,))


def _color_hanzi_column(hanzi_split):
    """Pair each row's pinyin pieces with its hanzi pieces and color the hanzi.

    Iterating the two Series in lockstep avoids integer indexing (`row[0]`) on a
    string-labelled row, which pandas 2.x deprecates. The original index is
    re-attached so the column assignment stays label-aligned.
    """
    colored = [add_color_hanzi(pin, hanzi) for pin, hanzi in zip(pin_split, hanzi_split)]
    return pd.Series(colored, index = hanzi_split.index, dtype = object)


df_edited["Simp (Color)"] = _color_hanzi_column(simp_split)
df_edited["Trad (Color)"] = _color_hanzi_column(trad_split)
df_edited["Pinyin (Color)"] = pin_split.apply(simplify_split).apply(tag_split)

# Re-apply the canonical column order, keeping only columns present so far.
df_edited = df_edited[[col for col in cols if col in df_edited.columns]]
df_edited.head()

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette
0,硬木,硬木,hardwood,<span class = 'tone4'>硬木</span>,<span class = 'tone4'>硬木</span>,<span class = 'tone4'>yìng mù</span>,_ _
1,遥控器,遙控器,remote control,<span class = 'tone2'>遥</span><span class = 't...,<span class = 'tone2'>遙</span><span class = 't...,<span class = 'tone2'>yáo </span><span class =...,_ _ _
2,乐福鞋,樂福鞋,loafer shoes (loanword),<span class = 'tone4'>乐</span><span class = 't...,<span class = 'tone4'>樂</span><span class = 't...,<span class = 'tone4'>lè </span><span class = ...,_ _ _
3,落地灯,落地燈,floor lamp,<span class = 'tone4'>落地</span><span class = '...,<span class = 'tone4'>落地</span><span class = '...,<span class = 'tone4'>luò dì </span><span clas...,_ _ _
4,单人沙发,單人沙發,(upholstered) armchair,<span class = 'tone1'>单</span><span class = 't...,<span class = 'tone1'>單</span><span class = 't...,<span class = 'tone1'>dān </span><span class =...,_ _ _ _


In [4]:
# Rows whose simplified-form coloring came back as the "(color error)" marker.
df_edited.loc[df_edited["Simp (Color)"].eq("(color error)")]

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette
32,DVD播放机,DVD播放機,DVD player,(color error),(color error),<span class = 'tone5'>DVD </span><span class =...,DVD _ _ _
33,路对面,路對面,across the street<br>the other side of the street,(color error),(color error),<span class = 'tone5'>對面</span>,_ _ _


## Sound

In [5]:
# Call save_audio(simp, audio_dir, save) for every simplified entry and keep
# whatever it returns as the "Sound" field.
df_edited["Sound"] = df_edited["Simp"].apply(save_audio, args = (audio_dir, save))

# Re-apply the canonical column order, keeping only columns present so far.
ordered_cols = [name for name in cols if name in df_edited.columns]
df_edited = df_edited[ordered_cols]

# Collect the entries whose Sound value starts with "FAILED".
failed_mask = [sound.startswith("FAILED") for sound in df_edited["Sound"]]
errors = df_edited.loc[failed_mask, "Sound"]
print("Errors:\n", errors)
df_edited.head()

Errors:
 Series([], Name: Sound, dtype: object)


Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Sound,Silhouette
0,硬木,硬木,hardwood,<span class = 'tone4'>硬木</span>,<span class = 'tone4'>硬木</span>,<span class = 'tone4'>yìng mù</span>,[sound:硬木.mp3],_ _
1,遥控器,遙控器,remote control,<span class = 'tone2'>遥</span><span class = 't...,<span class = 'tone2'>遙</span><span class = 't...,<span class = 'tone2'>yáo </span><span class =...,[sound:遥控器.mp3],_ _ _
2,乐福鞋,樂福鞋,loafer shoes (loanword),<span class = 'tone4'>乐</span><span class = 't...,<span class = 'tone4'>樂</span><span class = 't...,<span class = 'tone4'>lè </span><span class = ...,[sound:乐福鞋.mp3],_ _ _
3,落地灯,落地燈,floor lamp,<span class = 'tone4'>落地</span><span class = '...,<span class = 'tone4'>落地</span><span class = '...,<span class = 'tone4'>luò dì </span><span clas...,[sound:落地灯.mp3],_ _ _
4,单人沙发,單人沙發,(upholstered) armchair,<span class = 'tone1'>单</span><span class = 't...,<span class = 'tone1'>單</span><span class = 't...,<span class = 'tone1'>dān </span><span class =...,[sound:单人沙发.mp3],_ _ _ _


## Save

In [10]:
if save:
    # Create the destination folder on demand; to_csv does not create missing
    # directories and would otherwise fail on a fresh checkout.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok = True)
    # Written without index or header row: the file holds data rows only, in
    # df_edited's current column order.
    df_edited.to_csv(out_path, index = False, header = False)