## Import

Load data.

In [1]:
import os
import pandas as pd
import pinyin
from datetime import date
from opencc import OpenCC
from utils import *

# --- Run configuration -------------------------------------------------------
save = True    # forwarded to save_audio and gates the final CSV export
delim = "|"    # separator placed between syllables/characters before splitting
timestamp = date.today().strftime("%Y%m%d")

# --- File locations ----------------------------------------------------------
in_path = os.path.join("input", "Active_Chinese_20230503.csv")
out_path = os.path.join("output", f"active_chinese_formatted_{timestamp}.csv")
audio_dir = os.path.join("output", "audio")

# Output column order; only the columns actually present are kept at each stage.
cols = [
    "Simp",
    "Trad",
    "Meaning",
    "Simp (Color)",
    "Trad (Color)",
    "Pinyin (Color)",
    "Sound",
    "Silhouette",
]

# Read every field as text, rename "Hanzi" to "Simp", then keep the known columns in `cols` order.
df = (
    pd.read_csv(in_path, dtype = str)
    .rename(columns = {"Hanzi": "Simp"})
    .pipe(lambda frame: frame[[col for col in cols if col in frame.columns]])
)

# Later cells write into df_edited so that df stays as the untouched input.
df_edited = df.copy()

print("Shape:", df.shape)
df.head()

Shape: (1895, 4)


Unnamed: 0,Simp,Meaning,Sound,Silhouette
0,滑稽,funny (w/ h)<div>comical</div><div>amusing</div>,,
1,不禁,"""can't help (doing something)""",,_ _
2,湖面,surface of a lake,,
3,打翻,<div>to knock over</div>to overturn<div>to ove...,,_ _
4,友谊,friendship<div>companionship</div>,,


## Basic Formatting

In [2]:
# Originals are kept next to their cleaned versions so the two can be compared below.
simp = df["Simp"]
meaning = df["Meaning"]
simp_edited = simp.apply(strip_html)
meaning_edited = meaning.apply(strip_html)

# Write the cleaned text back, derive the traditional form and the silhouette from
# the cleaned simplified text, then restore the canonical column order.
s2t = OpenCC('s2t')
df_edited = df_edited.assign(
    Simp = simp_edited,
    Trad = simp_edited.apply(s2t.convert),
    Meaning = meaning_edited,
    Silhouette = simp_edited.apply(get_silhouette),
)
df_edited = df_edited[[col for col in cols if col in df_edited.columns]]

# Report how many meanings were changed by strip_html, with a few of the originals.
meaning_changed = meaning_edited.ne(meaning)
num_changes = meaning_changed.sum()
print("Number of meaning alterations:", num_changes)
print("Examples:\n")
print(meaning[meaning_changed].head())

# Same report for the hanzi column.
simp_changed = simp_edited.ne(simp)
num_changes = simp_changed.sum()
print("\nNumber of hanzi alterations:", num_changes)
print("Examples:\n")
print(simp[simp_changed].head())

print("\nView table:")
df_edited

Number of meaning alterations: 249
Examples:

0     funny (w/ h)<div>comical</div><div>amusing</div>
3    <div>to knock over</div>to overturn<div>to ove...
4                   friendship<div>companionship</div>
5                               core<div>nucleus</div>
6    to master<div>to be proficient in<br><div>to b...
Name: Meaning, dtype: object

Number of hanzi alterations: 8
Examples:

63     <table><tbody><tr><td>布</td></tr></tbody></table>
218                                      <div>时间观念</div>
282    <table><tbody><tr><td>国庆节</td></tr><tr></tr></...
346    <table><tbody><tr><td>发电厂</td></tr><tr></tr></...
362    <table><tbody><tr><td>温室气体</td></tr><tr></tr><...
Name: Simp, dtype: object

View table:


Unnamed: 0,Simp,Trad,Meaning,Sound,Silhouette
0,滑稽,滑稽,funny (w/ h)<br>comical<br>amusing,,_ _
1,不禁,不禁,"""can't help (doing something)""",,_ _
2,湖面,湖面,surface of a lake,,_ _
3,打翻,打翻,to knock over<br>to overturn<br>to overthrow,,_ _
4,友谊,友誼,friendship<br>companionship,,_ _
...,...,...,...,...,...
1890,撤退,撤退,to retreat,,_ _
1891,损毁,損毀,"to destroy, to ruin<br>to cause damage",,_ _
1892,卑微,卑微,"lowly, humble",,_ _
1893,微,微,"tiny, miniature",,_


## Coloring

Convert characters to pinyin. Then add color.

In [3]:
# `args` must be a tuple. A bare string is unpacked character by character, so
# `args = delim` only worked while `delim` was a single character.
pin_split = (
    df_edited["Simp"]
    .apply(pinyin.get, args = (delim,))
    .apply(replace_special_char)
    .apply(full_split, args = (delim,))
)
# Joining a string with `delim` puts the delimiter between every character.
simp_split = df_edited["Simp"].apply(delim.join).apply(full_split, args = (delim,))
trad_split = df_edited["Trad"].apply(delim.join).apply(full_split, args = (delim,))


def _color_hanzi(hanzi_split):
    """Color each row of `hanzi_split` using the matching row of `pin_split`.

    Rows are paired by position; both Series are derived from `df_edited`, so they
    share the same index and order. Pairing with `zip` avoids `row[0]` / `row[1]`
    on a string-labelled row, which relies on a deprecated positional fallback.

    Returns a Series on `hanzi_split`'s index holding
    `add_color_hanzi(pinyin_row, hanzi_row)` for every row.
    """
    colored = [add_color_hanzi(pin, hanzi) for pin, hanzi in zip(pin_split, hanzi_split)]
    return pd.Series(colored, index = hanzi_split.index, dtype = object)


df_edited["Simp (Color)"] = _color_hanzi(simp_split)
df_edited["Trad (Color)"] = _color_hanzi(trad_split)
df_edited["Pinyin (Color)"] = pin_split.apply(simplify_split).apply(tag_split)

# Restore the canonical column order now that the color columns exist.
df_edited = df_edited[[col for col in cols if col in df_edited.columns]]
df_edited.head()

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Sound,Silhouette
0,滑稽,滑稽,funny (w/ h)<br>comical<br>amusing,<span class = 'tone2'>滑</span><span class = 't...,<span class = 'tone2'>滑</span><span class = 't...,<span class = 'tone2'>huá</span><span class = ...,,_ _
1,不禁,不禁,"""can't help (doing something)""",<span class = 'tone4'>不禁</span>,<span class = 'tone4'>不禁</span>,<span class = 'tone4'>bùjìn</span>,,_ _
2,湖面,湖面,surface of a lake,<span class = 'tone2'>湖</span><span class = 't...,<span class = 'tone2'>湖</span><span class = 't...,<span class = 'tone2'>hú</span><span class = '...,,_ _
3,打翻,打翻,to knock over<br>to overturn<br>to overthrow,<span class = 'tone3'>打</span><span class = 't...,<span class = 'tone3'>打</span><span class = 't...,<span class = 'tone3'>dǎ</span><span class = '...,,_ _
4,友谊,友誼,friendship<br>companionship,<span class = 'tone3'>友</span><span class = 't...,<span class = 'tone3'>友</span><span class = 't...,<span class = 'tone3'>yǒu</span><span class = ...,,_ _


## Sound

In [4]:
# One save_audio call per word; it receives the word, the audio folder and the save flag.
df_edited["Sound"] = df_edited["Simp"].apply(lambda hanzi : save_audio(hanzi, audio_dir, save))

# List the entries whose result text begins with "FAILED".
sound = df_edited["Sound"]
failed = sound.apply(lambda result : result.startswith("FAILED"))
errors = sound[failed]
print("Errors:\n", errors)
df_edited.head()

Errors:
 1653    FAILED WRITING: output\audio\调 (w/ t).mp3
Name: Sound, dtype: object


Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Sound,Silhouette
0,滑稽,滑稽,funny (w/ h)<br>comical<br>amusing,<span class = 'tone2'>滑</span><span class = 't...,<span class = 'tone2'>滑</span><span class = 't...,<span class = 'tone2'>huá</span><span class = ...,output\audio\滑稽.mp3,_ _
1,不禁,不禁,"""can't help (doing something)""",<span class = 'tone4'>不禁</span>,<span class = 'tone4'>不禁</span>,<span class = 'tone4'>bùjìn</span>,output\audio\不禁.mp3,_ _
2,湖面,湖面,surface of a lake,<span class = 'tone2'>湖</span><span class = 't...,<span class = 'tone2'>湖</span><span class = 't...,<span class = 'tone2'>hú</span><span class = '...,output\audio\湖面.mp3,_ _
3,打翻,打翻,to knock over<br>to overturn<br>to overthrow,<span class = 'tone3'>打</span><span class = 't...,<span class = 'tone3'>打</span><span class = 't...,<span class = 'tone3'>dǎ</span><span class = '...,output\audio\打翻.mp3,_ _
4,友谊,友誼,friendship<br>companionship,<span class = 'tone3'>友</span><span class = 't...,<span class = 'tone3'>友</span><span class = 't...,<span class = 'tone3'>yǒu</span><span class = ...,output\audio\友谊.mp3,_ _


## Save

In [5]:
if save:
    # to_csv does not create missing folders, so make sure the destination exists first.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok = True)
    # Export without the index so the file holds only the formatted columns.
    df_edited.to_csv(out_path, index = False)