## Import

Load data.

In [3]:
import os
import pandas as pd
import pinyin
from datetime import date
from opencc import OpenCC
from utils import *

save = True
delim = "|"
timestamp = date.today().strftime("%Y%m%d")
in_path = os.path.join("input", "Retired_Chinese_20230529.csv")
out_path = os.path.join("output", f"retired_chinese_formatted_{timestamp}.csv")
audio_dir = os.path.join("output", "audio")

cols = ["Simp", "Trad", "Meaning", "Simp (Color)", "Trad (Color)", "Pinyin (Color)", "Sound", "Silhouette"]
df = pd.read_csv(in_path, dtype = str).rename(columns = {"Hanzi": "Simp"})
df = df[[col for col in cols if col in df.columns]]
df_edited = df.copy()

print("Shape:", df.shape)
df.head()

Shape: (6274, 2)


Unnamed: 0,Simp,Meaning
0,星期二,Tuesday
1,星期三,Wednesday
2,星期四,Thursday
3,星期六,Saturday
4,星期天,Sunday


# Basic Formatting

In [5]:
simp = df["Simp"]
meaning = df["Meaning"]
simp_edited = simp.apply(strip_html)
meaning_edited = meaning.apply(strip_html)
df_edited["Simp"] = simp_edited
df_edited["Trad"] = simp_edited.apply(OpenCC('s2t').convert)
df_edited["Meaning"] = meaning_edited
df_edited["Silhouette"] = simp_edited.apply(get_silhouette)
df_edited = df_edited[[col for col in cols if col in df_edited.columns]]

num_changes = (meaning_edited != meaning).sum()
print("Number of meaning alterations:", num_changes)
print("Examples:\n")
print(meaning_edited[meaning_edited != meaning].head())

num_changes = (simp_edited != simp).sum()
print("\nNumber of hanzi alterations:", num_changes)
print("Examples:\n")
print(simp_edited[simp_edited != simp].head())

print("\nView table:")
df_edited

Number of meaning alterations: 2138
Examples:

16             (human) body
50     can<br>to be able to
87            teacher (w/老)
96      dish (type of food)
112             man  person
Name: Meaning, dtype: object

Number of hanzi alterations: 105
Examples:

156      请
516     放松
565    办公桌
654     化学
935     其中
Name: Simp, dtype: object

View table:


Unnamed: 0,Simp,Trad,Meaning,Silhouette
0,星期二,星期二,Tuesday,_ _ _
1,星期三,星期三,Wednesday,_ _ _
2,星期四,星期四,Thursday,_ _ _
3,星期六,星期六,Saturday,_ _ _
4,星期天,星期天,Sunday,_ _ _
...,...,...,...,...
6269,停火,停火,ceasefire<br>to cease fire,_ _
6270,休战,休戰,truce<br>armistice,_ _
6271,撤退,撤退,to retreat,_ _
6272,损毁,損毀,"to destroy, to ruin<br>to cause damage",_ _


## Coloring

Convert characters to pinyin. Then add color.

In [6]:
pin_split = df_edited["Simp"].apply(pinyin.get, args = delim).apply(replace_special_char).apply(full_split, args = delim)
simp_split = df_edited["Simp"].apply(delim.join).apply(full_split, args = delim)
trad_split = df_edited["Trad"].apply(delim.join).apply(full_split, args = delim)

df_edited["Simp (Color)"] = pd.concat([pin_split, simp_split], axis = 1).apply(lambda row: add_color_hanzi(row[0], row[1]), axis = 1)
df_edited["Trad (Color)"] = pd.concat([pin_split, trad_split], axis = 1).apply(lambda row: add_color_hanzi(row[0], row[1]), axis = 1)
df_edited["Pinyin (Color)"] = pin_split.apply(simplify_split).apply(tag_split)

df_edited = df_edited[[col for col in cols if col in df_edited.columns]]
df_edited.head()

Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette
0,星期二,星期二,Tuesday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _
1,星期三,星期三,Wednesday,<span class = 'tone1'>星期三</span>,<span class = 'tone1'>星期三</span>,<span class = 'tone1'>xīngqīsān</span>,_ _ _
2,星期四,星期四,Thursday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _
3,星期六,星期六,Saturday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _
4,星期天,星期天,Sunday,<span class = 'tone1'>星期天</span>,<span class = 'tone1'>星期天</span>,<span class = 'tone1'>xīngqītiān</span>,_ _ _


# Sound

In [7]:
df_edited["Sound"] = df_edited["Simp"].apply(lambda s : save_audio(s, audio_dir, save))

errors = df_edited["Sound"][df_edited["Sound"].apply(lambda s : s.startswith("FAILED"))]
print("Errors:\n", errors)
df_edited.head()

Errors:
 1993     FAILED WRITING: 身份
1994     FAILED WRITING: 母语
1995     FAILED WRITING: 华人
1996     FAILED WRITING: 根源
1997    FAILED WRITING: 年轻人
               ...         
6269     FAILED WRITING: 停火
6270     FAILED WRITING: 休战
6271     FAILED WRITING: 撤退
6272     FAILED WRITING: 损毁
6273     FAILED WRITING: 卑微
Name: Sound, Length: 4260, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edited["Sound"] = df_edited["Simp"].apply(lambda s : save_audio(s, audio_dir, save))


Unnamed: 0,Simp,Trad,Meaning,Simp (Color),Trad (Color),Pinyin (Color),Silhouette,Sound
0,星期二,星期二,Tuesday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _,[sound:星期二.mp3]
1,星期三,星期三,Wednesday,<span class = 'tone1'>星期三</span>,<span class = 'tone1'>星期三</span>,<span class = 'tone1'>xīngqīsān</span>,_ _ _,[sound:星期三.mp3]
2,星期四,星期四,Thursday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _,[sound:星期四.mp3]
3,星期六,星期六,Saturday,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>星期</span><span class = '...,<span class = 'tone1'>xīngqī</span><span class...,_ _ _,[sound:星期六.mp3]
4,星期天,星期天,Sunday,<span class = 'tone1'>星期天</span>,<span class = 'tone1'>星期天</span>,<span class = 'tone1'>xīngqītiān</span>,_ _ _,[sound:星期天.mp3]


# Save

In [5]:
if save:
    df_edited.to_csv(out_path, index = False)