## Import

Load data.

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from util_funcs import *
import opencc

path = "Active_Chinese_20230503.csv"
cols = ["Hanzi", "Meaning", "Reading", "Color", "Mean Word", "Sound", "Traditional", "Silhouette"]
df = pd.read_csv(path)[cols]

print("Shape:", df.shape)
df.head()

Shape: (1895, 8)


Unnamed: 0,Hanzi,Meaning,Reading,Color,Mean Word,Sound,Traditional,Silhouette
0,滑稽,funny (w/ h)<div>comical</div><div>amusing</div>,huájī,滑稽,,,,
1,不禁,"""can't help (doing something)""",bùjīn,不禁,,,,_ _
2,湖面,surface of a lake,húmiàn,湖面,,,,
3,打翻,<div>to knock over</div>to overturn<div>to ove...,dǎfān,打翻,,,,_ _
4,友谊,friendship<div>companionship</div>,yǒuyì,友谊,,,,


## Reading

Convert reading to pinyin. Then add color.

In [12]:
%%time

reading = df["Reading"].astype(str)
reading_edited = (reading
                    # .swifter
                    .apply(BeautifulSoup, features = "html.parser")
                    .apply(lambda x: x.get_text().strip())
                    .apply(PinyinToneConverter().convert_text)
                )
reading_colored = reading_edited.apply(stylize_str)
df["Reading"] = reading_colored

num_edits = sum(reading != reading_edited)
print(f"Number of cells edited: {num_edits} ({num_edits / df.shape[0]:.1%})")
df.head()

Number of cells edited: 170 (9.0%)
CPU times: total: 2min 23s
Wall time: 7min 58s


Unnamed: 0,Hanzi,Meaning,Reading,Color,Mean Word,Sound,Traditional,Silhouette
0,滑稽,funny (w/ h)<div>comical</div><div>amusing</div>,<span class = 'tone2'>huá</span><span class = ...,滑稽,,,,
1,不禁,"""can't help (doing something)""",<span class = 'tone4'>bù</span><span class = '...,不禁,,,,_ _
2,湖面,surface of a lake,<span class = 'tone2'>hú</span><span class = '...,湖面,,,,
3,打翻,<div>to knock over</div>to overturn<div>to ove...,<span class = 'tone3'>dǎ</span><span class = '...,打翻,,,,_ _
4,友谊,friendship<div>companionship</div>,<span class = 'tone3'>yǒu</span><span class = ...,友谊,,,,


In [13]:
df.to_csv("Active_Chinese_reading_edited_colored.csv")

## Meaning

In [15]:
%%time 

meaning = df["Meaning"].astype(str)
meaning_edited = meaning.apply(strip_html)
meaning_edited[meaning_edited != meaning].head()
df["Meaning"] = meaning_edited
df.to_csv("Active_Chinese_meaning_edited_20230527.csv")

CPU times: total: 0 ns
Wall time: 37.5 ms


# Hanzi

In [45]:
hanzi = df["Hanzi"]
hanzi_edited = hanzi.apply(strip_html)
df["Hanzi"] = hanzi_edited

num_changes = (hanzi_edited != hanzi).sum()
print("Number of alterations:", num_changes)

Number of alterations: 8


# Color

In [53]:
%%time

hanzi = df[["Hanzi", "Reading"]]
hanzi["Reading"] = hanzi["Reading"].apply(strip_html)
hanzi["Hanzi"] = hanzi.apply(lambda row: safe_stylize_hanzi(s_pin = row["Reading"], s_char = row["Hanzi"]), axis = 1)
# for i in range(hanzi.shape[0]):
#     row = hanzi.iloc[i]
#     stylize_str(s_pin = row["Reading"], s_char = row["Hanzi"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 3min 40s
Wall time: 11min 55s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [60]:
color = hanzi.rename(columns = {"Hanzi": "Color"})["Color"]
df["Color"] = color
color[color.apply(lambda x : "color error" in x)]

814            白白... (color error)
977               贵族 (color error)
1653        调 (w/ t) (color error)
1697    倒 (4th tone) (color error)
1698    倒 (3rd tone) (color error)
1796        钻 (noun) (color error)
1797          钻 (v.) (color error)
1800    觉 (2nd tone) (color error)
1802    兴 (1st tone) (color error)
1803    兴 (4th tone) (color error)
1805    数 (3rd tone) (color error)
1806     数(4th tone) (color error)
1809         扇(noun) (color error)
1810           扇(v.) (color error)
1839    佣 (4th tone) (color error)
1840    佣 (1st tone) (color error)
Name: Color, dtype: object

In [61]:
df.to_csv("hanzi_meaning_reading_color.csv")

# Silhouette

In [69]:
df["Silhouette"] = df["Hanzi"].apply(get_silhouette)
df.to_csv("hanzi_meaning_reading_color_silhouette.csv")

# Traditional

In [94]:
converter = opencc.OpenCC('s2t.json')
df["Traditional"] = df["Hanzi"].apply(converter.convert)
df.to_csv("hanzi_meaning_reading_color_traditional_silhouette.csv")

# Traditional Colored

In [3]:
df = pd.read_csv("hanzi_meaning_reading_color_traditional_silhouette.csv")
traditional_colored = df[["Traditional", "Reading"]].apply(lambda row: safe_stylize_hanzi(s_pin = row["Reading"], s_char = row["Traditional"]), axis = 1)
traditional_colored.head()

0    <span class = 'tone5'>滑稽</span>
1    <span class = 'tone5'>不禁</span>
2    <span class = 'tone5'>湖面</span>
3    <span class = 'tone5'>打翻</span>
4    <span class = 'tone5'>友誼</span>
dtype: object

In [4]:
traditional_colored

0       <span class = 'tone5'>滑稽</span>
1       <span class = 'tone5'>不禁</span>
2       <span class = 'tone5'>湖面</span>
3       <span class = 'tone5'>打翻</span>
4       <span class = 'tone5'>友誼</span>
                     ...               
1890    <span class = 'tone5'>撤退</span>
1891    <span class = 'tone5'>損毀</span>
1892    <span class = 'tone5'>卑微</span>
1893     <span class = 'tone5'>微</span>
1894     <span class = 'tone5'>毀</span>
Length: 1895, dtype: object