## Import

Load the CSV export and keep only the columns used in this notebook.

In [1]:
import pandas as pd
import swifter
from bs4 import BeautifulSoup
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from util_funcs import *
# Change swifter's defaults so that dask is allowed on string data.
# NOTE(review): meaning inferred from the option name — confirm against the swifter docs.
swifter.set_defaults(allow_dask_on_strings = True)

# Source file and the subset of columns this notebook works with.
path = "Active_Chinese_20230503.csv"
cols = [
    "Hanzi",
    "Meaning",
    "Reading",
    "Color",
    "Mean Word",
    "Sound",
    "Traditional",
    "Silhouette",
]

# Read the whole file, then narrow it down to `cols` (in that order).
df = pd.read_csv(path).loc[:, cols]

print("Shape:", df.shape)
df.head()

Shape: (1895, 8)


Unnamed: 0,Hanzi,Meaning,Reading,Color,Mean Word,Sound,Traditional,Silhouette
0,滑稽,funny (w/ h)<div>comical</div><div>amusing</div>,huájī,滑稽,,,,
1,不禁,"""can't help (doing something)""",bùjīn,不禁,,,,_ _
2,湖面,surface of a lake,húmiàn,湖面,,,,
3,打翻,<div>to knock over</div>to overturn<div>to ove...,dǎfān,打翻,,,,_ _
4,友谊,friendship<div>companionship</div>,yǒuyì,友谊,,,,


## Reading Col

Strip any HTML markup from the reading, run it through `PinyinToneConverter`, then apply `stylize_str` to add color.

In [2]:
# Work on a string view of the column (astype(str) stringifies every cell).
reading = df["Reading"].astype(str)

# One converter instance is shared by all rows.
# NOTE: the `.swifter` accessor is deliberately not used; plain `.apply` runs.
tone_converter = PinyinToneConverter()

# Per cell: parse as HTML, keep the stripped text, then convert it.
reading_edited = reading.apply(
    lambda cell: tone_converter.convert_text(
        BeautifulSoup(cell, features = "html.parser").get_text().strip()
    )
)

# Styled version replaces the original column.
reading_colored = reading_edited.apply(stylize_str)
df["Reading"] = reading_colored

# Count cells whose text changed before styling was applied.
num_edits = sum(reading.ne(reading_edited))
print(f"Number of cells edited: {num_edits} ({num_edits / df.shape[0]:.1%})")
df.head()

KeyboardInterrupt: 

## Meaning

In [None]:
# Run every meaning through `strip_html` and preview the rows it changed.
meaning = df["Meaning"].astype(str)
meaning_edited = meaning.apply(strip_html)
meaning_edited.loc[meaning_edited.ne(meaning)].head()

## Color

In [None]:
def pinyin_to_numbers_basic(x: str):
    """Preliminary converter: neutral tones are ignored, existing digits are kept.

    Every value of ``pin2num`` that occurs in ``x`` is replaced by its key,
    then all runs of digits in the resulting text are collected.

    Returns:
        A list of digit strings in order of appearance; empty when the
        substituted text contains no digits.
    """
    # Reverse lookup (value -> key) of the pin2num table.
    value_to_key = dict(zip(pin2num.values(), pin2num.keys()))

    substituted = x
    for old, new in value_to_key.items():
        substituted = substituted.replace(old, new)

    return re.findall(r'\d+', substituted)

def pinyin_to_numbers(x: str):
    """Final version: tone numbers for every whitespace-separated token of ``x``.

    Each token is passed to ``pinyin_to_numbers_basic``. A token that yields
    no digits contributes a single ``'0'``; otherwise all of its digit
    strings are kept. The per-token results are concatenated in order.

    Tokens are obtained with ``str.split()`` (no argument), so runs of
    spaces, tabs or newlines never create empty tokens that would each be
    reported as a spurious ``'0'``. An empty or whitespace-only ``x``
    therefore returns ``[]``.

    Note: the ``'0'`` fallback is applied per token, not per syllable, so a
    token for which at least one digit is found gets no additional ``'0'``.

    Args:
        x: Text to convert.

    Returns:
        A flat list of digit strings.
    """
    out_flat = []
    for token in x.split():
        # Convert once per token and reuse the result for the emptiness check.
        digits = pinyin_to_numbers_basic(token)
        out_flat.extend(digits if digits else ['0'])
    return out_flat
    

# Ad-hoc spot checks: text mixing Latin letters with pinyin, and text that
# already contains digits. The return values are neither stored nor asserted.
pinyin_to_numbers("A shì B de héxīn")
pinyin_to_numbers("gōngyuán qián 44 nian2")

# Sample string with an apostrophe between syllables.
# NOTE(review): presumably consumed by code further down in this cell — confirm.
x = "gōngbá'nián"