## Import

In [1]:
import pandas as pd

# Input CSV and the subset of its columns that this notebook works with.
path = "Active_Chinese_20230503.csv"
cols = ["Hanzi", "Meaning", "Reading", "Color", "Mean Word", "Sound", "Traditional", "Silhouette"]

# `usecols` lets the parser skip every column that is not needed instead of
# loading the whole file and discarding the rest afterwards. read_csv ignores
# the element order of `usecols`, so the trailing `[cols]` is kept to enforce
# the column order listed above.
df = pd.read_csv(path, usecols=cols)[cols]

print("Shape:", df.shape)
df.head()

Shape: (1895, 8)


Unnamed: 0,Hanzi,Meaning,Reading,Color,Mean Word,Sound,Traditional,Silhouette
0,滑稽,funny (w/ h)<div>comical</div><div>amusing</div>,huájī,滑稽,,,,
1,不禁,"""can't help (doing something)""",bùjīn,不禁,,,,_ _
2,湖面,surface of a lake,húmiàn,湖面,,,,
3,打翻,<div>to knock over</div>to overturn<div>to ove...,dǎfān,打翻,,,,_ _
4,友谊,friendship<div>companionship</div>,yǒuyì,友谊,,,,


## Reading column

In [2]:
def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit.

    An empty string yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

# String-typed version of the column, so every entry can be scanned character by character.
reading = df["Reading"].astype(str)
# Preview the readings that contain at least one digit.
reading.loc[reading.map(has_numbers)].head()

6     "<span class=""tone1"">jīng</span><span class=...
10    "<span class=""tone2"">cí</span><span class=""...
15                                                 dun4
20    "<span class=""tone4"">zhàn</span> <span class...
26    "<span class=""tone1"">huī</span><span class="...
Name: Reading, dtype: object

In [3]:
from bs4 import BeautifulSoup
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from pinyin_dict import *

# Step 1: parse each reading as HTML and keep only its stripped text content.
# Step 2: run that text through the tone converter (presumably this turns
# numbered pinyin such as `dun4` into tone-marked pinyin - confirm against the
# pinyin_tone_converter package).
reading_edited = (
    reading
    .apply(lambda raw: BeautifulSoup(raw, features="html.parser").get_text().strip())
    .apply(PinyinToneConverter().convert_text)
)

# Store the cleaned readings on the frame, then list the ones that still contain a digit.
df["Reading"] = reading_edited
df.loc[df["Reading"].apply(has_numbers), "Reading"]

212       gōngyuán 1500 nián
213    gōngyuán qián 44 nián
Name: Reading, dtype: object

## Meaning

In [4]:
import re

# Patterns are compiled once at definition time rather than on every call.
# The <div>/<br> patterns are case-insensitive and accept attributes and the
# self-closing forms (<br/>, <br />), so those variants become line breaks
# instead of being silently deleted by the generic tag stripper below.
_OUTER_DIV_RE = re.compile(r"^<div\b[^>]*>|</div>$", re.IGNORECASE)
_LINE_BREAK_RE = re.compile(r"<div\b[^>]*>|</div>|<br\b[^>]*>", re.IGNORECASE)
_BLANK_LINES_RE = re.compile(r"\n\s*\n")
_MARKUP_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def strip_html(x: str):
    """Reduce an HTML fragment to plain text whose lines are joined by ``<br>``.

    Processing, repeated until the text stops changing:
      1. trim surrounding whitespace;
      2. drop a ``<div>`` that opens the text and a ``</div>`` that closes it;
      3. turn every remaining ``<div>``, ``</div>`` and ``<br>`` into a newline;
      4. collapse runs of newlines separated only by whitespace into one;
      5. delete every other tag and every HTML entity (entities are removed,
         not decoded: ``Q&amp;A`` becomes ``QA``).
    Finally each newline is written back as ``<br>``.

    Args:
        x: HTML fragment to clean.

    Returns:
        The cleaned text; ``<br>`` is the only markup that is left.
    """
    while True:
        previous = x
        x = x.strip()
        x = _OUTER_DIV_RE.sub("", x)
        x = _LINE_BREAK_RE.sub("\n", x)
        x = _BLANK_LINES_RE.sub("\n", x)
        x = _MARKUP_RE.sub("", x)
        # A pass can expose new leading/trailing whitespace or newlines, so
        # keep going until a full pass changes nothing.
        if x == previous:
            break

    # A single replace handles every newline; no loop is required here.
    return x.replace("\n", "<br>")

# Smoke test on a deliberately messy sample: nested tags, doubled quotes,
# stray whitespace and repeated line breaks.
strip_html('<div>Your """<span style=""""color: var(--field-fg); font-family: -apple-system-body; background: var(--field-bg);"""">"""Mom</span></div> <div> <br><br>\n\n    <br>\nis awesome</div> ')

'Your """"""Mom<br>is awesome'

In [5]:
# Clean every meaning, then preview only the entries that the cleaning changed.
meaning = df["Meaning"].astype(str)
meaning_edited = meaning.map(strip_html)
meaning_edited.loc[meaning_edited.ne(meaning)].head()

0                   funny (w/ h)<br>comical<br>amusing
3         to knock over<br>to overturn<br>to overthrow
4                          friendship<br>companionship
5                                      core<br>nucleus
6    to master<br>to be proficient in<br>to be well...
Name: Meaning, dtype: object

## Color

In [39]:
def pinyin_to_numbers_basic(x: str):
    """Return every run of digits found in ``x`` after reverse substitution.

    ``correctInputAndOutput`` is defined outside this cell (presumably it comes
    from ``pinyin_dict`` and maps numbered pinyin to tone-marked pinyin - TODO
    confirm). Its values are replaced by their keys throughout ``x``; when two
    keys share a value, the key listed last wins. Digits that were already in
    the text are returned as well (e.g. the ``44`` in ``'gōngyuán 44 qián'``).

    Args:
        x: text to convert.

    Returns:
        list of digit strings, in order of appearance.
    """
    reverse_lookup = dict(
        (marked, numbered) for numbered, marked in correctInputAndOutput.items()
    )
    converted = x
    for marked in reverse_lookup:
        converted = converted.replace(marked, reverse_lookup[marked])
    return re.findall(r'\d+', converted)

def pinyin_to_numbers(x: str):
    """Return the digit strings for each whitespace-separated token of ``x``.

    Every token is passed to ``pinyin_to_numbers_basic``; a token for which
    that function finds no digits contributes a single ``'0'``. The per-token
    results are concatenated into one flat list.

    Args:
        x: text whose tokens are separated by whitespace.

    Returns:
        Flat list of digit strings. Empty or whitespace-only input gives ``[]``.
    """
    flat = []
    # split() without an argument splits on any run of whitespace (including
    # non-breaking spaces), so repeated spaces no longer create empty tokens
    # that would each add a spurious '0'.
    for token in x.split():
        # Convert each token exactly once; an empty result is falsy and falls
        # back to the ['0'] placeholder.
        flat.extend(pinyin_to_numbers_basic(token) or ['0'])
    return flat

# Spot checks. Only the value of the last expression is rendered as the cell
# output; the results of the first two calls are discarded.
pinyin_to_numbers('gōngyuán qián')
pinyin_to_numbers('A shì B de héxīn')
pinyin_to_numbers_basic('gōngyuán 44 qián')


['1', '2', '44', '2']

In [35]:
# NOTE(review): leftover IPython help lookup. `flatten` is not defined in this
# notebook, so the cell only reports "not found"; remove it before sharing.
flatten?

Object `flatten` not found.


In [18]:
# List of digit strings produced by pinyin_to_numbers for every cleaned reading.
reading_edited.map(pinyin_to_numbers)

0       [2, 1]
1       [4, 1]
2       [2, 4]
3       [3, 1]
4       [3, 4]
         ...  
1890    [4, 4]
1891    [3, 3]
1892    [1, 1]
1893       [1]
1894       [3]
Name: Reading, Length: 1895, dtype: object

In [24]:
hanzi = df["Hanzi"]
# Rows where the number of characters in Hanzi differs from the number of
# entries that pinyin_to_numbers returns for the cleaned reading.
df.loc[hanzi.map(len).ne(reading_edited.map(pinyin_to_numbers).map(len))]

Unnamed: 0,Hanzi,Meaning,Reading,Color,Mean Word,Sound,Traditional,Silhouette
12,A是B的核心,A is central to B,A shì B de héxīn,A是B的核心,,,,
24,玫瑰花,rose,méigui huā,玫瑰花,朵,,,
28,靠得住,reliable<br>can be counted on,kào de zhù,靠得住,,,,_ 得 _
31,棒子,stick<br>club<br>cudgel,bàngzi,棒子,,,,_ _
55,相对于。。。,as opposed to,xiāngduì yú...,相对于。。。,,,,
...,...,...,...,...,...,...,...,...
1823,牛顿,Newton,Niúdùn,牛顿,,,,
1839,佣 (4th tone),commission/fee,yòng,佣,,,,
1840,佣 (1st tone),"to hire, to employ",yōng,佣,,,,
1883,合法化,to legalize<br>legalization,合法化,合法化,,,,
