In [None]:
import ast
import os

import jieba
import numpy as np
import pandas as pd
from wordfreq import zipf_frequency


# load data '../../chars.csv'
chars = pd.read_csv('../汉字.csv', sep=',', encoding='utf-8')
words = pd.read_csv('../词汇.csv', sep=',', encoding='utf-8')

In [None]:
# new dataframe
df = pd.DataFrame(columns=['page', 'chars', 'words',
                  'level', 'pinyin', 'part_of_speech'])

page = 0
for index, row in chars.iterrows():

    page += 1

    df.loc[index, 'page'] = page
    df.loc[index, 'chars'] = row['汉字']
    df.loc[index, 'level'] = row['级别']
    df.loc[index, 'pinyin'] = row['拼音']

In [None]:
df['freq'] = ''
for page, row in df.iterrows():
    frequency = zipf_frequency(row['chars'], 'zh')
    df.loc[page, 'freq'] = frequency

# sort by frequency in descending order
df = df.sort_values(by=['freq'], ascending=False)
df = df.reset_index(drop=True)

In [None]:
df['old_chars'] = ''
for page, row in df.iterrows():
    cum_chars = df.loc[:page, 'chars'].str.cat(sep=', ')
    df.at[page, 'old_chars'] = cum_chars

In [None]:
def find_words(df, words, num_chars_interval=(2, 10), max_page=-1):
    for page, row in df.iterrows():
        chars = row['old_chars']
        contains = words['词语'].apply(lambda x: all(
            char in chars for char in x) and num_chars_interval[0] <= len(x) <= num_chars_interval[1])

        # filter the found words in the same level
        # contains &= words['级别'] == row['level']

        df.at[page, 'words'] = words[contains]['词语'].values.tolist()
        # remove duplicates
        df.at[page, 'words'] = list(set(df.at[page, 'words']))

        # print(
        #     f"Page {row['page']}:\n\tChars: {chars}\n\tWords: {len(df.at[page, 'words'])}\n")

        # stop at max_page
        if row['page'] == max_page:
            break


find_words(df, words, num_chars_interval=(2, 10))

In [None]:
# Define the order
level_order = ['一级', '二级', '三级', '四级', '五级', '六级', '高等']

# Sort the dataframe
df['level'] = pd.Categorical(df['level'], level_order)
df.sort_values(['level', 'freq'], inplace=True, ascending=[True, False])

df.groupby('level').count()

# Save to csv
# df.to_csv('guide.csv', sep=',', encoding='utf-8', index=False)

In [None]:
import pandas as pd

# Assuming df and 'old_chars' is a correct DataFrame with relevant columns set up properly.
# Convert 'words' column to a list of words if it is a string representation.
if isinstance(df.at[0, 'words'], str):
    # Update this split according to actual delimiter.
    df['words'] = df['words'].apply(lambda x: x.split(', '))

# Assume 'old_chars' is separated by comma and space as in the provided code.
if 'old_chars' in df.columns:
    # Update this split according to actual delimiter.
    df['old_chars'] = df['old_chars'].apply(lambda x: x.split(', '))
else:
    # If 'old_chars' column does not exist, create empty lists.
    df['old_chars'] = [[] for _ in range(len(df))]

# Create a cumulative set of old characters introduced up to each row.
cumulative_old_chars = set()
df['cumulative_old_chars'] = pd.Series(dtype='object')
for i in range(len(df)):
    cumulative_old_chars.update(df.at[i, 'old_chars'])
    df.at[i, 'cumulative_old_chars'] = cumulative_old_chars.copy()

# Define a function to find new words

def find_new_words(row):
    return [word for word in row['words'] if row['chars'] in word and word not in row['cumulative_old_chars']]


# Apply the function to get new_words
df['new_words'] = df.apply(find_new_words, axis=1)

# sort new_words by frequency
df['new_words'] = df['new_words'].apply(
    lambda x: sorted(x, key=lambda y: zipf_frequency(y, 'zh'), reverse=True))

# Reorder the columns
df2 = df[['level', 'chars', 'pinyin', 'new_words']]

# Save to CSV
df2.to_csv('guide.csv', sep=',', encoding='utf-8', index=False)

In [None]:
# save every level (as numbers) to csv
for i, level in enumerate(level_order, 1):
    df2[df2['level'] == level].to_csv(
        f'guide_{i}.csv', sep=',', encoding='utf-8', index=False)

In [None]:

vocabulary = pd.read_csv('../词汇.csv', sep=',', encoding='utf-8')
guide_words = []

for i, level in enumerate(level_order, 1):
    temp_vocab = vocabulary[vocabulary['级别'] == level]
    temp_vocab = temp_vocab[temp_vocab['词语'].str.len() >= 2]
    df = pd.read_csv(f'guide_{i}.csv', sep=',', encoding='utf-8')
    guide_words.extend(df['new_words'])

    guide_words = [word.strip('[]').strip("'") for word in guide_words]
    guide_words = list(filter(None, guide_words))

    print(f"Guide {level}: {len(guide_words)}")
    print(f'Vocab {level}: {len(temp_vocab.values.tolist())}')

    # print(set(guide_words))

    # not_in_vocab = set(temp_vocab['词语'].values.tolist()) - set(guide_words)

    # print(f"not in vocab: {len(not_in_vocab)}")

    # find all words not present in the vocabulary
    not_in_guide = set(guide_words) - set(temp_vocab['词语'].values.tolist())
    print(f"not in guide: {len(not_in_guide)}")

    break

In [None]:
guide_words = pd.read_csv(f'guide.csv', sep=',', encoding='utf-8')
vocabulary = pd.read_csv('../词汇.csv', sep=',', encoding='utf-8')

guide_words = guide_words['new_words'].values.tolist()
guide_words = [word.strip('[]').strip("'") for word in guide_words]
guide_words = list(filter(None, guide_words))

print(f"Guide: {len(guide_words)}")
print(f'Vocab: {len(vocabulary)}')

In [None]:
# find same char but different pinyin
homophones = df2[df2.duplicated(subset=['chars'], keep=False)]
# sort by level and chars
homophones.sort_values(['level', 'chars'], inplace=True)
homophones.to_csv('homophones.csv', sep=',', encoding='utf-8', index=False)