In [3]:
import os
import numpy as np
import pandas as pd

# load data '../../chars.csv'
chars = pd.read_csv('../characters_freq.csv', sep=',', encoding='utf-8')
words = pd.read_csv('../vocabulary_freq.csv', sep=',', encoding='utf-8')

In [4]:
# new dataframe
df = pd.DataFrame(columns=['page', 'chars', 'words',
                  'level', 'pinyin', 'part_of_speech'])

page = 0
for index, row in chars.iterrows():

    page += 1

    df.loc[index, 'page'] = page
    df.loc[index, 'chars'] = row['character']
    df.loc[index, 'level'] = row['level']
    df.loc[index, 'pinyin'] = row['pinyin']

In [5]:
df['old_chars'] = ''
for page, row in df.iterrows():
    cum_chars = df.loc[:page, 'chars'].str.cat(sep=', ')
    df.at[page, 'old_chars'] = cum_chars

In [6]:
def find_words(df, words, num_chars_interval=(2, 10), max_page=-1):
    for page, row in df.iterrows():
        chars = row['old_chars']
        contains = words['word'].apply(lambda x: all(
            char in chars for char in x) and num_chars_interval[0] <= len(x) <= num_chars_interval[1])

        df.at[page, 'words'] = words[contains]['word'].values.tolist()
        # remove duplicates
        df.at[page, 'words'] = list(set(df.at[page, 'words']))

        # print(
        #     f"Page {row['page']}:\n\tChars: {chars}\n\tWords: {len(df.at[page, 'words'])}\n")

        # stop at max_page
        if row['page'] == max_page:
            break


find_words(df, words, num_chars_interval=(2, 10))

In [7]:
# Define the order
level_order = ['一级', '二级', '三级', '四级', '五级', '六级', '高等']

# Sort the dataframe
df['level'] = pd.Categorical(df['level'], level_order)
df.sort_values(['level', 'page'], inplace=True)

# Save to csv
# df.to_csv('guide.csv', sep=',', encoding='utf-8', index=False)

In [8]:
import pandas as pd

# Assuming df and 'old_chars' is a correct DataFrame with relevant columns set up properly.
# Convert 'words' column to a list of words if it is a string representation.
if isinstance(df.at[0, 'words'], str):
    # Update this split according to actual delimiter.
    df['words'] = df['words'].apply(lambda x: x.split(', '))

# Assume 'old_chars' is separated by comma and space as in the provided code.
if 'old_chars' in df.columns:
    # Update this split according to actual delimiter.
    df['old_chars'] = df['old_chars'].apply(lambda x: x.split(', '))
else:
    # If 'old_chars' column does not exist, create empty lists.
    df['old_chars'] = [[] for _ in range(len(df))]

# Create a cumulative set of old characters introduced up to each row.
cumulative_old_chars = set()
df['cumulative_old_chars'] = pd.Series(dtype='object')
for i in range(len(df)):
    cumulative_old_chars.update(df.at[i, 'old_chars'])
    df.at[i, 'cumulative_old_chars'] = cumulative_old_chars.copy()

# Define a function to find new words.

def find_new_words(row):
    return [word for word in row['words'] if row['chars'] in word and word not in row['cumulative_old_chars']]

# Apply the function to get new_words
df['new_words'] = df.apply(find_new_words, axis=1)

# Reorder the columns
df2 = df[['level', 'chars', 'pinyin', 'new_words']]

# Save to CSV
df2.to_csv('guide.csv', sep=',', encoding='utf-8', index=False)

In [9]:
# save every level (as numbers) to csv
for i, level in enumerate(level_order, 1):
    df2[df2['level'] == level].to_csv(
        f'guide_{i}.csv', sep=',', encoding='utf-8', index=False)

In [10]:
# find same char but different pinyin
homophones = df2[df2.duplicated(subset=['chars'], keep=False)]
# sort by level and chars
homophones.sort_values(['chars', 'level'], inplace=True)
homophones.to_csv('homophones.csv', sep=',', encoding='utf-8', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homophones.sort_values(['chars', 'level'], inplace=True)
