# 中文字符统计

统计 9 个章节文件中的中文汉字数量，判断每个文件是否达到 2000 字（含）以上。

## 1. 导入所需库

In [None]:
import os
import re
import pandas as pd

## 2. 定义中文字符统计函数

使用正则表达式匹配 Unicode 范围 `\u4e00-\u9fff` 内的中文汉字，仅统计汉字数量（不含标点、字母和数字）。

In [None]:
def count_chinese_chars(text: str) -> int:
    """Return the number of Chinese ideographs found in *text*.

    Only code points in the basic CJK Unified Ideographs range
    (U+4E00-U+9FFF) are counted; punctuation, Latin letters, digits and
    characters outside that range are ignored.
    """
    ideograph = re.compile(r'[\u4e00-\u9fff]')
    # Each match is exactly one character, so counting matches counts characters.
    return sum(1 for _ in ideograph.finditer(text))

## 3. 定义文件路径列表

In [None]:
# All chapters to check live in one directory and differ only by chapter number.
_CHAPTER_DIR = '/workspaces/Memory-Pawnshop/script/chapters'
_CHAPTER_NUMBERS = (12, 13, 47, 52, 53, 54, 55, 58, 65)

file_paths = [f'{_CHAPTER_DIR}/chapter_{number}.md' for number in _CHAPTER_NUMBERS]
print(f"共 {len(file_paths)} 个文件待统计")

## 4. 逐个读取文件并统计中文字符数

In [None]:
# Read each chapter as UTF-8, count its Chinese characters, report the count,
# and keep one record per file in `results`.
results = []
for chapter_path in file_paths:
    with open(chapter_path, 'r', encoding='utf-8') as chapter_file:
        content = chapter_file.read()
    name = os.path.basename(chapter_path)
    total = count_chinese_chars(content)
    print(f"{name}: {total} 个中文汉字")
    results.append({'文件名': name, '中文字数': total})

## 5. 生成达标判断结果表格

In [None]:
# Build the results table and label every chapter against the 2000-character minimum.
df = pd.DataFrame(results)
meets_target = df['中文字数'] >= 2000
df['达标（≥2000）'] = meets_target.map({True: '✅ 达标', False: '❌ 未达标'})

# Show the results table under a banner.
banner = "=" * 55
print(banner)
print("章节中文字数统计结果")
print(banner)
display(df)

# Summary: how many files pass, how many fail, and by how much each failure falls short.
total_files = len(df)
pass_count = meets_target.sum()
fail_count = total_files - pass_count
print(f"\n达标文件数: {pass_count}/{total_files}")
print(f"未达标文件数: {fail_count}/{total_files}")
if fail_count > 0:
    print("\n未达标文件:")
    below_target = df[df['中文字数'] < 2000]
    for name, count in zip(below_target['文件名'], below_target['中文字数']):
        print(f"  - {name}: {count} 字 (差 {2000 - count} 字)")