In [1]:
import json
import os
import re
from typing import Iterable, Dict, List, Set

In [13]:
import os
import re
from typing import Iterable, Dict, List, Set

# Regex covering:
# - CJK Extension A:      U+3400–U+4DBF
# - CJK Unified:          U+4E00–U+9FFF
# - CJK Compatibility:    U+F900–U+FAFF
# - Ideographic number:   U+3007 (〇)
# - CJK Ext. B–G approx.: U+20000–U+2EBEF
CJK_CHAR_RE = re.compile(
    r"[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\u3007\U00020000-\U0002EBEF]"
)

def extract_first_col_chinese_chars(line: str) -> List[str]:
    """Take a TSV line, grab the first column (text before the first tab),
    and return the list of Chinese characters found in that first column."""
    first_col = line.split("\t", 1)[0]
    return CJK_CHAR_RE.findall(first_col)

def compute_unique_chars_per_tsv(tsv_paths_in_order: Iterable[str]) -> Dict[str, List[str]]:
    """Given an ordered iterable of TSV file paths, return a dict mapping
    each filename (without .tsv) to the list of unique Chinese characters
    that appear in that file's FIRST COLUMN and have not appeared in any earlier files.
    Order of first appearance within each file is preserved."""
    seen: Set[str] = set()
    result: Dict[str, List[str]] = {}

    for path in tsv_paths_in_order:
        basename = os.path.basename(path)
        name_no_ext, _ = os.path.splitext(basename)  # remove ".tsv"

        file_unique_seen: Set[str] = set()
        new_chars_in_this_file: List[str] = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                for ch in extract_first_col_chinese_chars(line):
                    # add if not yet in this file
                    if ch not in file_unique_seen:
                        file_unique_seen.add(ch)
                        # add if not seen globally
                        if ch not in seen:
                            seen.add(ch)
                            new_chars_in_this_file.append(ch)

        result[name_no_ext] = new_chars_in_this_file

    return result

In [14]:
tsv_files_in_order = [
    "l1a.tsv",
    "l1b.tsv",
    "l2a.tsv",
    "l2b.tsv",
    "l3a.tsv",
    "l3b.tsv",
    "l4a.tsv",
    "l4b.tsv",
    "l5a.tsv",
    "l5b.tsv",
    "l6a.tsv",
    "l6b.tsv",
    "l7a.tsv",
    "l7b.tsv",
    "l8a.tsv",
    "l8b.tsv",
    "l9a.tsv",
    "l9b.tsv",
    "l10a.tsv",
    "l10b.tsv",
    "l11a.tsv", 
    "l11b.tsv",
    "l12a.tsv",
    "l12b.tsv",
    "l13a.tsv",
    "l13b.tsv",
    "l14a.tsv",
    "l14b.tsv",
    "l15a.tsv",
    "l15b.tsv",
    "l16a.tsv",
    "l16b.tsv",
    "l17a.tsv",
    "l17b.tsv",
    "l18a.tsv",
    "l18b.tsv",
    "l19a.tsv",
    "l19b.tsv",
    "l20a.tsv",
    "l20b.tsv",
    "第一课：开学.tsv",
    "第二课：宿舍生活.tsv",
    "第三课：在饭馆儿.tsv",
    "第四课：买东西.tsv",
    "第五课：选课.tsv",
    "第六课：男朋友女朋友.tsv",
    "第七课：电脑和网络.tsv",
    "第八课：打工.tsv",
    "第九课：教育.tsv",
    "第十课：中国地理.tsv",
    "第十一课：中国的节日.tsv",
    "第十二课：中国的变化.tsv",
    "第十三课：去云南旅游.tsv",
    "第十四课：生活与健康.tsv",
    "第十五课：男女平等.tsv",
    "第十六课：环境保护与节约能源.tsv",
    "第十七课：理财与投资.tsv",
    "第十八课：中国历史.tsv",
    "第十九课：面试.tsv",
    "第二十课：世界变小了.tsv"
]

In [15]:
tsv_files_w_path = [f"../dicts/chinese/{name}" for name in tsv_files_in_order]
unique_dict = compute_unique_chars_per_tsv(tsv_files_w_path)

with open("unique_chars_per_tsv.json", "w", encoding="utf-8") as out:
    json.dump(unique_dict, out, ensure_ascii=False, indent=2)

In [21]:
for key, val_list in unique_dict.items():
    # join list elements with spaces into one string
    val_str = "".join(val_list)
    print(f"{key}\t{val_str}")

l1a	你好请问贵姓我呢小姐叫什么名字先生李友王朋
l1b	是老师吗不学也人中国京北美纽约韩
l2a	那的照片这爸妈个女孩子谁她男弟他大哥儿有没高文
l2b	家几口两妹和二做工作律英都医白爱
l3a	九月十号星期四天日今年多八岁吃饭怎样太了谢喜欢菜还可们点半晚上见再
l3b	现在刻事很忙明为因同认识
l4a	周末打球看电视唱歌跳舞听音乐书对时候影常去外客昨所以
l4b	久错想觉得意思只睡算找别
l5a	呀进快来介绍一下兴漂亮坐哪校喝茶咖啡吧要瓶起给杯水
l5b	玩图馆聊才回
l6a	话喂就您位午间题开会节课级考试后空方便到办公室行等气
l6b	帮准备练习说啊但跟面
l7a	复写慢枝笔张纸教懂真里预第语法容易词汉难
l7b	平早功始念录帅酷
l8a	篇记累床洗澡边发新脑餐厅网宿舍正前告诉已经知道
l8b	封信最近除专业希望能用笑祝
l9a	商店买东西售货员衣服件衬衫颜色黄红穿裤宜如果长短合适共少钱块毛分百
l9b	双鞋换虽然种黑挺它刷卡收过付
l10a	寒假飞机票场汽车或者地铁走路站绿线蓝麻烦出租送
l10b	邮让花每城市特速紧自己手
l11a	比雪园滑冰冷刚报更而且暖
l11b	非糟糕雨又冬夏热春秋舒加州
l12a	务桌盘饺素豆腐放肉碗酸辣汤味精盐卖完青渴些够饿
l12b	傅糖醋鱼甜极烧牛凉拌瓜米忘带清楚关系海
l13a	心运动旁远离活
l13b	拿次从直往南拐哎灯右左谷本
l14a	表礼物饮料把苹梨住重接楼
l14b	钟头聪暑班属狗脸圆眼睛鼻嘴像定蛋伦姆
l15a	病院肚疼死夜厕箱躺检查坏针药遍
l15b	感冒身体痒敏健康保险赶越休息懒乱
l16a	印象成演费力俩言
l16b	码搬扫整理房旅
l17a	吵连广附套寓卧厨卫具
l17b	干净沙椅架安静元民币差押金当另养宠趣
l18a	胖怕简单跑步受拍篮游泳危淹愿
l18b	提足赛际式应该脚踢抱压被担棒
l19a	马司实计划父母首政治化胜古迹导护签证社订香港台
l19b	初程返航千折转靠窗户份
l20a	托包超登牌哭顾
l20b	叔阿姨迎瘦爷奶烤鸭
第一课：开学	辆研究弓内全较省由处拉柯林
第二课：宿舍生活	屋摆毯柜挂门调栋旧恐品层般着急
第三课：在饭馆儿	留鸡蒸芥兰嫩菠鲜淡咸油巾筷各虑主微丽莎梅杭川湖
第四课：买东西	恤仔无论需牙膏粉购价纯棉髦质量标乎税
第五课：选课	选世界历史其章轻松授讨碰毕济决解建议管谈将挣融数

In [19]:
count = 0
for key, val_list in unique_dict.items():
    if key.startswith("l"):
        # join list elements with spaces into one string
        count += len(val_list)
print(f"Total unique chars in l1a to l20b: {count}")

Total unique chars in l1a to l20b: 641


In [20]:
count = 0
for key, val_list in unique_dict.items():
    if key.startswith("第"):
        # join list elements with spaces into one string
        count += len(val_list)
print(f"Total unique chars in 第1课 to 第20课: {count}")

Total unique chars in 第1课 to 第20课: 460


# OLD

In [1]:
import numpy as np
import pandas as pd

In [2]:
path = r"C:\Users\enoch\OneDrive\Documents\GitHub\lang\dicts\chinese\integrated_chinese_1-2.tsv"
df = pd.read_csv(path, sep='\t', header=None)
df

Unnamed: 0,0,1,2
0,人,rén,person
1,刀,dāo,knife
2,力,lì,power
3,又,yòu,"right hand, again"
4,口,kǒu,mouth
...,...,...,...
863,瘦,shòu,"thin, skinny, lean"
864,爷爷,yéye,paternal grandfather
865,奶奶,nǎinai,paternal grandmother
866,烤鸭,kǎoyā,roast dusk


In [3]:
def unique_chars(col):
    all_chars = np.array([])
    for i, word in enumerate(col):
        for char in word:
            if char not in all_chars:
                all_chars = np.append(all_chars, char)
    return all_chars

In [4]:
total_chars = unique_chars(df[0])
print(total_chars)
with open("output.txt", "w") as file:
    for char in total_chars:
        file.write(char)
        file.write("\n")

['人' '刀' '力' '又' '口' '囗' '土' '夕' '大' '女' '子' '寸' '小' '工' '幺' '弓' '心' '戈'
 '手' '日' '月' '木' '水' '火' '田' '目' '示' '糸' '耳' '衣' '言' '贝' '走' '足' '金' '门'
 '隹' '雨' '食' '马' '一' '二' '三' '四' '五' '六' '七' '八' '九' '十' '百' '你' '好' '请'
 '问' '贵' '姓' '我' '呢' '姐' '叫' '什' '么' '名' '字' '先' '生' '李' '友' '王' '朋' '是'
 '老' '师' '吗' '不' '学' '也' '中' '国' '京' '北' '美' '纽' '约' '韩' '那' '的' '照' '片'
 '这' '爸' '妈' '个' '孩' '谁' '她' '男' '弟' '他' '哥' '儿' '有' '没' '高' '文' '家' '几'
 '两' '妹' '和' '做' '作' '律' '英' '都' '医' '白' '爱' '号' '星' '期' '天' '今' '年' '多'
 '岁' '吃' '饭' '怎' '样' '太' '…' '了' '谢' '喜' '欢' '菜' '还' '可' '们' '点' '半' '晚'
 '上' '见' '再' '现' '在' '刻' '事' '很' '忙' '明' '为' '因' '同' '认' '识' '周' '末' '打'
 '球' '看' '电' '视' '唱' '歌' '跳' '舞' '听' '音' '乐' '书' '对' '时' '候' '影' '常' '去'
 '外' '客' '昨' '所' '以' '久' '错' '想' '觉' '得' '意' '思' '只' '睡' '算' '找' '别' '呀'
 '进' '快' '来' '介' '绍' '下' '兴' '漂' '亮' '坐' '哪' '校' '喝' '茶' '咖' '啡' '吧' '要'
 '瓶' '起' '给' '杯' '玩' '图' '馆' '聊' '才' '回' '话' '喂' '就' '您' '位' '午' '间' '题'
 '开' '会' '节' '课' '级' '考' '试' '后' '空' '方' '便' '到' '办

  if char not in all_chars:
