In [6]:
import re
import json
from pathlib import Path
from collections import Counter
from itertools import chain
from tqdm.auto import tqdm
import pandas as pd
import zhon.hanzi

In [7]:
# Load the dynasty table. Each non-blank line is split on "," into exactly
# two fields, named dynasty and year here.
# splitlines() plus the blank-line filter keeps a trailing newline (or a stray
# empty line) from yielding an empty record that cannot be unpacked into two
# names. The encoding is pinned so the read does not depend on the platform
# default (assumes the file is UTF-8/ASCII — TODO confirm).
dyn_list = [
    ln
    for ln in Path("../data/ctext_dynasty.txt").read_text(encoding="utf-8").splitlines()
    if ln.strip()
]
# Map dynasty name -> year field (kept as the raw string read from the file).
ctext_dyns = {dyn_x: year_x
              for dyn_x, year_x in (ln.split(",") for ln in dyn_list)}

In [13]:
# Regex matching one character at a time: a character class built from
# zhon.hanzi.characters (presumably the Han/CJK ideograph ranges — confirm
# against the zhon documentation). Used by count_char_dict below.
cjk_pat = re.compile(f"[{zhon.hanzi.characters}]")
def flatten(x):
    """Return ``x`` as a single string.

    Args:
        x: Either a ``str`` (returned unchanged) or a ``list`` of strings
            (concatenated with no separator).

    Returns:
        The resulting string.

    Raises:
        AssertionError: If ``x`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        return "".join(x)
    # Raise explicitly instead of `assert False`: an assert statement is
    # stripped under `python -O`, which would make this function silently
    # return None. The exception type is kept as AssertionError so existing
    # callers see the same error class, now with a useful message.
    raise AssertionError(
        f"flatten() expects a str or a list of str, got {type(x).__name__}"
    )

def count_char_dict(text):
    """Tally the characters of ``text`` that match ``cjk_pat``.

    Args:
        text: The string to scan.

    Returns:
        A ``Counter`` mapping each matched character to its number of
        occurrences in ``text``.
    """
    matched_chars = cjk_pat.findall(text)
    char_tally = Counter()
    char_tally.update(matched_chars)
    return char_tally

# Accumulate per-dynasty character counts over every *.jsonl file in the
# corpus directory. One Counter per dynasty listed in ctext_dyns.
dyn_char_freq = {dyn_x: Counter() for dyn_x in ctext_dyns.keys()}
# sorted() materialises the glob so tqdm knows the total number of files and
# the files are visited in a reproducible order.
for path_x in tqdm(sorted(Path("../corpus/dynasty_split/").glob("*.jsonl"))):
    # The context manager closes each file even if a line fails to parse.
    # Encoding is pinned rather than left to the platform default
    # (assumes the corpus files are UTF-8 — TODO confirm).
    with path_x.open("rt", encoding="utf-8") as fin:
        for ln in fin:
            if not ln.strip():
                # Tolerate blank lines, which json.loads would reject.
                continue
            text = json.loads(ln)
            dyn_x = text.get("dynFrom")
            if dyn_x:
                # flatten() returns one string per entry of text["text"];
                # joining those strings directly gives the same result as
                # chaining them character by character and re-joining.
                fulltext = "".join(flatten(x["c"]) for x in text["text"])
                char_counter = count_char_dict(fulltext)
                # NOTE(review): a "dynFrom" value that is not a key of
                # ctext_dyns raises KeyError here — confirm that is intended.
                dyn_char_freq[dyn_x].update(char_counter)

0it [00:00, ?it/s]

In [14]:
# Summarise each dynasty's counter, one row per dynasty:
#   ntoken - total number of counted characters
#   ntype  - number of distinct characters
char_freq_df = pd.DataFrame(
    {
        "ntoken": [sum(counter.values()) for counter in dyn_char_freq.values()],
        "ntype": [len(counter) for counter in dyn_char_freq.values()],
    },
    index=dyn_char_freq.keys(),
)

In [15]:
# Bare expression as the last line of the cell: shows the summary frame as the cell output.
char_freq_df

Unnamed: 0,ntoken,ntype
Western Zhou,99242,3184
Spring and Autumn,253736,3325
Warring States,3212548,7660
Qin,202179,5187
Han,7790366,7727
Western Han,2488551,8280
Xin,1444258,5582
Eastern Han,2534949,12543
Three Kingdoms,1353750,5604
Jin,8418428,10103


In [16]:
# Write the per-dynasty ntoken/ntype summary to CSV (the index is written as the first column by default).
char_freq_df.to_csv("../data/ctext.char_freq_stat.csv")