In [1]:
import psutil, os
import json
from pathlib import Path
from datetime import datetime
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd


In [2]:
text_path_list = Path("../corpus/dynasty_split/").glob("*.jsonl")

In [3]:
querys = """
睜,瞪,瞪眼,咪,咪縫,䀹,眨,眨巴,擠咕,瞎,目眩,
瞋,盱,瞤,瞚,瞑,矉,盷,眩
""".replace("\n", "").strip().split(",")
query_targets = [x[0] for x in querys]

In [4]:
def build_index(json_objs):
  index_dict = {}
  for j_idx, jobj in enumerate(tqdm(json_objs)):
    for text_idx, text_obj in enumerate(jobj["text"]):
      for ch_idx, ch in enumerate(text_obj["t"]):
        if ch not in query_targets:
          continue
        ch_index = index_dict.setdefault(ch, [])
        ch_index.append((j_idx, text_idx, "t", ch_idx))
      for ch_idx, ch in enumerate(text_obj["c"]):
        if ch not in query_targets:
          continue
        ch_index = index_dict.setdefault(ch, [])
        ch_index.append((j_idx, text_idx, "c", ch_idx))
  return index_dict

In [5]:
def no_lf(x):
  return x.replace("\n", " ")

def search_character(chs, index_dict, json_objs, n=-1, window=10, offset=0):
  kiwcs = []
  if len(chs)>1 and n >= 0:
    print("query more than one character, n and offset only refer to the first character")
  if chs[0] not in index_dict:
    pass
    # print("WARN: character not found in index_dict. Did you specify it in querys?")
  indices = index_dict.get(chs[0], [])
  if n < 0:
    n = len(indices)
  for idx in indices[offset:offset+n]:
    (jidx, tidx, tkey, chidx) = idx
    jobj = json_objs[jidx]
    tobj = jobj["text"][tidx]
    text_content = tobj[tkey]
    hit_chs = text_content[chidx:chidx+len(chs)]
    if hit_chs != chs:
      continue
    lwin = max(0, chidx-window)
    rwin = min(len(text_content), chidx+len(chs)+window+1)
    dynFromId = jobj["dynFromId"]
    meta = "{}-{}-{}".format(jobj["dynFrom"], jobj["title"], tobj["t"])
    kiwc_entry = (
      no_lf(text_content[lwin:chidx]),
      text_content[chidx:chidx+len(chs)],
      no_lf(text_content[chidx+len(chs)+1:rwin]))
    kiwcs.append((chs, dynFromId, meta, *kiwc_entry))
  
  return kiwcs

## Main Loop

In [7]:
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
timestamp

'20230116083921'

In [6]:
for text_path in text_path_list:
  print("processing: {}".format(text_path))
  # reading text file
  fin = open(text_path, "r", encoding="UTF-8")

  json_objs = []
  for ln in fin:
    json_data = json.loads(ln)
    json_objs.append(json_data)
  fin.close()

  # building index
  index_dict = build_index(json_objs)
  proc = psutil.Process(os.getpid())
  rss_1 = proc.memory_info().rss / 1024 / 1024
  print("RSS: {:.2f} MB".format(rss_1))

  # generating kwic
  query_kiwcs = []
  for query in tqdm(querys):
    kiwcs = search_character(query, index_dict, json_objs)
    query_kiwcs.extend(kiwcs)

  # write to file
  query_df = pd.DataFrame(query_kiwcs, columns=["query", "dynFromId", "meta", "left", "keyword", "right"])
  query_df.to_csv(f"../data/kwic_{Path(text_path).stem}_{timestamp}.csv", index=False)



processing: ../corpus/dynasty_split/宋元.jsonl


  0%|          | 0/811 [00:00<?, ?it/s]

RSS: 673.21 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/魏晉南北.jsonl


  0%|          | 0/79 [00:00<?, ?it/s]

RSS: 710.07 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/tier1.jsonl


  0%|          | 0/120 [00:00<?, ?it/s]

RSS: 801.05 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/明.jsonl


  0%|          | 0/430 [00:00<?, ?it/s]

RSS: 1139.52 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/民國.jsonl


  0%|          | 0/3 [00:00<?, ?it/s]

RSS: 1153.31 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/清.jsonl


  0%|          | 0/1281 [00:00<?, ?it/s]

RSS: 1769.09 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/漢.jsonl


  0%|          | 0/79 [00:00<?, ?it/s]

RSS: 1754.80 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/唐五代十國.jsonl


  0%|          | 0/182 [00:00<?, ?it/s]

RSS: 1743.71 MB


  0%|          | 0/19 [00:00<?, ?it/s]

processing: ../corpus/dynasty_split/先秦.jsonl


  0%|          | 0/41 [00:00<?, ?it/s]

RSS: 1743.71 MB


  0%|          | 0/19 [00:00<?, ?it/s]