In [1]:
from pathlib import Path
import json
import numpy as np
from tqdm.auto import tqdm
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import KeyedVectors

In [2]:
def extract_text(text_path):
    fin = open(text_path, "r", encoding="UTF-8")
    json_objs = []
    for ln in fin:
        json_data = json.loads(ln)
        json_objs.append(json_data)
    fin.close()
    
    text = []
    for j_idx, jobj in enumerate(tqdm(json_objs)):
        for text_idx, text_obj in enumerate(jobj["text"]):            
            if isinstance(text_obj["c"], list):
                text.append(list(text_obj["t"]) + 
                            list("\n".join(text_obj["c"])))
            else:
                text.append(list(text_obj["t"]) + 
                            list(text_obj["c"]))
    return text

In [3]:
text_path_list = list(Path("../corpus/dynasty_split/").glob("*.jsonl"))
text_path_list

[PosixPath('../corpus/dynasty_split/宋元.jsonl'),
 PosixPath('../corpus/dynasty_split/魏晉南北.jsonl'),
 PosixPath('../corpus/dynasty_split/tier1.jsonl'),
 PosixPath('../corpus/dynasty_split/明.jsonl'),
 PosixPath('../corpus/dynasty_split/民國.jsonl'),
 PosixPath('../corpus/dynasty_split/清.jsonl'),
 PosixPath('../corpus/dynasty_split/漢.jsonl'),
 PosixPath('../corpus/dynasty_split/唐五代十國.jsonl'),
 PosixPath('../corpus/dynasty_split/先秦.jsonl')]

In [4]:
dyn0 = extract_text("../corpus/dynasty_split/先秦.jsonl")
dyn1 = extract_text("../corpus/dynasty_split/漢.jsonl")
dyn2 = extract_text('../corpus/dynasty_split/魏晉南北.jsonl')

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

In [5]:
len(dyn0), len(dyn1), len(dyn2)

(755, 1562, 1380)

## Basic statistics

In [6]:
print("dyn0 char count", sum(len(x) for x in dyn0))
print("dyn1 char count", sum(len(x) for x in dyn1))
print("dyn2 char count", sum(len(x) for x in dyn2))

dyn0 char count 2241345
dyn1 char count 13152213
dyn2 char count 16740426


In [7]:
def make_lsa_vec(dyn_data):
    vocab = corpora.Dictionary(dyn_data)
    vocab.filter_extremes(no_below=5)    
    docmat = [vocab.doc2bow(x) for x in dyn_data]
    lsa = LsiModel(docmat)
    lsavec = lsa.projection.u * lsa.projection.s
    kv = KeyedVectors(200)    
    kv.add_vectors(list(vocab.token2id.keys()), lsavec)
    print("vocab: ", len(kv.index_to_key))
    return kv

In [8]:
print("Running dyn0"); kv0 = make_lsa_vec(dyn0)
print("Running dyn1"); kv1 = make_lsa_vec(dyn1)
print("Running dyn2"); kv2 = make_lsa_vec(dyn2)

Running dyn0
vocab:  4324
Running dyn1
vocab:  6609
Running dyn2
vocab:  7347


In [9]:
kv_list = [
    {"kv": kv0, "note": "dynasty_split/先秦.jsonl"},
    {"kv": kv1, "note": "dynasty_split/漢.jsonl"},
    {"kv": kv2, "note": "dynasty_split/魏晉南北.jsonl"}
]

In [10]:
import joblib
kv_list_path = joblib.dump(kv_list, "../data/dyn_kvlist_3dyns.joblib")[0]
!sha1sum $kv_list_path

e7ca62ab6a722b1b18c2bcddccdedc16e157666e  ../data/dyn_kvlist_3dyns.joblib


In [11]:
list(set(kv0.index_to_key) & set(kv1.index_to_key) & set(kv2.index_to_key))[:10]

['苔', '秩', '琢', '搆', '稿', '宅', '纓', '莧', '泠', '盈']

In [12]:
kv0.most_similar("雪")

[('隔', 0.8219001293182373),
 ('落', 0.8136487603187561),
 ('飄', 0.8051883578300476),
 ('雲', 0.8004015684127808),
 ('霜', 0.796699047088623),
 ('斜', 0.768998384475708),
 ('星', 0.7672511339187622),
 ('殞', 0.7604832053184509),
 ('裹', 0.747040867805481),
 ('奇', 0.7435941100120544)]

In [13]:
kv1.most_similar("雪")

[('蟲', 0.8492670655250549),
 ('董', 0.8466299176216125),
 ('畢', 0.8313943147659302),
 ('謹', 0.8287528157234192),
 ('墮', 0.828660786151886),
 ('暑', 0.8245339393615723),
 ('翼', 0.8240716457366943),
 ('藝', 0.8206169009208679),
 ('暮', 0.8173409700393677),
 ('羅', 0.816857635974884)]

In [14]:
kv2.most_similar("雪")

[('霜', 0.718539297580719),
 ('鸚', 0.6935447454452515),
 ('揣', 0.670968234539032),
 ('燭', 0.6566643714904785),
 ('吹', 0.6470581293106079),
 ('蘭', 0.6361311078071594),
 ('聊', 0.6321390271186829),
 ('賦', 0.6305357813835144),
 ('誘', 0.6301935911178589),
 ('翔', 0.6280685067176819)]