In [1]:
import sys, os
from pathlib import Path
if "../src" not in sys.path:
    sys.path.append("../src")
from dietcoke import fp_lst, corpus_lst, Text, Author
import re
import pickle
import json
import requests

In [2]:
# Gather the author field of every text line. The corpus whose file stem is
# 'tier1' is kept on its own; every other corpus accumulates into tier12.
authors_tier1 = []
authors_tier12 = []
for fp, corpus in zip(fp_lst, corpus_lst()):
    authors = [Text(line).author for line in corpus]
    if fp.stem != 'tier1':
        authors_tier12.extend(authors)
    else:
        authors_tier1 = authors

# Entries equal to '' are treated as "no author recorded" for the coverage figures.
cnt_tier1 = len([name for name in authors_tier1 if name != ''])
cnt_tier12 = len([name for name in authors_tier12 if name != ''])

total_tier1 = len(authors_tier1)
print('---\nCount of tier1:', cnt_tier1, '/', total_tier1)
print('Coverage of tier1:', round(cnt_tier1 / total_tier1, 2))

total_tier12 = len(authors_tier12)
print('---\nCount of tier12:', cnt_tier12, '/', total_tier12)
print('Coverage of tier12:', round(cnt_tier12 / total_tier12, 2))

100%|██████████| 9/9 [00:15<00:00,  1.73s/it]


---
Count of tier1: 30 / 120
Coverage of tier1: 0.25
---
Count of tier12: 3957 / 4109
Coverage of tier12: 0.96


In [3]:
# Unique tier12 author strings in sorted order, saved one per line.
authors_uni = list(set(authors_tier12))
authors_uni.sort()

with open('../data/authors.txt', 'w', encoding='utf-8') as f:
    f.writelines(author + '\n' for author in authors_uni)

In [4]:
# Alternation of the punctuation marks used to split one author field
# into several pieces.
delim = '、|，|；'
print('Regex for delimiters:', delim, '\n---')

# Alternation of every pattern in Author.PAT_ROLES followed by Author.PAT_PREFIX;
# matches of this regex are cut out of author names by clean().
replaced = '|'.join(Author().PAT_ROLES + Author().PAT_PREFIX)
print('Regex for replaced:', replaced, '\n---')

def clean(author, check_mode=False, delim_pattern=None, replaced_pattern=None):
    """Split a raw author field into a list of individual name fragments.

    The field is first split on ``delim_pattern``; inside every piece each
    match of ``replaced_pattern`` is cut out, which may split the piece
    further. Empty fragments are discarded.

    Args:
        author: Raw author string. An empty string yields an empty list.
        check_mode: When true, print ``author -> result`` for non-empty input.
        delim_pattern: Regex used to split the field. Defaults to the
            notebook-level ``delim``.
        replaced_pattern: Regex whose matches are removed. Defaults to the
            notebook-level ``replaced``.

    Returns:
        A list of non-empty name fragments. This is always a list; the empty
        author used to come back as ``''``, which only worked downstream
        because iterating an empty string yields nothing.
    """
    # Resolve the notebook globals lazily so explicit patterns can be passed
    # without those globals having to exist.
    if delim_pattern is None:
        delim_pattern = delim
    if replaced_pattern is None:
        replaced_pattern = replaced

    if author == '':
        return []

    author_cleaned = []
    for piece in re.split(delim_pattern, author):
        # Substitute-then-split instead of re.split(): the pattern may contain
        # capture groups, and re.split() would leak their text into the result.
        fragments = re.sub(replaced_pattern, ';', piece).split(';')
        author_cleaned.extend(fragment for fragment in fragments if fragment != '')

    if check_mode:
        print(author, '->', author_cleaned)
    return author_cleaned

# Clean every unique author string; check_mode prints each mapping for review.
tier12_cleaned = []
for author in authors_uni:
    tier12_cleaned.append(clean(author, check_mode=True))

Regex for delimiters: 、|，|； 
---
Regex for replaced: 輯|撰|(?<!李心|劉銘)傳(?!恒|遜|金)|(?<!畢弘|陳文)述(?!祖)|奉敕譯|註|注|編|著|輯註|箋|章句|疏|原題( *)|舊題( *)|題( *) 
---
Ban Gu -> ['Ban Gu']
Dong Zhong Shu -> ['Dong Zhong Shu']
Du You -> ['Du You']
Huan Kuan -> ['Huan Kuan']
Jia Yi -> ['Jia Yi']
Jiao Yanshou -> ['Jiao Yanshou']
Jing Fang -> ['Jing Fang']
Li Si -> ['Li Si']
Liang Qi Chao -> ['Liang Qi Chao']
Liu Xiang -> ['Liu Xiang']
Liu Yi Qing -> ['Liu Yi Qing']
Lu Bu-wei -> ['Lu Bu-wei']
Lu Sheng -> ['Lu Sheng']
Shen Dao -> ['Shen Dao']
Sima Qian -> ['Sima Qian']
Sun Wu -> ['Sun Wu']
Sun Yi-Rang -> ['Sun Yi-Rang']
Wang Bi -> ['Wang Bi']
Wang Chong -> ['Wang Chong']
Wang Fu -> ['Wang Fu']
Wu Qi -> ['Wu Qi']
Xu Gan -> ['Xu Gan']
Xu Shen -> ['Xu Shen']
Yan Zhi Tui -> ['Yan Zhi Tui']
Yang Xiong -> ['Yang Xiong']
Zhu Xi -> ['Zhu Xi']
[Chu]宏 -> ['[Chu]宏']
[佚名] -> ['[佚名]']
一然 -> ['一然']
丁丙 -> ['丁丙']
丁寶楨 -> ['丁寶楨']
丁度 -> ['丁度']
丁易東 -> ['丁易東']
丁晏 -> ['丁晏']
丁曰健 -> ['丁曰健']
丁杰 -> ['丁杰']
丁特起 -> ['丁特起']
世宗 -> ['世宗']
世宗憲皇帝 ->

In [5]:
# Flatten the per-author results, deduplicate, sort, and save one name per line.
authors_uni_cleaned = sorted({name for names in tier12_cleaned for name in names})

with open('../data/authors_cleaned.txt', 'w', encoding='utf-8') as f:
    for author_cleaned in authors_uni_cleaned:
        f.write(author_cleaned + '\n')

In [6]:
# !pip install wptools
# # !pip install wikipedia
# # !pip install wordcloud

# import json
# import wptools

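# NOTE(review): the notebook writes this list to '../data/authors_cleaned.txt';
# the relative path below differs and the handle is never closed.
# f = open('authors_cleaned.txt')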
# authors = [n.strip() for n in f.readlines()]

# retrieved_data = {}
# for i, author in enumerate(authors):
#   if i >= 10:
#     break
#   try:
#     page = wptools.page(author, lang='zh')
#     page.get_parse()

#     data = page.data
#     infobox, wikitext = None, None
#     if 'infobox' in data.keys():
#       infobox = data['infobox']
#     if 'wikitext' in data.keys():
#       wikitext = data['wikitext']

#     retrieved_data[author] = {
#         'data': data,
#         'infobox': infobox,
#         'wikitext': wikitext
#     }
#   except Exception as e:
#     print(e)

In [7]:
# Load the cached Wikipedia retrieval results.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../data/test_wiki_api.pkl', 'rb') as pkl_file:
    retrieved_data = pickle.load(pkl_file)

def match2time(match):
    """Convert one date match tuple into a signed integer year.

    ``match`` is indexed as (full text, era prefix, digits, unit), e.g.
    ``('前1世紀', '前', '1', '世紀')``.

    A unit of '世紀' or '世纪' (century) maps century N to the year
    ``(N - 1) * 100 + 50``; any other unit keeps the number as is. An era
    prefix equal to '前' makes the result negative.
    """
    value = int(match[2])
    is_century = match[3] in ('世紀', '世纪')
    if is_century:
        value = 100 * (value - 1) + 50
    return -value if match[1] == '前' else value

# Build [author, [year, ...]] records from the retrieved pages. The {{bd|...}}
# template in the wikitext is tried first; the infobox birth/death fields are
# used only when no such template was found.
author_life = []
for author, data in retrieved_data.items():
    match = None
    if data['wikitext'] is not None:
        try:
            # {{bd|1609年|6月21日|1672年|1月23日|catIdx=W吴}}
            # Raw strings: '\{' and '\d' are invalid escapes in normal literals.
            match = re.search(r'\{\{(bd|BD)([^}]+)(\}\})', data['wikitext'])
            if match:
                life = []
                for n in match.group(0).split('|'):
                    life_match = re.findall(r'((前*)(\d{1,4})(年|世紀|世纪))', n)
                    if len(life_match) > 0:
                        # Only the first date-like token of each field is used.
                        life.append(match2time(life_match[0]))
                if len(life) > 2:
                    # More than birth + death: ambiguous, leave for manual review.
                    print('Need to check time:', match.group(0), '->', life)
                elif len(life) > 0:
                    author_life.append([author, life])
        except Exception as e:
            print(e)

    if match is None:
        if data['infobox'] is not None:
            birth_death = [None, None]
            for idx, key in enumerate(['birth_date', 'death_date']):
                if key in data['infobox']:
                    raw_date = data['infobox'][key]
                    # Keep only the digits of a parenthesised "(NNNN年 ...)" part.
                    year_text = re.sub(r'.*(（|\()(\d+)年.*(）|\)).*', r'\2', raw_date)
                    try:
                        birth_death[idx] = int(year_text)
                    except ValueError:
                        # The field did not reduce to a plain number; skip it
                        # instead of aborting the whole loop.
                        print('Cannot parse', key, 'of', author, ':', raw_date)
            life = [year for year in birth_death if year is not None]
            if len(life) > 0:
                author_life.append([author, life])
# Order by the first recorded year of each author.
author_life = sorted(author_life, key=lambda x: x[1][0])

print('Count of retrieved data:', len(retrieved_data))
print('Count of author time info:', len(author_life))
print('Coverage of author time info:', round(len(author_life) / len(retrieved_data), 2))

Count of retrieved data: 1237
Count of author time info: 901
Coverage of author time info: 0.73


In [8]:
# Dump every [author, years] record as its str() form, one per line.
with open('author_mapping_log.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(entry) + '\n' for entry in author_life)

In [9]:
# MediaWiki API query for page info of the "Portal:中国作者" page on zh Wikisource.
URL_CHINESE_AUTHOR = 'https://zh.m.wikisource.org/w/api.php?action=query&prop=info&titles=Portal:中国作者&format=json'
# Without a timeout requests may wait forever on an unresponsive server.
res = requests.get(URL_CHINESE_AUTHOR, timeout=30)
# Fail on an HTTP error status instead of trying to parse an error page as JSON.
res.raise_for_status()
json.loads(res.text)  # index with ['query']['pages'] to reach the page records

{'batchcomplete': '',
 'query': {'pages': {'93251': {'pageid': 93251,
    'ns': 100,
    'title': 'Portal:中国作者',
    'contentmodel': 'wikitext',
    'pagelanguage': 'zh',
    'pagelanguagehtmlcode': 'zh',
    'pagelanguagedir': 'ltr',
    'touched': '2021-08-23T12:53:02Z',
    'lastrevid': 2069813,
    'length': 955}}}}