In [1]:
import sys, os
from pathlib import Path
if "../src" not in sys.path:
    sys.path.append("../src")

import re
import pandas as pd
import json
import pickle
import itertools
from itertools import chain
from collections import defaultdict
import requests
from dietcoke import dynaspan_lst, corpus_lst, Text, Author, match2time

In [2]:
# Matches an anonymous author field: either an empty string, or "佚名"
# optionally wrapped in square brackets. Written as a raw string so that
# `\[` / `\]` reach the regex engine verbatim instead of being (invalid)
# string escape sequences.
PAT_ANONYM = r'^$|\[*佚名\]*'


def _count_named(authors):
    """Return how many entries of `authors` do NOT match PAT_ANONYM."""
    return sum(1 for name in authors if not re.match(PAT_ANONYM, name))


def _print_coverage(label, named, total):
    """Print the named-author count and its share of `total` for one tier."""
    print(f'---\nCount of {label}:', named, '/', total)
    if total == 0:
        # An empty tier has no meaningful ratio; avoid ZeroDivisionError.
        print(f'Coverage of {label}:', 'n/a (no texts)')
    else:
        print(f'Coverage of {label}:', round(named / total, 2))


# Author field of every text: the 'tier1' corpus is kept on its own, all
# other corpora returned by corpus_lst are pooled into authors_tier12.
authors_tier1 = []
authors_tier12 = []
for corpus in corpus_lst(dynaspan_lst + ['tier1']):
    corpus.read_corpus()
    authors = [Text(line).author for line in corpus.corpus]
    if corpus.dynaspan == 'tier1':
        authors_tier1 = authors
    else:
        authors_tier12 += authors

## print
cnt_tier1 = _count_named(authors_tier1)
cnt_tier12 = _count_named(authors_tier12)

_print_coverage('tier1', cnt_tier1, len(authors_tier1))
_print_coverage('tier12', cnt_tier12, len(authors_tier12))

100%|██████████| 9/9 [00:00<00:00, 2428.51it/s]


---
Count of tier1: 30 / 120
Coverage of tier1: 0.25
---
Count of tier12: 3938 / 4109
Coverage of tier12: 0.96


In [3]:
# Clean every distinct raw author string, then flatten the per-author results
# into one sorted list. (Author.name_clean is iterated with
# chain.from_iterable, so it is presumably a list of name strings -- confirm
# against dietcoke.Author.)
authors_uni = [Author(n).name_clean for n in set(authors_tier12)]
authors_uni = sorted(chain.from_iterable(authors_uni))

# Write one cleaned name per line. The context manager guarantees the file is
# flushed and closed even if a write fails; the original left the handle open.
with open('../data/names_clean.txt', 'w', encoding='utf-8') as f:
    for author in authors_uni:
        f.write(author + '\n')

In [4]:
# !pip install wptools
# # !pip install wikipedia
# # !pip install wordcloud

# import json
# import wptools

# f = open('../data/names_clean.txt')
# authors = [n.strip() for n in f.readlines()]

# retrieved_data = {}
# for i, author in enumerate(authors):
#   if i >= 10:
#     break
#   try:
#     page = wptools.page(author, lang='zh')
#     page.get_parse()

#     data = page.data
#     infobox, wikitext = None, None
#     if 'infobox' in data.keys():
#       infobox = data['infobox']
#     if 'wikitext' in data.keys():
#       wikitext = data['wikitext']

#     retrieved_data[author] = {
#         'data': data,
#         'infobox': infobox,
#         'wikitext': wikitext
#     }
#   except Exception as e:
#     print(e)

In [5]:
# Example of the wikitext birth/death template parsed below:
# {{bd|1609年|6月21日|1672年|1月23日|catIdx=W吴}}
# The patterns are raw strings so that `\{`, `\d`, `\(` ... are passed to the
# regex engine verbatim instead of being invalid string escape sequences.
PAT_LIFE = r'\{\{' + r'(bd|BD)([^}]+)' + r'(\}\})'
PAT_TIMEPOINT = r'((前*)(\d{1,4})(年|世紀|世纪))'
PAT_INFOBOX_YEAR = r'.*(（|\()(\d+)年.*(）|\)).*'


def _infobox_year(raw_value):
    """Return the year found in an infobox date field as an int, or None.

    The field is reduced to the digits captured by PAT_INFOBOX_YEAR (a value
    that is already a bare integer string passes through unchanged). Values
    that still cannot be converted are reported and skipped rather than
    aborting the whole cell.
    """
    try:
        return int(re.sub(PAT_INFOBOX_YEAR, r'\2', raw_value))
    except (TypeError, ValueError):
        print('Need to check infobox date:', repr(raw_value))
        return None


# NOTE: pickle.load can execute arbitrary code -- only load a file that was
# produced locally / comes from a trusted source.
with open('../data/test_wiki_api.pkl', 'rb') as f:
    retrieved_data = pickle.load(f)

author_life = []
for author, data in retrieved_data.items():
    match = None
    result = None

    # 1) Preferred source: the {{bd|...}} template inside the page wikitext.
    if data['wikitext'] is not None:
        try:
            match = re.search(PAT_LIFE, data['wikitext'])
            if match:
                life = []
                for n in match.group(0).split('|'):
                    life_timepoints = re.findall(PAT_TIMEPOINT, n)
                    if len(life_timepoints) > 0:
                        # match2time is a project helper; it receives the
                        # first PAT_TIMEPOINT match tuple of this field.
                        life.append(match2time(life_timepoints[0]))
                if len(life) > 2:
                    # More than birth + death: ambiguous, leave for manual review.
                    print('Need to check time:', match.group(0), '->', life)
                elif len(life) > 0:
                    # Recorded once, by the shared append at the end of the
                    # loop body (the original appended here as well and
                    # relied on the later de-duplication).
                    result = life
        except Exception as e:
            # Best-effort parsing: report the problem and move on.
            print(e)

    # 2) Fallback: infobox birth/death fields. Only used when no {{bd}}
    #    template was found at all (a template that yielded no usable
    #    timepoints does NOT trigger the fallback -- kept as in the original).
    if match is None:
        if data['infobox'] is not None:
            life_year = [None, None]
            for idx, key in enumerate(['birth_date', 'death_date']):
                if key in data['infobox']:
                    life_year[idx] = _infobox_year(data['infobox'][key])
            life = [n for n in life_year if n is not None]
            if len(life) > 0:
                result = life

    if result is not None:
        author_life.append([Author(author).name_norm[0], result])

# Drop exact duplicates (e.g. several raw names normalising to the same
# [name, life] pair), then order by the first recorded timepoint.
author_life = sorted(author_life)
author_life = list(k for k, _ in itertools.groupby(author_life))
author_life = sorted(author_life, key=lambda x: x[1][0])

print('Count of retrieved data:', len(retrieved_data))
print('Count of author time info:', len(author_life))

Count of retrieved data: 1237
Count of author time info: 883


In [6]:
# Persist the author -> life-timepoints mapping, then read it straight back
# so the following cells work from the on-disk version.
with open('../data/mapping_author_time.pkl', 'wb') as sink:
    sink.write(pickle.dumps(author_life))

with open('../data/mapping_author_time.pkl', 'rb') as source:
    author_life = pickle.loads(source.read())

In [7]:
# Bucket authors by the string form of their life timepoints, so authors with
# exactly the same recorded timepoints land in the same bucket.
group_authors = defaultdict(list)
for name, lifespan in author_life:
    bucket_key = str(lifespan)
    group_authors[bucket_key].append(name)

# Show only the buckets shared by two or more authors.
for bucket_key, names in group_authors.items():
    if len(names) < 2:
        continue
    print(bucket_key, names)

[150, 219] ['宋衷', '張機']
[1007, 1072] ['契嵩', '歐陽脩']
[1079, 1154] ['汪藻', '許叔微']
[1420, 1474] ['岳正', '葉盛']
[1472, 1529] ['李夢陽', '王守仁']
[1512, 1565] ['胡宗憲', '韓叔陽']
[1558, 1639] ['郝敬', '陳繼儒']
[1755, 1837] ['石韞玉', '范照藜']
[1799, 1873] ['何紹基', '王柏心']
[1824, 1890] ['曾國荃', '黃彭年']
[1838, 1894] ['薛福成', '陸心源']
[1842, 1906] ['姚振宗', '王之春']


In [8]:
# One row per distinct life-timepoint key with the authors sharing it.
df = pd.DataFrame(list(group_authors.items()))
df.columns = ('life', 'author_lst')

# 'life' holds the str() of a list, which is parsed back with json.loads to
# take its last timepoint. Rows are then ordered by that timepoint, and each
# row gets the previous row's value ('lag') and the gap to it ('tp_diff').
df = (
    df
    .assign(latest_tp=lambda frame: [json.loads(key)[-1] for key in frame['life']])
    .sort_values('latest_tp')
    .assign(lag=lambda frame: frame['latest_tp'].shift(1))
    .assign(tp_diff=lambda frame: frame['latest_tp'] - frame['lag'])
)
df

Unnamed: 0,life,author_lst,latest_tp,lag,tp_diff
0,"[-725, -645]",[管仲],-645,,
1,"[-468, -376]",[墨翟],-376,-645.0,269.0
2,"[-450, -375]",[列禦寇],-375,-376.0,1.0
5,[-235],[呂不韋],-235,-375.0,140.0
3,"[-281, -233]",[韓非],-233,-235.0,2.0
...,...,...,...,...,...
868,[1973],[陳煒],1973,1969.0,4.0
869,[1986],[許嵩],1986,1973.0,13.0
870,[1988],[張昱],1988,1986.0,2.0
861,"[1896, 1991]",[王德溥],1991,1988.0,3.0


In [9]:
# Summary statistics of df's numeric columns (latest_tp, lag, tp_diff).
df.describe()

Unnamed: 0,latest_tp,lag,tp_diff
count,871.0,870.0,870.0
mean,1430.445465,1429.771264,3.05977
std,483.162105,483.029907,11.86949
min,-645.0,-645.0,0.0
25%,1139.5,1138.75,0.0
50%,1628.0,1628.0,1.0
75%,1801.0,1800.5,2.0
max,2017.0,1991.0,269.0


In [10]:
# URL_CHINESE_AUTHOR = 'https://zh.m.wikisource.org/w/api.php?action=query&prop=info&titles=Portal:中国作者&format=json'
# res = requests.get(URL_CHINESE_AUTHOR)
# json.loads(res.text)#['query']['pages']