In [1]:
import json
import pandas as pd
import numpy as np

import sys
from pathlib import Path
if '../src' not in sys.path:
    sys.path.append('../src')

from dietcoke import dynaspan_lst

Mapping author time ...
File read: ../data/author_time/wiki_author_time.json
File read: author_time_map.json
File read: ../notes/post_edit.json
File read: ../notes/post_edit.json


In [2]:
# Load the author profile table, keep only rows that have both a mid-year
# estimate and a dynasty-span label, and discard the stray "Unnamed" index
# columns that come from earlier CSV round-trips.
df = (
    pd.read_csv('../data/author_time/author_profile.csv')
    .dropna(subset=['mid_year', 'dynaspan'])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)
print(df.shape)

(1477, 9)


In [3]:
# Build a {dynasty-span label: (first year, last year)} lookup table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
dynasties = pd.read_csv('/Users/mac/Documents/in_sync_mac/in_sync_documents/GIL/corpus/ctext/meta/dynasties.csv')

# Earliest start year and latest end year within each dynasty group.
dynaspan_df = dynasties.groupby('dynasty_group').agg({'yearfrom': ['min'], 'yearto': ['max']})
span_starts = dynaspan_df[('yearfrom', 'min')]
span_ends = dynaspan_df[('yearto', 'max')]
lookup_yearrange = tuple(zip(span_starts, span_ends))

# Labels are paired with the year ranges purely by position.
# NOTE(review): assumes the (sorted) group order produced by groupby matches
# the order of dynaspan_lst, and that both have the same length - confirm.
lookup = {label: year_range for label, year_range in zip(dynaspan_lst, lookup_yearrange)}
lookup

{'先秦': (-1046, -221),
 '漢': (-221, 220),
 '魏晉南北': (220, 589),
 '唐五代十國': (581, 1125),
 '宋元': (960, 1368),
 '明': (1368, 1644),
 '清': (1644, 1911),
 '民國': (1911, 1949)}

In [4]:
def is_valid_year(birth_year, death_year, dynaspan, year_ranges=None):
    """Check whether a lifespan overlaps the year range of a dynasty span.

    A lifespan is rejected only when it lies entirely outside the span:
    the birth year is after the span's end, or the death year is before
    the span's start. Boundary years count as overlapping. A NaN birth or
    death year never triggers a rejection, because comparisons against
    NaN evaluate to False.

    Args:
        birth_year: Birth year of the author (negative values for BCE).
        death_year: Death year of the author.
        dynaspan: Key identifying the dynasty span in ``year_ranges``.
        year_ranges: Optional mapping of dynasty-span key to a
            ``(year_start, year_end)`` pair. Defaults to the notebook-level
            ``lookup`` table when omitted.

    Returns:
        bool: True if the lifespan overlaps the span, False otherwise.

    Raises:
        KeyError: If ``dynaspan`` is not present in the mapping.
    """
    if year_ranges is None:
        year_ranges = lookup  # notebook-global table built in an earlier cell

    # Index directly rather than using .get(): a missing key would otherwise
    # yield None and fail later with an opaque tuple-unpacking TypeError.
    try:
        year_start, year_end = year_ranges[dynaspan]
    except KeyError:
        raise KeyError(f'unknown dynaspan: {dynaspan!r}') from None

    born_after_span = birth_year > year_end
    died_before_span = death_year < year_start
    return not (born_after_span or died_before_span)

# Attach the first and last year of each author's assigned dynasty span.
df['dynaspan_yearfrom'] = df['dynaspan'].map(lambda span: lookup[span][0])
df['dynaspan_yearto'] = df['dynaspan'].map(lambda span: lookup[span][1])

# Flag every author whose recorded lifespan is compatible with that span,
# then report how many rows are NOT compatible.
df['valid_birth_year'] = [
    is_valid_year(born, died, span)
    for born, died, span in zip(df['birth_year'], df['mort_year'], df['dynaspan'])
]
print(df[~df['valid_birth_year']].shape)

(46, 12)


In [5]:
# Export the rows whose lifespan conflicts with their dynasty span so they
# can be checked by hand.
needs_review = df['valid_birth_year'].eq(False)
df.loc[needs_review].to_csv('author_profile_man.csv', index=False)

In [6]:
# Find names that map to more than one distinct
# (name, birth_year, mort_year, dynaspan) profile, then keep every row of
# the author table that carries one of those names.
distinct_profiles = df[['name', 'birth_year', 'mort_year', 'dynaspan']].drop_duplicates()
profiles_per_name = distinct_profiles.groupby('name')['dynaspan'].count()
ambiguous_names = profiles_per_name[profiles_per_name > 1].index
name_cnt_df = df[df['name'].isin(ambiguous_names)]
print(name_cnt_df.shape)

(74, 12)


In [7]:
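# Disabled: would append the ambiguous-name rows to the manual-review file.
# NOTE(review): `columns=False` is not a column selection that `to_csv`
# documents; `header=False` was probably intended for append mode - confirm
# before re-enabling.
# name_cnt_df.to_csv('author_profile_man.csv', index=False, columns=False, mode='a')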