In [5]:
import sys
import os
import importlib
import json
import pickle
import uuid
import copy
import tqdm
import requests
from urllib.parse import urlparse, unquote
from pathlib import Path

# Find the nearest directory named 'tasks', starting at the working directory
# and moving up through its ancestors, then add it to sys.path.
# NOTE(review): presumably this is what makes project-local packages
# importable from any notebook location — confirm.
curr_path = Path.cwd()
for curr_path in (curr_path, *curr_path.parents):
    if curr_path.name == 'tasks':
        break

# When no ancestor matched, the loop ends on the filesystem root.
if curr_path.name != 'tasks':
    raise FileNotFoundError("Could not find 'tasks' folder in parent hierarchy.")
sys.path.append(str(curr_path))

import numpy as np
import pandas as pd

from pack import helper as h

In [6]:
from datetime import datetime

def convert_date_string(date_str):
    """Convert a 'day month-name year' date string to ISO 'YYYY-MM-DD'.

    Args:
        date_str: Text such as "15 December 1999". Leading and trailing
            whitespace is ignored.

    Returns:
        The date formatted with "%Y-%m-%d", or None when ``date_str`` is not
        a string or does not match the "%d %B %Y" pattern (for example a
        bare year such as "1978").
    """
    if not isinstance(date_str, str):
        return None
    try:
        # strptime needs an exact match, so stray surrounding whitespace
        # would otherwise turn a valid date into None.
        dt = datetime.strptime(date_str.strip(), "%d %B %Y")
        return dt.strftime("%Y-%m-%d")
    except ValueError:
        return None


In [7]:
from src import selenbeau

In [4]:
# Spot check: resolve a known page title to its Wikidata ID
# (the recorded output for this title is 'Q15869').
page_name = 'Freddie Mercury'
# NOTE(review): open_browser=True presumably launches a real browser
# session — confirm in src/selenbeau before running unattended.
selenbeau.get_wiki_id_from_page_title(page_title=page_name, open_browser=True)

'Q15869'

In [8]:
load_path = os.path.join('data', 'outputs', 'walk_wiki', 'china_people_dob.pickle')

In [9]:
# Load the previously scraped records. Later cells treat the result as a
# dict whose values are per-page record dicts with a 'dob' entry.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by a trusted run of this project.
with open(load_path, 'rb') as f:
    china_people_dob = pickle.load(f)

In [10]:
# Remove the 'Gregorian' calendar marker from each date of birth, then store
# an ISO-formatted copy under 'dob_formatted' (None when it cannot be parsed).
for k, v in china_people_dob.items():
    # Records may be empty/None; there is nothing to annotate on those, and
    # indexing them would raise.
    if not v:
        continue
    if v['dob'] and 'Gregorian' in v['dob']:
        # Trim whitespace left behind by the removal; the date parser needs
        # an exact "%d %B %Y" match.
        v['dob'] = v['dob'].replace('Gregorian', '').strip()
    v['dob_formatted'] = convert_date_string(v['dob'])

In [12]:
pd.DataFrame(china_people_dob.values()).head(15)

Unnamed: 0,url,page_name,page_id,dob,dob_formatted
0,https://en.wikipedia.org/wiki/Cao_Fei,Cao Fei,Q1034078,1978,
1,https://en.wikipedia.org/wiki/Chen_Qiang_(artist),Chen Qiang (artist),No matching records found,,
2,https://en.wikipedia.org/wiki/Michael_Lin_(art...,Michael Lin (artist),No matching records found,,
3,https://en.wikipedia.org/wiki/Li_Di,Li Di,Q27527265,15 December 1999,1999-12-15
4,https://en.wikipedia.org/wiki/Zhou_Chengzhou,Zhou Chengzhou,Q105721640,28 December 1982,1982-12-28
5,https://en.wikipedia.org/wiki/Main_Page,Main Page,No matching records found,,
6,https://en.wikipedia.org/wiki/Chen_Shuxia,Chen Shuxia,Q64740279,1963,
7,https://en.wikipedia.org/wiki/Pingju,Pingju,Q834885,,
8,https://en.wikipedia.org/wiki/Qiu_Deshu,Qiu Deshu,Q22247739,1948,
9,https://en.wikipedia.org/wiki/Chen_Shaoxiong,Chen Shaoxiong,Q27983350,1962,


In [13]:
# Keep only the records whose date of birth parsed into a full ISO date;
# empty/None records and records without the key are dropped as well.
# The kept dicts are the same objects stored in china_people_dob, so later
# in-place edits are visible through both containers.
china_people_dob_cleansed = [
    bundle
    for bundle in china_people_dob.values()
    if bundle and bundle.get('dob_formatted') is not None
]

In [14]:
pd.DataFrame(china_people_dob_cleansed).head(15)

Unnamed: 0,url,page_name,page_id,dob,dob_formatted
0,https://en.wikipedia.org/wiki/Li_Di,Li Di,Q27527265,15 December 1999,1999-12-15
1,https://en.wikipedia.org/wiki/Zhou_Chengzhou,Zhou Chengzhou,Q105721640,28 December 1982,1982-12-28
2,https://en.wikipedia.org/wiki/Xin_Fengxia,Xin Fengxia,Q8263391,26 January 1927,1927-01-26
3,https://en.wikipedia.org/wiki/He_Xiangning,He Xiangning,Q8248830,27 June 1878,1878-06-27
4,https://en.wikipedia.org/wiki/Li_Chevalier,Li Chevalier,Q3237543,30 March 1961,1961-03-30
5,https://en.wikipedia.org/wiki/Chow_Chung-cheng,Chow Chung-cheng,Q18019061,20 June 1908,1908-06-20
6,https://en.wikipedia.org/wiki/Chen_Guang,Chen Guang,Q16727970,3 February 1995,1995-02-03
7,https://en.wikipedia.org/wiki/Wang_Jianwei,Wang Jianwei,Q16269579,28 October 1958,1958-10-28
8,https://en.wikipedia.org/wiki/Zimei,Zimei,Q8072113,16 January 1971,1971-01-16
9,https://en.wikipedia.org/wiki/Gao_Minglu,Gao Minglu,Q34224406,29 October 1949,1949-10-29


In [15]:
import requests

def get_wiki_url_from_id(wiki_id, timeout=10):
    """Look up the English Wikipedia URL linked to a Wikidata entity.

    Fetches the entity's JSON from Wikidata's Special:EntityData endpoint
    and reads the ``enwiki`` sitelink from it.

    Args:
        wiki_id: Wikidata entity ID, e.g. 'Q472683'.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The enwiki sitelink URL as a string, or None when ``wiki_id`` is not
        a non-empty string, the request fails or times out, the response
        status is not 200, or the payload has no enwiki sitelink stored
        under ``wiki_id``.
    """
    # Placeholder or missing IDs can never resolve; skip the round trip.
    if not isinstance(wiki_id, str) or not wiki_id.strip():
        return None

    bridge_json_url = f'https://www.wikidata.org/wiki/Special:EntityData/{wiki_id}.json'
    try:
        response = requests.get(bridge_json_url, timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no URL" like every other
        # failure path instead of aborting the caller's loop.
        return None
    if response.status_code != 200:
        return None

    try:
        return response.json()['entities'][wiki_id]['sitelinks']['enwiki']['url']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not valid JSON. KeyError/TypeError: the
        # entities -> sitelinks -> enwiki -> url path is missing.
        return None

In [16]:
### view
get_wiki_url_from_id('Q472683')

'https://en.wikipedia.org/wiki/Nie_Haisheng'

In [17]:
from src import special

In [18]:
importlib.reload(special)

<module 'src.special' from 'C:\\Users\\uthop\\notebooks\\tasks\\250623_WikiLists\\src\\special.py'>

In [19]:
# Enrich every cleansed record in place with three new keys:
# 'url_checked', 'is_above_50s' and 'name_from_url'.
# get_wiki_url_from_id issues one HTTP request per record, so this loop is
# slow (the recorded run took 1m48s for 206 records).
for bundle in tqdm.tqdm(china_people_dob_cleansed):
    dob_formatted = bundle['dob_formatted']
    # if 'url_checked' in bundle: continue
    # NOTE(review): despite its name, the displayed output shows this value
    # is the formatted date string, not a boolean; presumably it is None for
    # births before 1950 — confirm in src/special.
    is_above_50s = special.get_ymd_and_above_1950(dob_formatted)
    page_id = bundle['page_id']
    # Resolve the enwiki URL from the Wikidata ID; None when none was found.
    url = get_wiki_url_from_id(page_id)
    bundle['url_checked'] = url
    bundle['is_above_50s'] = is_above_50s
    # NOTE(review): url_checked can be None here — confirm that
    # special.get_name_from_link tolerates it.
    bundle['name_from_url'] = special.get_name_from_link(bundle['url_checked'])

100%|████████████████████████████████████████████████████████████████████████████████| 206/206 [01:48<00:00,  1.90it/s]


In [20]:
china_people_dob_cleansed_df = pd.DataFrame(china_people_dob_cleansed)

In [21]:
### view
china_people_dob_cleansed_df.tail(15)

Unnamed: 0,url,page_name,page_id,dob,dob_formatted,url_checked,is_above_50s,name_from_url
191,https://en.wikipedia.org/wiki/Wu_Shanzhuan,Wu Shanzhuan,Q8038961,25 October 1960,1960-10-25,https://en.wikipedia.org/wiki/Wu_Shanzhuan,1960-10-25,Wu Shanzhuan
192,https://en.wikipedia.org/wiki/He_Rong,He Rong,Q30950291,21 October 1962,1962-10-21,https://en.wikipedia.org/wiki/He_Rong,1962-10-21,He Rong
193,https://en.wikipedia.org/wiki/Chen_Yixin,Chen Yixin,Q18977835,1 September 1959,1959-09-01,https://en.wikipedia.org/wiki/Chen_Yixin,1959-09-01,Chen Yixin
194,https://en.wikipedia.org/wiki/Zhuang_Rongwen,Zhuang Rongwen,Q55694631,22 February 1961,1961-02-22,https://en.wikipedia.org/wiki/Zhuang_Rongwen,1961-02-22,Zhuang Rongwen
195,https://en.wikipedia.org/wiki/Peng_Liyuan,Peng Liyuan,Q430911,20 November 1962,1962-11-20,https://en.wikipedia.org/wiki/Peng_Liyuan,1962-11-20,Peng Liyuan
196,https://en.wikipedia.org/wiki/Li_Qiang,Li Qiang,Q11097097,1 January 1968,1968-01-01,https://en.wikipedia.org/wiki/Li_Qiang_(screen...,1968-01-01,Li Qiang (screenwriter)
197,https://en.wikipedia.org/wiki/Li_Shulei,Li Shulei,Q15931195,21 January 1964,1964-01-21,https://en.wikipedia.org/wiki/Li_Shulei,1964-01-21,Li Shulei
198,https://en.wikipedia.org/wiki/He_Lifeng,He Lifeng,Q8291839,4 February 1955,1955-02-04,https://en.wikipedia.org/wiki/He_Lifeng,1955-02-04,He Lifeng
199,https://en.wikipedia.org/wiki/Wu_Zhenglong,Wu Zhenglong,Q10919528,6 November 1964,1964-11-06,https://en.wikipedia.org/wiki/Wu_Zhenglong,1964-11-06,Wu Zhenglong
200,https://en.wikipedia.org/wiki/Li_Ganjie,Li Ganjie,Q16076530,11 November 1964,1964-11-11,https://en.wikipedia.org/wiki/Li_Ganjie,1964-11-11,Li Ganjie


In [26]:
save_path = os.path.join('data', 'outputs', 'walk_wiki', 'china_people_dob_cleansed.pickle')

In [27]:
# Persist the enriched list of record dicts to save_path with pickle.
# The file is opened in binary write mode, so an existing file is overwritten.
with open(save_path, 'wb') as f:
    pickle.dump(china_people_dob_cleansed, f)

In [28]:
# Build the export rows: for every row with a non-None 'is_above_50s' value,
# keep what the helper returns for the three export columns.
keys = ['name_from_url', 'is_above_50s', 'url_checked']
all_data = []
for _, row_ori in china_people_dob_cleansed_df.iterrows():
    row = dict(row_ori)
    if row['is_above_50s'] is None:
        continue
    # NOTE(review): element [1] of the helper's result is presumably the
    # dict restricted to `keys`, in that order — confirm in pack.helper.
    data = h.split_dict_with_keys(row, keys, follow_my_keys_order=True)[1]
    all_data.append(data)

In [29]:
# Export the selected rows to an Excel workbook.
# NOTE(review): to_excel writes the DataFrame index as the first column by
# default; pass index=False if that column is not wanted in china.xlsx.
_path = os.path.join('data', 'outputs', 'china.xlsx')
pd.DataFrame(all_data).to_excel(_path)