In [2]:
!pip install everypolitician



In [3]:
from everypolitician import EveryPolitician
import json
from bs4 import BeautifulSoup
import requests
ep = EveryPolitician()


try:
    with open("countries.json", "r") as file:
        countryJSON = json.load(file)
        print(countryJSON)
except json.JSONDecodeError as e:
    print(f"Failed to decode JSON: {e}")
except Exception as e:
    print(f"An error occurred: {e}")




[{'key': '', 'doc_count': 632}, {'key': 'Australia', 'doc_count': 145}, {'key': 'Indonesia', 'doc_count': 95}, {'key': 'China', 'doc_count': 94}, {'key': 'Malaysia', 'doc_count': 74}, {'key': 'India', 'doc_count': 71}, {'key': 'Vietnam', 'doc_count': 42}, {'key': 'Singapore', 'doc_count': 31}, {'key': 'Japan', 'doc_count': 28}, {'key': 'Qatar', 'doc_count': 24}]


In [4]:
countryList = [country['key'] for country in countryJSON]
countryList = countryList[1:] #remove the first element because its empty
print(countryList)




['Australia', 'Indonesia', 'China', 'Malaysia', 'India', 'Vietnam', 'Singapore', 'Japan', 'Qatar']


In [5]:
# get the list of politicians for each country
politicians_by_country = {}

for country in countryList:
    try:
        country_legislature = ep.country(country).lower_house()
        politicians = country_legislature.popolo().persons
        politicians_by_country[country] = [politician.name for politician in politicians]
    except Exception as e:
        print(f"Failed to retrieve data for {country}: {e}")

Failed to retrieve data for Qatar: Couldn't find the country with slug 'Qatar'


In [6]:
def search_wikidata(search_term, language="id"):
    base = "https://wikidata.org/w/api.php"
    search_base_params = {
        "action": "wbsearchentities",  # Searches for entities using labels and aliases
        "format": "json"
    }

    search_res = requests.get(base, params={**search_base_params, **{
        "search": search_term,
        "language": language
    }})
    parsed_res = search_res.json()['search'][0]
    return parsed_res['id']

# Example usage
search_wikidata("Drs._H._ANDI_NAWIR,_MP")

'Q97590771'

In [35]:
# Load or initialize the cache
try:
    with open('label_cache.json', 'r') as cache_file:
        label_cache = json.load(cache_file)
except FileNotFoundError:
    label_cache = {}

def get_label(qid):
    # Check if the label is already in the cache
    if qid in label_cache:
        return label_cache[qid]

    # If not in cache, fetch from Wikidata
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "format": "json",
        "props": "labels",
        "languages": "en"  # Assuming you want the label in English
    }
    response = requests.get(url, params=params)
    data = response.json()
    label = data['entities'][qid]['labels']['en']['value']

    # Update the cache with the new label
    label_cache[qid] = label
    with open('label_cache.json', 'w') as cache_file:
        json.dump(label_cache, cache_file)

    return label

# Example usage
label = get_label("Q408")
print(label)  # This will print the label for Q408

Australia


In [43]:
def get_wikidata(id, language="en"):
    base = "https://wikidata.org/w/api.php"
    get_base_params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": id,
        "languages": language
    }

    get_res = requests.get(base, params={**get_base_params})
    parsed_res = get_res.json()['entities'][id]
    
    def get_nested(data, keys, default='Unknown'):
        try:
            for key in keys:
                if isinstance(data, dict) and len(data) > 0:
                    data = data.get(key, {})
                    print(keys, " is dict",str(data)[:50], '...')
                elif isinstance(data, list) and len(data) > 0:  # Check if data is a non-empty list
                    data = data[0]
                    print(keys, " is list",str(data)[:50], '...')
                elif (isinstance(data, str) or isinstance(data, int)) and (len(data) > 0):
                    print(keys, " is str or int",str(data)[:50], '...')
                    return data
                else:
                    print(keys, 'is', type(data))
                    return default
            return data
        except Exception as e:
            print(f"Error navigating JSON: {e}")
            return default
    print(json.dumps(parsed_res, indent=4))
        
    properties = {
        'P27': 'country of citizenship',
        'P140': 'religion or worldview',
        'P21': 'gender',
        'P19': 'place of birth',
        'P131': 'nationality',
        'P1971': 'number of children',
        'P551': 'residence',
        'P102': 'Political party',
        'P18': 'image',
        'P106': 'occupation',
        'P39': 'position held',
        'P69': 'educated at',
        'P39': 'position held'
    }
    
    data = {
        prop: get_nested(parsed_res, ['claims', prop, 0, 'mainsnak', 'datavalue', 'value', 'id','time'], "Unknown " + properties[prop])
        for prop in properties.keys()
    }
    data['P569'] = get_nested(parsed_res, ['claims', 'P569', 0, 'mainsnak', 'datavalue', 'value', 'time'], 'Unknown date of birth')
    
    # print('so we got data', data)
    positions_held = []
    for position in parsed_res.get('claims', {}).get('P39', []):
        position_data = {
            'position': get_label(get_nested(position, ['mainsnak', 'datavalue', 'value', 'id'], 'Unknown position')),
            'start_date': get_nested(position, ['qualifiers', 'P580', 0, 'datavalue', 'value', 'time'], 'Unknown start date'),
            'end_date': get_nested(position, ['qualifiers', 'P582', 0, 'datavalue', 'value', 'time'], 'Unknown end date'),
            'replaces': get_label(get_nested(position, ['qualifiers', 'P1365', 0, 'datavalue', 'value', 'id'], 'Unknown replaces')),
            'replaced_by': get_label(get_nested(position, ['qualifiers', 'P1366', 0, 'datavalue', 'value', 'id'], 'Unknown replaced by')),
        }
        positions_held.append(position_data)
    
    aliases = [alias.get('value') for alias in parsed_res.get('aliases', {}).get(language, []) if isinstance(alias, dict)]
    description = parsed_res.get('descriptions', {}).get(language, {}).get('value', 'No description available')
    for k, v in data.items():
        if str(v).startswith('Q') and str(v)[1:].isdigit():
            # print('for', k, 'we got qid', v)
            # print('now we get label', get_label(v))
            data[k] = get_label(v)

    return {**{properties[prop]: data[prop] for prop in properties}, "aliases": aliases, "description": description, "positions_held": positions_held}
    
get_wikidata(search_wikidata("Lee Hsien Loong"))


{
    "pageid": 60463,
    "ns": 0,
    "title": "Q57643",
    "lastrevid": 2135118030,
    "modified": "2024-04-22T22:47:23Z",
    "type": "item",
    "id": "Q57643",
    "labels": {
        "en": {
            "language": "en",
            "value": "Lee Hsien Loong"
        }
    },
    "descriptions": {
        "en": {
            "language": "en",
            "value": "3rd Prime Minister of Singapore since 2004"
        }
    },
    "aliases": {
        "en": [
            {
                "language": "en",
                "value": "PM Lee"
            },
            {
                "language": "en",
                "value": "LHL"
            }
        ]
    },
    "claims": {
        "P21": [
            {
                "mainsnak": {
                    "snaktype": "value",
                    "property": "P21",
                    "hash": "85ad4b1c7348f7a5aac521135040d74e91fb5939",
                    "datavalue": {
                        "value": {
                        

KeyError: 'entities'

In [39]:
"""
type get_wikidata = {
 'country of citizenship': string,
 'religion or worldview': string | {},
 'gender': string,
 'place of birth': string,
 'nationality': string | {},
 'number of children': number | {},
 'residence': string | {},
 'Political party': string | {},
 'image': string,
 'occupation': string,
 'position held': string,
 'educated at': string,
 'aliases': string[],
 'description': string,
 'positions_held': [{'position': string,
   'start_date': string,
   'end_date': string,
   'replaces': string,
   'replaced_by': string},
  {'position': string,
   'start_date': string,
   'end_date': string,
   'replaces': string,
   'replaced_by': string}]}
"""



"\ntype get_wikidata = {'country of citizenship': string,\n 'religion or worldview': string | {},\n 'gender': string,\n 'place of birth': string,\n 'nationality': string | {},\n 'number of children': number | {},\n 'residence': string | {},\n 'Political party': string | {},\n 'image': string,\n 'occupation': string,\n 'position held': string,\n 'educated at': string,\n 'aliases': string[],\n 'description': string,\n 'positions_held': [{'position': string,\n   'start_date': string,\n   'end_date': string,\n   'replaces': string,\n   'replaced_by': string},\n  {'position': string,\n   'start_date': string,\n   'end_date': string,\n   'replaces': string,\n   'replaced_by': string}]}\n"

In [54]:
# test function to scrape Australia
def scrape_australia():
    australian_politicians = politicians_by_country.get('Australia', [])[:50]  # Get the first 10 Australian politicians
    politician_details = []

    for politician in australian_politicians:
        try:
            # Construct the URL for each politician's Wikipedia page
            url = f'https://en.wikipedia.org/wiki/{politician.replace(" ", "_")}'  # Replace spaces with underscores for URL
            response = requests.get(url)
            response.raise_for_status()  # Raises an HTTPError for bad responses

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract specific details based on the provided CSS selectors
            details = {}

            # Extract the link to the politician's Wikipedia page
            details['Wikipedia Link'] = url

            # Extract the image link
            image_link = soup.select_one('#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(2) > td > span > a')
            if image_link and image_link.has_attr('href'):
                details['Image Link'] = image_link['href']

            for sup in soup.find_all('sup'):
                sup.decompose()
            description = []
            for p in soup.select('#mw-content-text > div.mw-content-ltr.mw-parser-output > table.infobox.vcard ~ p'):
                description.append(p.text.strip())
            print(description)
            print(f"Length of description for {politician}: {len(' '.join(description))} characters")
            details['Description'] = ' '.join(description)
            politician_details.append({politician: details})
        except requests.RequestException as e:
            print(f"HTTP Request failed for {politician}: {e}")
        except Exception as e:
            print(f"An error occurred while scraping {politician}: {e}")

    return politician_details

# Call the test function to scrape details of Australian politicians
australian_politicians_details = scrape_australia()
import pandas as pd
from IPython.display import display

# Convert the list of Australian politician details into a DataFrame
df_australia = pd.DataFrame([details for sublist in australian_politicians_details for details in sublist.values()])
display(df_australia)



['De-Anne Margaret Kelly (née Park; born 21 March 1954) is an Australian former politician.', 'Kelly was a National Party member of the Australian House of Representatives from March 1996 until November 2007, representing the Division of Dawson, Queensland.', 'She was the first female member of the National Party to win a seat in the House of Representatives.', "Kelly was born in Rockhampton, Queensland to parents Ian Park and Margaret Park (née Bauman). Kelly's uncle was Kerrod Park, who was known for his involvement with local government in Queensland, having served as the chairman of Duaringa Shire Council from 1973 to 1994.", 'Kelly grew up on a cattle property where she obtained her primary education from her mother who taught her at the kitchen table. She then attended high school in Rockhampton and after winning a scholarship to study electrical engineering, Kelly attended the University of Queensland.', 'After graduating, Kelly worked as an engineer before she bought a small ma

Unnamed: 0,Wikipedia Link,Image Link,Description
0,https://en.wikipedia.org/wiki/De-Anne_Kelly,/wiki/File:De-Anne_Kelly_2006.jpg,De-Anne Margaret Kelly (née Park; born 21 Marc...
1,https://en.wikipedia.org/wiki/Percival_Millar,,Percival Clarence Millar AM (né Turbill; 15 Ju...
2,https://en.wikipedia.org/wiki/John_Alexander,,
3,https://en.wikipedia.org/wiki/Michael_Ronaldson,/wiki/File:Michael_Ronaldson.jpg,Michael John Clyde Ronaldson (born 13 February...
4,https://en.wikipedia.org/wiki/Peter_Andren,,Peter James Andren AM (28 August 1946 – 3 Nove...
5,https://en.wikipedia.org/wiki/David_Simmons,,
6,https://en.wikipedia.org/wiki/Warwick_Smith,,
7,https://en.wikipedia.org/wiki/Alasdair_Webster,,Alasdair Paine Webster OAM (born 12 February 1...
8,https://en.wikipedia.org/wiki/Jim_Turnour,,James Pearce Turnour (born 7 April 1966) is a ...
9,https://en.wikipedia.org/wiki/Kym_Richardson,,Kym Charles Richardson (born 16 March 1958) is...


In [None]:

def scrape_china():
    china_politicians = politicians_by_country.get('China', [])[:22]
    politician_details = []

    for politician in china_politicians:
        try:
            # Construct the URL for each politician's Wikipedia page
            url = f'https://zh.wikipedia.org/wiki/{politician}'  # Replace spaces with underscores for URL
            print(url)
            response = requests.get(url)
            response.raise_for_status()  # Raises an HTTPError for bad responses

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract specific details based on the provided CSS selectors
            details = {}

            # Extract the link to the politician's Wikipedia page
            details['Wikipedia Link'] = url

            # Extract the image link
            image_link = soup.select_one('#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(2) > td > span > a')
            if image_link and image_link.has_attr('href'):
                details['Image Link'] = image_link['href']

            for sup in soup.find_all('sup'):
                sup.decompose()
            description = []
            for p in soup.select('#mw-content-text > div.mw-content-ltr.mw-parser-output'):
                description.append(p.text.strip())
            print(description)
            print(f"Length of description for {politician}: {len(' '.join(description))} characters")
            details['Description'] = ' '.join(description)
            politician_details.append({politician: details})
        except requests.RequestException as e:
            print(f"HTTP Request failed for {politician}: {e}")
        except Exception as e:
            print(f"An error occurred while scraping {politician}: {e}")

    return politician_details

china_politicians_details = scrape_china()
import pandas as pd
from IPython.display import display

df_china = pd.DataFrame([details for sublist in china_politicians_details for details in sublist.values()])
display(df_china)



https://zh.wikipedia.org/wiki/王树芬
['王树芬（1962年3月—），云南香格里拉人，藏族，中华人民共和国政治人物、第十二届全国人民代表大会云南地区代表。\n毕业于中央党校研究生院经济管理专业。1984年11月加入中国共产党。担任云南省民政厅厅长。2008年起担任全国人大代表。2013年，担任全国人大代表。2018年2月24日，当选为第十三届全国人大代表。2018年10月25日，中华全国总工会第十七届执行委员会召开第一次全体会议，选举全总十七届执委会主席、副主席和主席团委员，他当选为全总主席团委员。\n\n參考文獻[编辑]\n\n\n^ 全国人大代表信息-王树芬. 全国人大网.   [2013-04-05].\xa0\n\n^ 全国人大代表信息-王树芬. 全国人大网.   [2013-04-05].\xa0\n\n^ （受权发布）中华人民共和国第十三届全国人民代表大会代表名单-中新网. 中新网.   [2021-02-13]. （原始内容存档于2018-02-27）.\xa0\n\n^ 李建国当选为全国总工会主席. 环球网.   [2021-05-26]. （原始内容存档于2021-09-17）.\xa0\n\n^ 中华全国总工会第十七届执委会举行第一次全体会议  王东明当选为全国总工会主席. 中国人大网.   [2021-05-26]. （原始内容存档于2021-09-17）.\xa0\n\n^ 中华全国总工会第十七届执行委员会主席、副主席、主席团委员名单. 人民网.   [2021-05-26]. （原始内容存档于2020-01-28）.\xa0\n\n^ 中华全国总工会第十七届执委会举行第一次全体会议 王东明当选为全国总工会主席. 新华网.   [2021-05-26]. （原始内容存档于2020-08-10）.\xa0\n\n\n外部链接[编辑]\n王树芬简历（页面存档备份，存于互联网档案馆） 中国经济网 2014-07-31\n查论编云南省现任省部级官员 中国共产党云南省委第十一届书记★王宁 (中央委员)2\xa0副书记☆\n王予波 (中央委员)\n石玉钢 (中央候补委员、专职副书记)\n11\xa0常委☆\n杨宁 (省委统战部长、省委教育工委书记)\n冯志礼 (中央纪委委员)\n刘洪建 (中央候补委员、昆明市委书记)\n杨亚

Unnamed: 0,Wikipedia Link,Description,Image Link
0,https://zh.wikipedia.org/wiki/王树芬,王树芬（1962年3月—），云南香格里拉人，藏族，中华人民共和国政治人物、第十二届全国人民代...,
1,https://zh.wikipedia.org/wiki/刘蓉华,刘蓉华（1959年—），山西霍州人，汉族，中华人民共和国政治人物、第十二届全国人民代表大会山...,
2,https://zh.wikipedia.org/wiki/米合伦沙·阿不都,米合伦沙·阿不都（1985年2月—），新疆伊宁人。女，维吾尔族。中国人民解放军人物。\n\n...,
3,https://zh.wikipedia.org/wiki/苏力坦·加依纳克,苏力坦·加依纳克（1979年—），新疆乌恰人，柯尔克孜族，中华人民共和国政治人物、第十二届全...,
4,https://zh.wikipedia.org/wiki/向晓梅,向晓梅（1965年—），四川安岳人，汉族，中华人民共和国政治人物、第十二届全国人民代表大会广...,
5,https://zh.wikipedia.org/wiki/江香梅,江香梅（1962年—），江西南昌人，汉族，中华人民共和国政治人物、第十二届全国人民代表大会江...,
6,https://zh.wikipedia.org/wiki/韦丽萍,韦丽萍（1968年—），广西柳州人，壮族，中华人民共和国政治人物、第十二届全国人民代表大会广...,
7,https://zh.wikipedia.org/wiki/杨剑波,杨剑波（1957年—），安徽太和人，汉族，中华人民共和国政治人物、第十二届全国人民代表大会安...,
8,https://zh.wikipedia.org/wiki/何建洋,何建洋（1959年—），江西萍乡人，汉族，中华人民共和国政治人物、第十二届全国人民代表大会江...,
9,https://zh.wikipedia.org/wiki/袁玉珠,袁玉珠（1953年—），山西中阳人。汉族。太原理工大学经济管理专业毕业。\n曾任山西中阳钢铁...,


In [None]:
def scrape_indonesia():
    indonesia_politicians = politicians_by_country.get('Indonesia', [])[:5]
    politician_details = []


    for politician in indonesia_politicians:
        try:
            # Replace spaces with underscores for URL
            politician = politician.replace(' ', '_')
            url = f'https://id.wikipedia.org/wiki/{politician}'
            print(url)
            response = requests.get(url)
            response.raise_for_status()  # Raises an HTTPError for bad responses

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract specific details based on the provided CSS selectors
            details = {}

            redirect = soup.select_one('#wdsearch_container > table > tbody > tr > th > a.wd_title')
            if redirect and redirect.has_attr('href'):
                url =  redirect['href']
                print(f"redirect found for {politician}: {url}")
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
            else:
                print(f"no redirect found for {politician}: {redirect}")
                
            # Extract the link to the politician's Wikipedia page
            details['Wikipedia Link'] = url



            # Extract the image link
            image_link = soup.select_one('#mw-content-text > div.mw-parser-output > table.infobox.vcard > tbody > tr:nth-child(2) > td > a > img')
            if image_link and image_link.has_attr('src'):
                details['Image Link'] = 'https:' + image_link['src']

            # Remove references from the text
            for sup in soup.find_all('sup'):
                sup.decompose()

            # Extract the description
            description = []
            for p in soup.select('#mw-content-text > div.mw-parser-output > p'):
                description.append(p.text.strip())
            details['Description'] = ' '.join(description)

            # Append the details dictionary to the list with the politician's name as the key
            politician_details.append({politician: details})

        except requests.RequestException as e:
            print(f"HTTP Request failed for {politician}: {e}")
        except Exception as e:
            print(f"An error occurred while scraping {politician}: {e}")

    return politician_details

indonesia_politicians_details = scrape_indonesia()
import pandas as pd
from IPython.display import display

df_indonesia = pd.DataFrame([details for sublist in indonesia_politicians_details for details in sublist.values()])
display(df_indonesia)

# https://www.wikidata.org/w/api.php?action=query&generator=search&format=json&prop=extracts&gsrsearch=Drs._H._ANDI_NAWIR,_MP


https://id.wikipedia.org/wiki/Drs._I_MADE_URIP,_M.Si
HTTP Request failed for Drs._I_MADE_URIP,_M.Si: 404 Client Error: Not Found for url: https://id.wikipedia.org/wiki/Drs._I_MADE_URIP,_M.Si
https://id.wikipedia.org/wiki/ABDUL_MALIK_HARAMAIN,_M.Si.
HTTP Request failed for ABDUL_MALIK_HARAMAIN,_M.Si.: 404 Client Error: Not Found for url: https://id.wikipedia.org/wiki/ABDUL_MALIK_HARAMAIN,_M.Si.
https://id.wikipedia.org/wiki/Drs._H._ANDI_NAWIR,_MP
HTTP Request failed for Drs._H._ANDI_NAWIR,_MP: 404 Client Error: Not Found for url: https://id.wikipedia.org/wiki/Drs._H._ANDI_NAWIR,_MP
https://id.wikipedia.org/wiki/Ir._H._BAMBANG_HARYO_SOEKARTONO
HTTP Request failed for Ir._H._BAMBANG_HARYO_SOEKARTONO: 404 Client Error: Not Found for url: https://id.wikipedia.org/wiki/Ir._H._BAMBANG_HARYO_SOEKARTONO
https://id.wikipedia.org/wiki/drh._SLAMET
HTTP Request failed for drh._SLAMET: 404 Client Error: Not Found for url: https://id.wikipedia.org/wiki/Drh._SLAMET
