In [2]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
from time import sleep
import json
import sys
from urllib.parse import quote as qt
import os
import dill
from ctypes import c_char

In [5]:
def get_name_by_year_birth(year_list, gender, timeout=90):
    """Fetch people of one gender from Wikidata, one SPARQL query per birth year.

    Each query selects items that are instances of Q5 (``wdt:P31``), have the
    requested ``wdt:P21`` value, a ``wdt:P27`` value (bound to ``?nation``), a
    ``wdt:P569`` date inside the calendar year, and an article on
    https://en.wikipedia.org/.

    Parameters
    ----------
    year_list : iterable of int
        Birth years to query. Each year must support ``year + 1``.
    gender : str
        ``'f'`` selects Q6581072 (female); any other value selects Q6581097
        (male). The value is copied verbatim into the ``Gender`` column.
    timeout : float, optional
        Seconds to wait for each HTTP response before the year is recorded as
        failed. Defaults to 90.

    Returns
    -------
    year_data : list of pandas.DataFrame
        One frame per successfully fetched year with columns ``Wiki_ID``,
        ``Name``, ``Gender``, ``Year_birth``, ``Country_birth``, ``Page_title``.
    df_error : pandas.DataFrame
        Columns ``error_year`` and ``error_type`` (the exception class) for
        every year whose request or parsing failed.
    df_status_code : pandas.DataFrame
        Columns ``code_year`` and ``status_code`` for every year that received
        an HTTP response.
    """
    query_url = 'https://query.wikidata.org/sparql'
    # Wikidata item for the requested gender; anything other than 'f' means male.
    gender_wd = 'Q6581072' if gender == 'f' else 'Q6581097'
    columns = ['Wiki_ID', 'Name', 'Gender', 'Year_birth', 'Country_birth', 'Page_title']

    year_data = []
    error_year = []
    error_type = []
    status_code = []
    year_code = []
    for year in year_list:
        # NOTE(review): P27 is Wikidata's "country of citizenship" property,
        # yet its label ends up in the Country_birth column -- confirm intent.
        query = """
            SELECT DISTINCT ?item ?itemLabel ?dob ?nation ?nationLabel ?page_title WHERE {
              ?item wdt:P21 wd:"""+gender_wd+""". 
              ?item wdt:P27 ?nation.
              ?item wdt:P31 wd:Q5;
                    wdt:P569 ?dob. hint:Prior hint:rangeSafe true.
              ?article schema:about ?item ; schema:isPartOf <https://en.wikipedia.org/> ;  schema:name ?page_title .
              SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
              FILTER('"""+str(year)+"""-01-01'^^xsd:dateTime <= ?dob && ?dob < '"""+str(year+1)+"""-01-01'^^xsd:dateTime)
            }
            """
        try:
            # NOTE(review): no User-Agent header is sent; Wikimedia asks
            # clients to identify themselves -- consider adding one.
            query_result = requests.get(
                query_url,
                params={'format': 'json', 'query': query},
                timeout=timeout,  # a stalled connection must not hang the whole crawl
            )
            year_code.append(year)
            status_code.append(query_result.status_code)
            # Surface HTTP errors (e.g. throttling) explicitly instead of
            # relying on the JSON decoder choking on an error page.
            query_result.raise_for_status()
            bindings = query_result.json()['results']['bindings']
            rows = [
                {
                    'Wiki_ID': item['item']['value'],
                    'Name': item['itemLabel']['value'],
                    'Gender': gender,
                    'Year_birth': year,
                    'Country_birth': item['nationLabel']['value'],
                    'Page_title': item['page_title']['value'],
                }
                for item in bindings
            ]
            year_data.append(pd.DataFrame(rows, columns=columns))
            print(year)
            sleep(0.02)  # small pause between consecutive requests
        except Exception as exc:
            # Only ordinary errors are recorded for a later retry;
            # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
            failure = type(exc)
            error_year.append(year)
            error_type.append(failure)
            print(str(year) + " has an " + str(failure))
            continue
    df_error = pd.DataFrame.from_dict({
        'error_year': error_year,
        'error_type': error_type
    })
    df_status_code = pd.DataFrame.from_dict({
        'code_year': year_code,
        'status_code': status_code
    })
    return year_data, df_error, df_status_code

In [6]:
def _infer_retry_gender(year_data):
    """Return the gender code stored in the first non-empty fetched frame, else None."""
    for frame in year_data:
        if len(frame) > 0 and 'Gender' in frame.columns:
            return frame['Gender'].iloc[0]
    return None


def retry_get_data(year_data, df_error, df_status_code, gender=None, max_retries=3):
    """Re-query the years that failed, for at most ``max_retries`` passes.

    Parameters
    ----------
    year_data : list of pandas.DataFrame
        Frames already fetched by ``get_name_by_year_birth``.
    df_error : pandas.DataFrame
        Failure table with an ``error_year`` column listing the years to retry.
    df_status_code : pandas.DataFrame
        Status-code table from the initial fetch.
    gender : str, optional
        Gender code to pass to ``get_name_by_year_birth``. When omitted it is
        taken from the ``Gender`` column of the frames already fetched, and
        failing that from a notebook-level ``gender_value`` / ``genderValue``.
    max_retries : int, optional
        Maximum number of retry passes. Defaults to 3.

    Returns
    -------
    final_year_data : list of pandas.DataFrame
        The input frames plus every frame recovered by any retry pass.
    retry_df_error : pandas.DataFrame
        Failure table of the last pass (``df_error`` itself if nothing was retried).
    retry_df_status_code : pandas.DataFrame
        ``df_status_code`` followed by the status rows of every retry pass
        (``df_status_code`` itself if nothing was retried).

    Raises
    ------
    ValueError
        If a retry is needed but no gender code can be determined.
    """
    failed_years = df_error['error_year'].tolist()
    if not failed_years:
        # Nothing failed: hand the inputs back untouched.
        return year_data, df_error, df_status_code

    if gender is None:
        gender = _infer_retry_gender(year_data)
    if gender is None:
        # Legacy fallback: the gender used to be read from a notebook global.
        notebook_globals = globals()
        gender = notebook_globals.get('gender_value', notebook_globals.get('genderValue'))
    if gender is None:
        raise ValueError('cannot retry failed years: no gender code was given or could be inferred')

    final_year_data = list(year_data)
    status_frames = [df_status_code]
    retry_df_error = df_error
    for _ in range(max_retries):
        if not failed_years:
            break
        retry_year_data, retry_df_error, retry_status = get_name_by_year_birth(failed_years, gender)
        # Accumulate, so frames recovered in an earlier pass are not discarded.
        final_year_data.extend(retry_year_data)
        status_frames.append(retry_status)
        failed_years = retry_df_error['error_year'].tolist()
    if failed_years:
        print('too much error on Wikidata')
    retry_df_status_code = pd.concat(status_frames, ignore_index=True)
    return final_year_data, retry_df_error, retry_df_status_code

In [16]:
def get_people_by_gender_yob(year_covered, gender, en_nations, directory):
    """Fetch people by gender and birth year from Wikidata and save them as CSV.

    Parameters
    ----------
    year_covered : iterable of int
        Birth years passed to ``get_name_by_year_birth``.
    gender : str
        Gender code passed to ``get_name_by_year_birth``.
    en_nations : list of str
        Country labels; a row gets ``Is_EN = True`` when its ``Country_birth``
        value is in this list.
    directory : str or path-like
        Path of the CSV file to write (a file path, despite the name). Missing
        parent folders are created.

    Raises
    ------
    ValueError
        If no per-year result was fetched at all, so there is nothing to write.
    """
    year_data, df_error, df_status_code = get_name_by_year_birth(year_covered, gender)
    if len(df_error) > 0:
        year_data, df_error, df_status_code = retry_get_data(year_data, df_error, df_status_code)
    if len(df_error) > 0:
        # Make incomplete output visible instead of silently dropping years.
        print('years missing from the output: ' + str(df_error['error_year'].tolist()))
    if len(year_data) == 0:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError('no Wikidata results were fetched for the requested years; nothing to write')

    data = pd.concat(year_data)
    # Reduce the entity URL to the bare identifier starting at the first 'Q'.
    data['Wiki_ID'] = data['Wiki_ID'].apply(lambda x: x[x.find('Q'):])
    data['Is_EN'] = data['Country_birth'].isin(en_nations)
    # NOTE(review): an item with several country rows keeps only its last row,
    # so Is_EN reflects whichever country happens to come last -- confirm intent.
    data = data.drop_duplicates(subset=['Wiki_ID'], keep='last')

    if isinstance(directory, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(directory))
        if parent:
            # Do not lose a long crawl because the output folder is missing.
            os.makedirs(parent, exist_ok=True)
    data.to_csv(directory, index=False, header=True)

In [9]:
# Mount Google Drive when running on Google Colab. On any other environment the
# google.colab package is not importable, so the mount is skipped instead of
# failing the cell.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Google Colab

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Switch the working directory to the project folder on the mounted Drive so the
# relative './data/...' output path configured below resolves inside it.
# NOTE(review): Colab-specific absolute path -- adjust or skip when running elsewhere.
%cd /content/drive/MyDrive/Github/epistemic-injustice-on-wikipedia

/content/drive/MyDrive/Github/epistemic-injustice-on-wikipedia


In [12]:
# Run configuration for the crawl below.
# Output CSV file (a file path, relative to the current working directory).
dataDirectory = './data/female_wikidata.csv'
# Birth years to query: 1900 through 2009 inclusive.
yobList = [*range(1900, 2010)]
# Gender code handed to the fetch functions ('f' selects female).
genderValue = 'f'
# Country labels whose rows are flagged Is_EN = True in the output.
enNationList = [
    'United States of America',
    'United Kingdom',
    'Australia',
    'Canada',
    'New Zealand',
    'South Africa',
]

In [17]:
# Run the crawl for the configured years and gender, writing the CSV to dataDirectory.
get_people_by_gender_yob(
    year_covered=yobList,
    gender=genderValue,
    en_nations=enNationList,
    directory=dataDirectory,
)

1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
