## Extracting data from Wikipedia
Code to scrape Wikipedia articles and extract information from each article.

In [82]:
import urllib3, json
from bs4 import BeautifulSoup

# Index article that is downloaded in step 1 to collect the scientist links.
WOMEN_SCI_INDEX = "https://en.wikipedia.org/wiki/Index_of_women_scientists_articles"
# Prefix put in front of the relative "/wiki/..." hrefs found on the index page.
WIKI_BASE = "https://en.wikipedia.org"
# Root folder for everything written to / read from disk. Paths are built by
# plain string concatenation, so the trailing slash is required.
DATA_DIR = "data/"

In [136]:
#################################################
# Helper Functions
#################################################

def get_html(url):
    """
    Download `url` with an HTTP GET and return the response body as text.

    The body is decoded as UTF-8; a body that is not valid UTF-8 raises
    UnicodeDecodeError. The HTTP status code is not inspected.
    """
    # A connection pool is created per call. Using it as a context manager
    # closes the pool (and its sockets) once the body has been read, so the
    # scraping loop does not accumulate one open pool per downloaded page.
    with urllib3.connection_from_url(url) as http_pool:
        response = http_pool.urlopen('GET', url)
        body = response.data
    return body.decode('utf-8')

def save_json(data, file):
    """
    Serialise `data` as JSON (indented by 4 spaces) into the file at path `file`.

    The path is used exactly as given and the file is opened in write mode,
    so existing content is replaced.
    """
    with open(file, 'w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

def load_json(file):
    """
    Parse and return the JSON document stored at DATA_DIR + `file`.

    `file` is a name relative to DATA_DIR; the two are joined by plain
    concatenation, without inserting a path separator.
    """
    path = str(DATA_DIR) + str(file)
    with open(path) as handle:
        return json.load(handle)

def load_page(link):
    """
    Return the HTML previously saved on disk for the page at `link`.

    The last "/"-separated segment of `link` is used as the file's base name;
    the file is read from DATA_DIR + "pages/" + <name> + ".html".
    Raises FileNotFoundError (from open) when that file does not exist.
    """
    name = link.split("/")[-1]
    # The context manager closes the handle as soon as the contents are read;
    # this function is called once per page, so handles must not pile up.
    with open("{}{}/{}.html".format(DATA_DIR, "pages", name), "r") as file:
        return file.read()

#### Step 1
Fetch all links pointing to women scientists' Wikipedia pages from the index at https://en.wikipedia.org/wiki/Index_of_women_scientists_articles
Save the links to a file so that this page does not have to be scraped again.

In [83]:
def list_of_scientist_pages(html):
    """
    Return list of wiki links pointing to women scientists

    `html` is the markup of the index page. An anchor is kept when its href
    starts with "/wiki/" and its title attribute equals its visible text; the
    kept href is prefixed with WIKI_BASE to form an absolute URL. Scanning
    stops after the anchor whose text is "Zora Neale Hurston", and the first
    two collected links are dropped.
    """
    links = []
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a', href=True):
        # .get() rather than ["title"]: a /wiki/ anchor without a title
        # attribute is skipped instead of raising KeyError.
        if link['href'].startswith("/wiki/") and link.get("title") == link.text:
            links.append("{}{}".format(WIKI_BASE, link["href"]))
        # Sentinel: nothing after this anchor is collected.
        if link.text == "Zora Neale Hurston":
            break

    # first two are wrong, manually remove
    return links[2:]

In [85]:
# Download the index page, extract the scientist links and store them on disk.
html = get_html(WOMEN_SCI_INDEX)
pages = list_of_scientist_pages(html)
save_json({'pages':pages}, "{}{}".format(DATA_DIR,'pages_list.json'))
# Fill the placeholder with the number of links; without .format() the
# message is printed with a literal "{}".
print("Total {} links saved".format(len(pages)))



Total 930 links saved


Now we can just load the list of links from pages_list.json

#### Step 2
Scrape the HTML of each page and store it on disk. This will take a few minutes.

In [94]:
PAGES = load_json('pages_list.json')['pages']
# Fetch each page and save to file on disk
for page in PAGES:
    html = get_html(page)
    # The last URL segment doubles as the file name under DATA_DIR + "pages/".
    name = page.split("/")[-1]
    # `with` closes the file even when the write raises part-way through.
    with open("{}{}/{}.html".format(DATA_DIR, "pages", name), "w") as file:
        file.write(html)













































































#### Step 3
Parse the Wiki articles and extract features

In [138]:
def parse(url):
    """
    Parse info from Wiki Page

    Loads the saved HTML for `url` through load_page and returns a dict with:
      - 'uid':         `url` as given
      - 'name':        text of the <h1 id="firstHeading"> element (only when present)
      - 'has_infobox': True when a table with class "infobox biography vcard" exists
      - 'born'/'died': values returned by parse_infobox, when the infobox has them
      - 'text':        always "" (heading/paragraph text is not collected)
      - 'links':       {title: href} for every anchor with a title whose href
                       starts with "/wiki/", taken from the <p> elements that
                       are direct children of the article body

    When the article body container cannot be found, 'links' stays empty.
    """
    data = {
        'uid': url
    }
    soup = BeautifulSoup(load_page(url), 'html.parser')

    # Name
    name_div = soup.find('h1', attrs={'id':'firstHeading'})
    if name_div is not None:
        data["name"] = name_div.text

    # Infobox
    info_table = soup.find('table', attrs={'class':'infobox biography vcard'})
    data["has_infobox"] = info_table is not None
    if info_table is not None:
        data.update(parse_infobox(info_table))

    # Content
    data['text'] = ""
    data['links'] = {}
    wrapper = soup.find('div', attrs={'id':'mw-content-text'})
    content = None
    if wrapper is not None:
        content = wrapper.find('div', attrs={'class':'mw-parser-output'})
    if content is None:
        # No article body in this page: return what was gathered so far
        # instead of failing with AttributeError on None.
        return data

    # Only direct children are inspected (recursive=False), so paragraphs
    # nested inside other elements are ignored.
    for el in content.findChildren(recursive=False):
        if el.name != 'p':
            continue
        # Find all links that lead to other wiki pages
        for a in el.find_all('a', href=True):
            if a['href'].startswith("/wiki/") and a.has_attr('title'):
                data['links'][a["title"]] = a["href"]

    return data

def parse_infobox(box_el):
    """
    Extract the "Born" and "Died" entries from an infobox table element.

    Every <tr> below the table's <tbody> (or below the table itself when it
    has no <tbody>) is scanned. A direct child of the row that has a `scope`
    attribute and whose text is exactly "Born" or "Died" selects the key; the
    next <td> direct child of the same row supplies the value.

    Returns a dict with the keys 'born' and/or 'died' mapped to the cell text;
    rows with any other label are ignored.
    """
    info = {}
    tbody = box_el.find('tbody')
    # Fall back to the table element when the markup has no explicit <tbody>,
    # rather than failing with AttributeError on None.
    rows_root = box_el if tbody is None else tbody
    for row in rows_root.find_all('tr'):
        key = None
        for el in row.findChildren(recursive=False):
            if el.has_attr('scope') and el.text in ("Born", "Died"):
                key = el.text.lower()
            elif el.name == 'td' and key is not None:
                info[key] = el.text
                # Reset so a further <td> in the same row is not picked up.
                key = None
    return info
                    
                    

Testing the parser:

In [135]:
# Smoke-test the parser on a single page and pretty-print the resulting dict.
# NOTE(review): 622 is a positional index into PAGES, so the page it selects
# depends on the order of pages_list.json — confirm it still points at the
# intended article after the index is scraped again.
page = PAGES[622]
data = parse(page)
print(json.dumps(data, indent = 4))

{
    "uid": "https://en.wikipedia.org/wiki/Marie_Curie",
    "name": "Marie Curie",
    "infobox": true,
    "born": "Maria Salomea Sk\u0142odowska(1867-11-07)7 November 1867Warsaw, Congress Poland, Russian Empire[1]",
    "died": "4 July 1934(1934-07-04) (aged\u00a066)Passy, Haute-Savoie, Third French Republic",
    "text": "",
    "links": {
        "Help:IPA/English": "/wiki/Help:IPA/English",
        "Help:IPA/French": "/wiki/Help:IPA/French",
        "Help:IPA/Polish": "/wiki/Help:IPA/Polish",
        "Polish minority in France": "/wiki/Polish_minority_in_France",
        "Physicist": "/wiki/Physicist",
        "Chemist": "/wiki/Chemist",
        "Radioactivity": "/wiki/Radioactivity",
        "List of female Nobel laureates": "/wiki/List_of_female_Nobel_laureates",
        "Nobel Prize": "/wiki/Nobel_Prize#Multiple_laureates",
        "University of Paris": "/wiki/University_of_Paris",
        "Panth\u00e9on": "/wiki/Panth%C3%A9on",
        "Warsaw": "/wiki/Warsaw",
        "Con

In [139]:
# Run the parser over every scraped page and write one JSON file per page
# into DATA_DIR + "parsed/", named after the last segment of the page URL.
for page in PAGES:
    name = page.split("/")[-1]
    parsed = parse(page)
    save_json(parsed, "{}{}/{}.json".format(DATA_DIR, "parsed", name))