# Scraping wikipedia

While working on a side project, I needed to scrape some data from Wikipedia into CSVs. As I wrote the script, I ran into three hurdles: handling multiple tables in an article, data cells that span multiple rows, and removing footnotes from cells.

https://github.com/rocheio/wiki-table-scrape

In [5]:
!pip install --upgrade pip

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/00/b6/9cfa56b4081ad13874b0c6f96af8ce16cfbc1cb06bedf8e9164ce5551ec1/pip-19.3.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 7.5MB/s eta 0:00:01
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[?25hInstalling collected packages: pip
  Found existing installation: pip 10.0.1
    Uninstalling pip-10.0.1:
      Successfully uninstalled pip-10.0.1
Successfully installed pip-19.3.1


In [3]:
!pip install wikitablescrape

Collecting wikitablescrape
  Downloading https://files.pythonhosted.org/packages/d0/22/e2330078775be99eb7d42e84d96e788b9c51af45f82f3761cc6b998733b7/wikitablescrape-1.0.2-py3-none-any.whl
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
Installing collected packages: wikitablescrape
Successfully installed wikitablescrape-1.0.2
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
help(wikitablescrape)

Help on package wikitablescrape:

NAME
    wikitablescrape

PACKAGE CONTENTS
    __main__
    cli
    parse
    test_parse

FILE
    /Users/Marco/anaconda3/lib/python3.6/site-packages/wikitablescrape/__init__.py




In [10]:
import requests
from bs4 import BeautifulSoup

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_volcanoes_by_elevation"

req = requests.get(WIKI_URL)
soup = BeautifulSoup(req.content, 'lxml')
table_classes = {"class": ["sortable", "plainrowheaders"]}
wikitables = soup.findAll("table", table_classes)

print(wikitables)

[<table border="0" cellpadding="1" cellspacing="3" class="sortable" style="border:1px solid #e7dcc3">
<tbody><tr>
<th>Mountain
</th>
<th>Metres
</th>
<th>Feet
</th>
<th>Location and Notes
</th></tr>
<tr>
<td><a href="/wiki/Ojos_del_Salado" title="Ojos del Salado">Ojos del Salado</a></td>
<td>6,893</td>
<td>22,615</td>
<td>Argentina/Chile <small> – Highest active volcano on Earth</small>
</td></tr>
<tr>
<td><a href="/wiki/Monte_Pissis" title="Monte Pissis">Monte Pissis</a></td>
<td>6,793</td>
<td>22,287</td>
<td>Argentina
</td></tr>
<tr>
<td><a href="/wiki/Nevado_Tres_Cruces" title="Nevado Tres Cruces">Nevado Tres Cruces</a></td>
<td>6,748</td>
<td>22,139</td>
<td>Argentina/Chile
</td></tr>
<tr>
<td><a href="/wiki/Llullaillaco" title="Llullaillaco">Llullaillaco</a></td>
<td>6,739</td>
<td>22,110</td>
<td>Argentina/Chile <small> – Second highest active volcano on Earth</small>
</td></tr>
<tr>
<td><a href="/wiki/Tipas" title="Tipas">Tipas</a></td>
<td>6,660</td>
<td>21,850</td>
<td>Argent

In [1]:
"""Create CSVs from all tables on a Wikipedia article."""

import csv
import os
import platform

from bs4 import BeautifulSoup
import requests

def scrape(url, output_name):
    """Create CSVs from all tables in a Wikipedia article.

    ARGS:
        url (str): The full URL of the Wikipedia article to scrape tables from.
        output_name (str): The base file name (without filepath) to write to.
    """

    # Read tables from Wikipedia article into list of HTML strings
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    table_classes = {"class": ["sortable", "plainrowheaders"]}
    wikitables = soup.findAll("table", table_classes)

    # Create folder for output if it doesn't exist
    try:
        os.mkdir(output_name)
    except Exception:  # Generic OS Error
        pass

    for index, table in enumerate(wikitables):
        # Make a unique file name for each CSV
        if index == 0:
            filename = output_name
        else:
            filename = output_name + '_' + str(index)

        filepath = os.path.join(output_name, filename) + '.csv'

        with open(filepath, mode='w', newline='', encoding='utf-8') as output:
            # Deal with Windows inserting an extra '\r' in line terminators
            if platform.system() == 'Windows':
                kwargs = {'lineterminator': '\n'}

                csv_writer = csv.writer(output,
                                        quoting=csv.QUOTE_ALL,
                                        **kwargs)
            else:
                csv_writer = csv.writer(output,
                                        quoting=csv.QUOTE_ALL)

            write_html_table_to_csv(table, csv_writer)


def write_html_table_to_csv(table, writer):
    """Write HTML table from Wikipedia to a CSV file.

    ARGS:
        table (bs4.Tag): The bs4 Tag object being analyzed.
        writer (csv.writer): The csv Writer object creating the output.
    """

    # Hold elements that span multiple rows in a list of
    # dictionaries that track 'rows_left' and 'value'
    saved_rowspans = []
    for row in table.findAll("tr"):
        cells = row.findAll(["th", "td"])

        # If the first row, use it to define width of table
        if len(saved_rowspans) == 0:
            saved_rowspans = [None for _ in cells]
        # Insert values from cells that span into this row
        elif len(cells) != len(saved_rowspans):
            for index, rowspan_data in enumerate(saved_rowspans):
                if rowspan_data is not None:
                    # Insert the data from previous row; decrement rows left
                    value = rowspan_data['value']
                    cells.insert(index, value)

                    if saved_rowspans[index]['rows_left'] == 1:
                        saved_rowspans[index] = None
                    else:
                        saved_rowspans[index]['rows_left'] -= 1

        # If an element with rowspan, save it for future cells
        for index, cell in enumerate(cells):
            if cell.has_attr("rowspan"):
                rowspan_data = {
                    'rows_left': int(cell["rowspan"]),
                    'value': cell,
                }
                saved_rowspans[index] = rowspan_data

        if cells:
            # Clean the data of references and unusual whitespace
            cleaned = clean_data(cells)

            # Fill the row with empty columns if some are missing
            # (Some HTML tables leave final empty cells without a <td> tag)
            columns_missing = len(saved_rowspans) - len(cleaned)
            if columns_missing:
                cleaned += [None] * columns_missing

            writer.writerow(cleaned)


def clean_data(row):
    """Clean table row list from Wikipedia into a string for CSV.

    ARGS:
        row (bs4.ResultSet): The bs4 result set being cleaned for output.

    RETURNS:
        cleaned_cells (list[str]): List of cleaned text items in this row.
    """

    cleaned_cells = []

    for cell in row:
        # Strip references from the cell
        references = cell.findAll("sup", {"class": "reference"})
        if references:
            for ref in references:
                ref.extract()

        # Strip sortkeys from the cell
        sortkeys = cell.findAll("span", {"class": "sortkey"})
        if sortkeys:
            for ref in sortkeys:
                ref.extract()

        # Strip footnotes from text and join into a single string
        text_items = cell.findAll(text=True)
        no_footnotes = [text for text in text_items if text[0] != '[']

        cleaned = (
            ''.join(no_footnotes)  # Combine elements into single string
            .replace('\xa0', ' ')  # Replace non-breaking spaces
            .replace('\n', ' ')  # Replace newlines
            .strip()
        )

        cleaned_cells += [cleaned]

    return cleaned_cells

In [7]:
"""Test the wikitablescrape script on four articles."""

import os
import shutil

import wikitablescrape as ws

# Delete previous output folder if it exists, then create a new one
try:
    shutil.rmtree('output')
except FileNotFoundError:
    pass

wikitablescrape.scrape(
    url="https://en.wikipedia.org/wiki/List_of_mountains_by_elevation",
    output_name="mountains"
)

wikitablescrape.scrape(
    url="https://en.wikipedia.org/wiki/List_of_volcanoes_by_elevation",
    output_name="volcanoes"
)

wikitablescrape.scrape(
    url="https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_career_scoring_leaders",
    output_name="nba"
)

wikitablescrape.scrape(
    url="https://en.wikipedia.org/wiki/List_of_highest-grossing_films",
    output_name="films"
)

# Move all CSV folders into a single 'output' folder
os.makedirs('output')
shutil.move('./mountains', './output')
shutil.move('./volcanoes', './output')
shutil.move('./nba', './output')
shutil.move('./films', './output')

AttributeError: module 'wikitablescrape' has no attribute 'scrape'

In [8]:
wikitablescrape(
    url="https://en.wikipedia.org/wiki/List_of_mountains_by_elevation",
    output_name="mountains"
)

TypeError: 'module' object is not callable