# Webscraping Wikipedia
Country-level data from around the world is not typically available in convenient, "digestible", downloadable formats, and it tends to be disproportionately available for countries considered "developed". We would like to make a broader set of recommendations available to the user by leveraging data on a diverse set of countries and their attributes, ranging from geography and climate to demographics and health. With that in mind, we use Wikipedia, which acts as a central data aggregator, as the data source for this work.

It is important to note that, given the open and editable nature of Wikipedia pages, there are bound to be inaccuracies. That said, for the purposes of this tool, we are willing to trade off some data quality in exchange for the breadth of data available.

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

Below you can find a helper class for scraping data from tables embedded in Wikipedia webpages. Wikipedia tables can be quite non-standard with respect to formatting, HTML tags, and headers used, so you may not be able to use this class to automate your entire webscraping pipeline. Nonetheless, it speeds up the data procurement process across the many Wikipedia pages that share similar formatting.

In [2]:
class WikiTable:
    """
    A class that assists with the scraping and extraction of data from tables embedded within Wikipedia pages.

    The page is downloaded and parsed once, when the object is constructed. The selected
    table is only converted into a pandas DataFrame when create_table() is called.

    Parameters
    ----------
    url (str) : a valid url for the target wikipedia page
    category (str) : a descriptive category that helps group and classify the type of data being mined
    table_num (int) : a number identifying the particular data table to scrape which should be >= 1, with 1 indicating the first table encountered when scrolling on the wiki page

    Raises
    ------
    ValueError : if table_num is smaller than 1
    IndexError : if the page contains fewer than table_num tables
    """
    def __init__(self, url, category, table_num):
        # Reject non-positive positions before doing any network I/O: 0 or a negative
        # value would otherwise be used as a negative list index and silently pick a
        # table counted from the end of the page.
        if table_num < 1:
            raise ValueError(
                f"table_num must be >= 1 (1 is the first table on the page), got {table_num}"
            )

        # Set url and target table we're interested in
        self.wiki_url = url
        self.wiki_category = category
        self.table_num = table_num
        self.data_notes = ""

        # Open wiki page, retrieve HTML; the context manager closes the connection
        with urlopen(self.wiki_url) as response:
            page_content = response.read().decode("utf-8")
        # Name the parser explicitly so the parse result does not depend on which
        # optional parser libraries happen to be installed.
        page_html = BeautifulSoup(page_content, "html.parser")

        # Extract page title and target table (tables are counted by their <tbody> element)
        self.wiki_title = page_html.h1.text
        tables = page_html.find_all("tbody")
        if self.table_num > len(tables):
            raise IndexError(
                f"table_num {self.table_num} is out of range: the page contains {len(tables)} table(s)"
            )
        self.wiki_table = tables[self.table_num - 1]
        # Filled in by create_table() or update_table()
        self.table_data = None

    def __str__(self) -> str:
        """Return a short description: table position and page title."""
        return f"Table {self.table_num} in the webpage titled: {self.wiki_title}"

    def get_url(self):
        """Return the url of the wikipedia page this table was taken from."""
        return self.wiki_url

    def get_category(self):
        """Return the category this data was filed under."""
        return self.wiki_category

    def set_notes(self, text) -> None:
        """Store free-text notes about how this data was cleaned or treated."""
        self.data_notes = text

    def show_notes(self) -> None:
        """Print the stored notes, or a placeholder message when none were set."""
        if self.data_notes != "":
            print(self.data_notes)
        else:
            print("Notes on data cleaning and treatment are empty")

    def get_table(self):
        """
        Return a copy of the scraped data, so callers cannot modify the stored table.

        Raises AttributeError if no table has been created or set yet
        (table_data is still None).
        """
        return self.table_data.copy()

    def __extract_col_data(self, col_item):
        """Return the text of one cell with trailing whitespace removed."""
        return col_item.get_text().rstrip()

    def __extract_row_data(self, row_item, country_index):
        """
        Return the cell texts of one <tr> element as a list of strings.

        Rows containing <td> cells are treated as data rows: the country cell
        (position country_index, indexed at 1) is replaced by its first link so only
        the link text is kept. All other rows are read from their <th> cells.
        """
        if "<td" in str(row_item):
            col_items = row_item.find_all("td")
            try:
                col_items[country_index - 1] = col_items[country_index - 1].find_all("a")[0]
            except IndexError:
                # Either the row has too few cells or the cell holds no link; keep the
                # row as it is and report it (best effort).
                print("Operation unsuccessful, no <a> tag, likely a non-country encountered")
        else:
            col_items = row_item.find_all("th")
        return [self.__extract_col_data(col_item) for col_item in col_items]

    def create_table(self, get_data=False, country_index=1):
        """
        Extract data from the wikipedia table html and create a pandas dataframe.

        The first table row supplies the column names and every following row becomes
        a data row indexed by its country. The extraction only runs while no DataFrame
        is stored; later calls reuse the stored table.

        Parameters
        ----------
        get_data (bool) : when True, return a copy of the table
        country_index (int) : the column in which the countries are contained in the table (indexed at 1)

        Returns
        -------
        A copy of the table when get_data is True, otherwise None.

        Raises
        ------
        Exception : if country_index is smaller than 1
        """
        if country_index < 1:
            raise Exception("Exception: Country index must be greater than 0")

        if not isinstance(self.table_data, pd.DataFrame):
            table_contents = [
                self.__extract_row_data(tr_item, country_index)
                for tr_item in self.wiki_table.find_all("tr")
            ]
            header, body = table_contents[0], table_contents[1:]
            self.table_data = pd.DataFrame(
                body,
                columns=header,
                index=[row[country_index - 1] for row in body],
            )
            # The countries now live in the index, so drop the country column together
            # with every column that precedes it.
            self.table_data.drop(
                columns=self.table_data.columns[:country_index],
                inplace=True,
            )

        if get_data:
            return self.get_table()
        return None

    def update_table(self, new_table) -> None:
        """
        Replace the stored table, usually after some prelim data cleaning is done.

        Raises Exception if new_table is not a pd.DataFrame.
        """
        if isinstance(new_table, pd.DataFrame):
            self.table_data = new_table
        else:
            raise Exception("Exception: The updated table must be of type pd.DataFrame.")

    def export_table(self) -> None:
        """
        Save the stored table, including its index, in an xlsx file.

        The file is named "<category>__<page title with spaces replaced by underscores>.xlsx".
        Raises Exception if no table has been created or set yet.
        """
        if isinstance(self.table_data, pd.DataFrame):
            file_name = self.wiki_category + "__" + self.wiki_title.replace(" ", "_") + ".xlsx"
            self.table_data.to_excel(file_name, index=True)
        else:
            raise Exception("Exception: Table contains no data")

## Example use-case
#### Percent of population living on less than \\$1.90, \\$3.20 and \\$5.50 a day
Source: https://en.wikipedia.org/wiki/List_of_sovereign_states_by_percentage_of_population_living_in_poverty
<br>International (PPP) dollars as per the World Bank, the World Poverty Clock, and Our World in Data.

In [3]:
# Point a WikiTable at the second table of the poverty-rate wiki page
page_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_by_percentage_of_population_living_in_poverty"
category = "Demographics"
table_num = 2
wiki_page = WikiTable(page_url, category, table_num)
wiki_page.set_notes("N/A")

# Countries sit in the first column of this table, so they become the row index
table = wiki_page.create_table(get_data=True, country_index=1)

# Preliminary cleaning: discard the last two columns, strip the percent signs,
# then convert the percentages into fractions
table = table.drop(columns=table.columns[-2:])
table = table.replace(to_replace=["%"], value="", regex=True)
table = table.astype(float) / 100

# Hand the cleaned data back to the WikiTable object
wiki_page.update_table(table)
# Uncomment to also write the cleaned table to an .xlsx file:
# wiki_page.export_table()

n_rows, n_cols = table.shape
print(f"There are {n_rows} rows in this data.")
print(f"There are {n_cols} columns in this data.")
table.head()

There are 166 rows in this data.
There are 3 columns in this data.


Unnamed: 0,< $1.90[1][5],< $3.20[6],< $5.50[7]
Albania,0.1,0.266,0.384
Algeria,0.0032,0.0223,0.2083
Angola,0.514,0.7279,0.8913
Argentina,0.016,0.058,0.182
Armenia,0.004,0.069,0.447
