In [12]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

class WikiTable:
    """
    A class that assists with the scraping and extraction of data from tables embedded within Wikipedia pages.

    The page is downloaded and parsed once, when the object is constructed. Call create_table() to turn
    the selected HTML table into a pandas DataFrame indexed by the text of the country column.

    url : str, a valid url for the target wikipedia page
    category : str, a descriptive category that helps group and classify the type of data being mined
    table_num : int, a number identifying the particular data table to scrape which should be >= 1, with 1 indicating the first table encountered when scrolling on the wiki page
    """
    def __init__(self, url, category, table_num):
        # Reject table numbers below 1 before doing any network work: 0 or a negative
        # value would otherwise be used as a negative list index and silently select
        # a table counted from the end of the page.
        if table_num < 1:
            raise ValueError("Table number must be greater than 0")

        # Set url and target table we're interested in
        self.wiki_url = url
        self.wiki_category = category
        self.table_num = table_num
        self.data_notes = ""

        # Open wiki page, retrieve HTML; the context manager closes the connection
        with urlopen(self.wiki_url) as response:
            page_bytes = response.read()
        page_content = page_bytes.decode("utf-8")
        # Name the parser explicitly so the parse does not depend on which optional
        # parsers happen to be installed (and no "guessed parser" warning is emitted)
        page_html = BeautifulSoup(page_content, "html.parser")

        # Extract page title and target table
        self.wiki_title = page_html.h1.text
        tables = page_html.find_all("tbody")
        if self.table_num > len(tables):
            raise IndexError(
                f"Table {self.table_num} requested but only {len(tables)} table(s) were found at {self.wiki_url}"
            )
        self.wiki_table = tables[self.table_num - 1]
        # Populated by create_table() or update_table(); None until then
        self.table_data = None

    # print string repr of WikiTable object
    def __str__(self):
        return f"Table {self.table_num} in the webpage titled: {self.wiki_title}"

    def get_url(self):
        """Return the url this object was created with."""
        return self.wiki_url

    def get_category(self):
        """Return the category label this object was created with."""
        return self.wiki_category

    def set_notes(self, text):
        """Store free-text notes about how the data was cleaned or should be read."""
        self.data_notes = text

    def show_notes(self):
        """Print the stored notes, or a placeholder message when none were set."""
        if self.data_notes != "":
            print(self.data_notes)
        else:
            print("Notes on data cleaning and treatment are empty")

    def get_table(self):
        """
        Return a copy of the stored DataFrame, so edits by the caller do not touch the stored one.
        Raises AttributeError if no table has been created or supplied yet (table_data is still None).
        """
        return self.table_data.copy()

    def __extract_col_data(self, col_item):
        """Return the text of a single cell with trailing whitespace removed."""
        return col_item.get_text().rstrip()

    def __extract_row_data(self, row_item, country_index):
        """
        Return the cell texts of one <tr> element as a list of strings.

        Rows containing <td> cells are treated as data rows: the cell at position
        country_index (1-based) is replaced by its first <a> element so that only the
        link text is kept. Rows without any <td> are read from their <th> cells instead.
        """
        col_items = row_item.find_all("td")
        if not col_items:
            # No data cells: this is a header row, so read its <th> cells
            return [self.__extract_col_data(cell) for cell in row_item.find_all("th")]

        try:
            col_items[country_index-1] = col_items[country_index-1].find_all("a")[0]
        except IndexError:
            # Either the row is shorter than country_index or the cell holds no link;
            # keep the cells as they are and fall back to their full text
            print("Operation unsuccessful, no <a> tag, likely a non-country encountered")
        return [self.__extract_col_data(cell) for cell in col_items]

    def create_table(self, get_data=False, country_index=1):
        """
        Build the DataFrame for the selected table, unless one is already stored.

        get_data : bool, when True a copy of the stored table is returned, otherwise nothing is returned
        country_index : int, 1-based position of the column holding the country name; its text becomes
            the row index and every column up to and including it is removed from the data

        The first <tr> supplies the column names and the remaining rows supply the data.
        Raises Exception when country_index is smaller than 1.
        """
        if country_index < 1:
            raise Exception("Exception: Country index must be greater than 0")

        # Only parse once; a table set through update_table() is kept as is
        if not isinstance(self.table_data, pd.DataFrame):
            tr_list = self.wiki_table.find_all("tr")
            table_contents = [self.__extract_row_data(row, country_index) for row in tr_list]
            header, body = table_contents[0], table_contents[1:]
            full_table = pd.DataFrame(
                body,
                columns=header,
                index=[row[country_index - 1] for row in body]
            )
            # Remove the leading columns by position rather than by label, so a later
            # column that happens to share a header with one of them is not lost too
            self.table_data = full_table.iloc[:, country_index:].copy()

        if get_data:
            return self.get_table()

    def update_table(self, new_table):
        """Replace the stored table with new_table; raises Exception unless it is a DataFrame."""
        if isinstance(new_table, pd.DataFrame):
            self.table_data = new_table
        else:
            raise Exception("Exception: The updated table must be of type pd.DataFrame.")

    def export_table(self):
        """
        Write the stored table, including its index, to an Excel file named
        "<category>__<page title with spaces replaced by underscores>.xlsx".
        Raises Exception when no DataFrame is stored.
        """
        if isinstance(self.table_data, pd.DataFrame):
            file_name = self.wiki_category + "__" + self.wiki_title.replace(" ", "_")  + ".xlsx"
            self.table_data.to_excel(file_name, index=True)
        else:
            raise Exception("Exception: Table contains no data")

## Demographics

#### Human sex ratio as defined by the comparative number of males with respect to females in a population
Source: https://en.wikipedia.org/wiki/List_of_sovereign_states_by_sex_ratio
<br> World Factbook 2020 estimates

In [7]:
# Scrape the first table on the sex-ratio page
page_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_by_sex_ratio"
category = "Demographics"
table_num = 1
wiki_page = WikiTable(page_url, category, table_num)
wiki_page.set_notes("-1 indicates that the entire population was male")

# Preliminary cleaning: overwrite the Falkland Islands row by hand, map the
# placeholder strings to the "-1" sentinel, then convert every cell to float
table = wiki_page.create_table(get_data=True, country_index=1)
table.loc["Falkland Islands"] = "1.12"
table = table.replace(to_replace=["--", "All male"], value="-1").astype(float)

# Store the cleaned frame on the scraper object and write it to Excel
wiki_page.update_table(table)
wiki_page.export_table()

#### Percent of population living on less than \\$1.90, \\$3.20 and \\$5.50 a day
Source: https://en.wikipedia.org/wiki/List_of_sovereign_states_by_percentage_of_population_living_in_poverty
<br>International (PPP) dollars as per the World Bank, the World Poverty Clock, and Our World in Data.

In [34]:
# Scrape the second table on the poverty page
page_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_by_percentage_of_population_living_in_poverty"
category = "Demographics"
table_num = 2
wiki_page = WikiTable(page_url, category, table_num)
wiki_page.set_notes("N/A")

# Preliminary cleaning: discard the last two columns, strip the percent signs,
# then express every value as a fraction instead of a percentage
table = wiki_page.create_table(get_data=True, country_index=1)
table = (
    table
    .drop(columns=table.columns[-2:])
    .replace(to_replace=["%"], value="", regex=True)
    .astype(float)
    / 100
)

# Store the cleaned frame on the scraper object and write it to Excel
wiki_page.update_table(table)
wiki_page.export_table()

#### Urbanization by sovereign state
Source: https://en.wikipedia.org/wiki/Urbanization_by_sovereign_state
<br>There are two measures of the degree of urbanization of a population. The first, urban population, describes the percentage of the total population living in urban areas, as defined by the country. The second measure, rate of urbanization, describes the projected average rate of change of the size of the urban population over the given period of time.

In [93]:
# Create WikiTable object from wikipage
page_url = "https://en.wikipedia.org/wiki/Urbanization_by_sovereign_state"
category = "Demographics"
table_num = 1
wiki_page = WikiTable(page_url, category, table_num)

# Preliminary cleaning
wiki_page.set_notes("There are two measures of the degree of urbanization of a population. The first, urban population, describes the percentage of the total population living in urban areas, as defined by the country. The second measure, rate of urbanization, describes the projected average rate of change of the size of the urban population over the given period of time.")
table = wiki_page.create_table(get_data=True, country_index=2)
table = table.drop(columns=["Date", "Period"])

# Strip footnote markers first, so a cell such as "NA[2]" is reduced to a plain
# "NA" before the placeholder replacement below looks at it. Raw strings keep
# the regex backslashes from being read as (invalid) string escape sequences.
table = table.replace(
    to_replace=[r"\[clarification needed\]", r"\[2\]", r"\[3\]"],
    value="",
    regex=True
)

# Normalise the Unicode minus sign (U+2212) and en dash (U+2013) to an ASCII
# hyphen so that negative numbers can be parsed by float()
table = table.replace(to_replace=["\u2212", "\u2013"], value="-", regex=True)

# Cells holding only a placeholder become missing values. NaN is passed
# explicitly because value=None is version-dependent: before pandas 1.4 it was
# ignored and the cells were forward-filled from the row above instead.
table = table.replace(to_replace=["NA", "-"], value=float("nan"))
table = table.astype(float)

# Save data
wiki_page.update_table(table)
wiki_page.export_table()

Operation unsuccessful, likely a non-country encountered


#### List of countries by population (United Nations)
Source: https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)
<br> United Nations Estimates
<br> Manual scrape

## Religious Info

#### Importance of religion by country
Source: https://en.wikipedia.org/wiki/Importance_of_religion_by_country
<br> 2009 Gallup survey

In [24]:
# Create WikiTable object from wikipage
page_url = "https://en.wikipedia.org/wiki/Importance_of_religion_by_country"
category = "Religion"
table_num = 5
wiki_page = WikiTable(page_url, category, table_num)

# Preliminary cleaning
wiki_page.set_notes('Percentages for "yes" and "no" answers are listed below; they often do not add up to 100% because some answered "don\'t know" or did not answer.')
table = wiki_page.create_table(get_data=True, country_index=2)
# Remove single-digit footnote markers such as "[1]" and the percent signs.
# The pattern is a raw string so "\[" reaches the regex engine unchanged and
# is not flagged as an invalid string escape sequence.
table = table.replace(to_replace=[r"\[[0-9]\]", "%"], value="", regex=True)
# Express the percentages as fractions
table = table.astype(float) / 100

# Save data
wiki_page.update_table(table)
wiki_page.export_table()

## Food and Consumption Info

In [48]:
# One row per page: (url, table number on that page, 1-based country column)
page_urls, table_numbers, country_indices = map(list, zip(
    ("https://en.wikipedia.org/wiki/Vegetarianism_by_country", 1, 1),
    ("https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption", 2, 1),
    ("https://en.wikipedia.org/wiki/List_of_countries_by_alcohol_consumption_per_capita", 3, 1),
    ("https://en.wikipedia.org/wiki/List_of_countries_by_tea_consumption_per_capita", 1, 2),
    ("https://en.wikipedia.org/wiki/Tobacco_consumption_by_country", 1, 1),
))
pages = len(page_urls)
# Every page in this section is filed under the same category
categories = ["Food"] * pages

In [49]:
wiki_tables = []
for page in range(pages):
    # Progress message uses 1-based page numbering
    print(f"Starting with Page {page+1}")
    # Download the page and select the configured table
    wiki_page = WikiTable(
        url=page_urls[page],
        category=categories[page],
        table_num=table_numbers[page],
    )
    # Parse the table into a DataFrame indexed by country, then write it to Excel
    table = wiki_page.create_table(country_index=country_indices[page], get_data=True)
    wiki_page.export_table()

Starting with Page 1
Starting with Page 2
Starting with Page 3
Starting with Page 4
Starting with Page 5


## Transportation Info

In [52]:
# One row per page: (url, table number on that page, 1-based country column)
page_urls, table_numbers, country_indices = map(list, zip(
    ("https://en.wikipedia.org/wiki/List_of_countries_by_rail_transport_network_size", 1, 1),
    ("https://en.wikipedia.org/wiki/List_of_countries_by_rail_usage", 5, 2),
    ("https://en.wikipedia.org/wiki/List_of_countries_by_vehicles_per_capita", 1, 1),
))
pages = len(page_urls)
# Every page in this section is filed under the same category
categories = ["Transportation"] * pages

In [53]:
wiki_tables = []
for page in range(pages):
    # Progress message uses 1-based page numbering
    print(f"Starting with Page {page+1}")
    # Download the page and select the configured table
    wiki_page = WikiTable(
        url=page_urls[page],
        category=categories[page],
        table_num=table_numbers[page],
    )
    # Parse the table into a DataFrame indexed by country, then write it to Excel
    table = wiki_page.create_table(country_index=country_indices[page], get_data=True)
    wiki_page.export_table()

Starting with Page 1


## Climate Info

In [10]:
# One row per page: (url, table number on that page, 1-based country column)
page_urls, table_numbers, country_indices = map(list, zip(
    ("https://en.wikipedia.org/wiki/List_of_countries_by_average_annual_precipitation", 1, 2),
    ("https://listfist.com/list-of-countries-by-average-temperature", 1, 3),
))
pages = len(page_urls)
# Every page in this section is filed under the same category
categories = ["Climate"] * pages

In [None]:
wiki_tables = []
for page in range(pages):
    # Progress message uses 1-based page numbering
    print(f"Starting with Page {page+1}")
    # Download the page and select the configured table
    wiki_page = WikiTable(
        url=page_urls[page],
        category=categories[page],
        table_num=table_numbers[page],
    )
    # Parse the table into a DataFrame indexed by country, then write it to Excel
    table = wiki_page.create_table(country_index=country_indices[page], get_data=True)
    wiki_page.export_table()