In [1]:
# import pandas
import pandas as pd

In [2]:
# Fetch every HTML table on the Wikipedia page; pandas returns them as a
# list of DataFrames, one per table found.
birds_page_url = "https://en.wikipedia.org/wiki/List_of_birds_of_Australia"
tables = pd.read_html(birds_page_url)

In [3]:
# Drop the first 2 tables in the list, as they are not relevant.
# NOTE: `tables` is rebound to a slice of itself, so every re-run of this
# cell discards two more tables; re-run the read_html cell before this one.
tables = tables[2:]

In [4]:
# Promote the first row of each table to its column header and drop that row.
def _promote_first_row(frame):
    """Use row 0 of ``frame`` as its column labels and return the remaining rows.

    The column labels of ``frame`` itself are replaced in place; the returned
    DataFrame is a new object that no longer contains index label 0.
    """
    frame.columns = frame.iloc[0]
    return frame.drop(index=0)


tables_new = [_promote_first_row(frame) for frame in tables]

In [5]:
# import beautiful soup and urllib to extract order, family, category of each bird species
from bs4 import BeautifulSoup as bs
from urllib import request

In [6]:
# Download the page and parse it with beautiful soup.
# The response is opened in a `with` block so the underlying HTTP connection
# is closed as soon as the body has been read (the original never closed it).
with request.urlopen("https://en.wikipedia.org/wiki/List_of_birds_of_Australia") as url_request:
    html_text = url_request.read()

# Parse the raw bytes with the lxml parser.
soup = bs(html_text, "lxml")

In [7]:
# Build the category list: one entry per <h3> heading on the page.
# The original sliced off the last six characters of every heading
# unconditionally; six is the length of "[edit]", so that is presumably the
# link text being removed (TODO confirm against the live page). Strip it only
# when it is really there, so headings without the suffix are not truncated.
edit_suffix = "[edit]"
cat_list = []
for h3 in soup.find_all("h3"):
    heading = h3.text
    if heading.endswith(edit_suffix):
        heading = heading[:-len(edit_suffix)]
    cat_list.append(heading)

# Discard the last 11 headings.
# NOTE(review): presumably these are non-bird sections at the end of the
# page; the count is hard-coded, so verify it if the page layout changes.
cat_list = cat_list[:-11]

In [8]:
# Collect the text of every <p> tag that begins with 'Order:'. For each one,
# skip its first 7 characters and keep only what precedes any further
# 'Order:' marker in the same paragraph.
order_marker = 'Order:'
p_list = [
    paragraph_text[7:].split(order_marker)[0]
    for paragraph_text in (tag.text for tag in soup.find_all("p"))
    if paragraph_text.startswith(order_marker)
]

In [9]:
# Split each paragraph into its order (text before the family marker) and its
# family (text after it). 'Family: ' is tried first, then 'Families: '.
order_list, family_list = [], []
for paragraph in p_list:
    pieces = paragraph.split('Family: ')
    if len(pieces) < 2:
        # no singular marker found; fall back to the plural form
        pieces = paragraph.split('Families: ')
    order_list.append(pieces[0].strip())
    family_list.append(pieces[1].strip())

In [10]:
# Tag every table with the category, order and family found at the same
# position in the scraped lists. The frames in `tables_new` are modified in
# place and also collected into `tables`.
taxonomy_columns = (
    ('Category', cat_list),
    ('Order', order_list),
    ('Family', family_list),
)
tables = []
for position, frame in enumerate(tables_new):
    for column_name, labels in taxonomy_columns:
        frame[column_name] = labels[position]
    tables.append(frame)

In [11]:
# Stack all tables row-wise into one frame and replace the per-table row
# labels with a fresh 0..n-1 index (the old labels are discarded).
table = pd.concat(tables, axis=0).reset_index(drop=True)

# Append two columns filled with empty strings.
table = table.assign(colour='', size='')

In [12]:
# preview the first 50 rows of the combined table
table.head(50)

Unnamed: 0,Common name,Binomial,Notes,Category,Order,Family,colour,size
0,Southern cassowary,Casuarius casuarius,,Cassowaries,Casuariformes,Casuariidae,,
1,Emu,Dromaius novaehollandiae,,Emus,Casuariformes,Dromaiidae,,
2,King Island emu,Dromaius ater,extinct,Emus,Casuariformes,Dromaiidae,,
3,Kangaroo Island emu,Dromaius baudinianus,extinct,Emus,Casuariformes,Dromaiidae,,
4,Australian brushturkey,Alectura lathami,,Mound-builders,Galliformes,Megapodidae,,
5,Malleefowl,Leipoa ocellata,,Mound-builders,Galliformes,Megapodidae,,
6,Orange-footed scrubfowl,Megapodius reinwardt,,Mound-builders,Galliformes,Megapodidae,,
7,Stubble quail,Coturnix pectoralis,,Pheasants,Galliformes,Phasianidae,,
8,Brown quail,Coturnix ypsilophora,,Pheasants,Galliformes,Phasianidae,,
9,King quail,Excalfactoria chinensis,,Pheasants,Galliformes,Phasianidae,,


In [13]:
# Write the combined table to a CSV file in the working directory.
output_csv = 'birds_list.csv'
table.to_csv(output_csv)