In [1]:
# import pandas
import pandas as pd
# import urllib to deal with url connection
from urllib.request import Request, urlopen
# import beautiful soup to extract information from a webpage
from bs4 import BeautifulSoup as bs
# import regular expression lib
import re
# import multiprocessing lib to accelerate fetching data, windows does not support this lib
from multiprocessing import Pool
import string

In [2]:
# load the list of birds (one row per species) from the local CSV file
birds_list = pd.read_csv(filepath_or_buffer="birds_list.csv")

In [3]:
# preview the first five rows of the birds list
birds_list.head(5)

Unnamed: 0.1,Unnamed: 0,Common name,Binomial,Category,Order,Family
0,0,Southern cassowary,Casuarius casuarius,Cassowaries,Casuariformes,Casuariidae
1,1,Emu,Dromaius novaehollandiae,Emus,Casuariformes,Dromaiidae
2,2,King Island emu,Dromaius ater,Emus,Casuariformes,Dromaiidae
3,3,Kangaroo Island emu,Dromaius baudinianus,Emus,Casuariformes,Dromaiidae
4,4,Australian brushturkey,Alectura lathami,Mound-builders,Galliformes,Megapodidae


In [4]:
# build the page slug for every bird: lowercase common name with spaces
# turned into hyphens, like 'australian-brushturkey'
birds_list['Common_name_lowercase'] = (
    birds_list['Common name']
    .str.lower()
    .str.replace(' ', '-')
)

In [5]:
# generate the link to the http://www.oiseaux-birds.com/ card page of every bird
link_prefix = 'http://www.oiseaux-birds.com/'
# page name of each card: 'card-<slug>.html'
card_pages = 'card-' + birds_list['Common_name_lowercase'] + '.html'
birds_list['link'] = link_prefix + card_pages

In [6]:
# Length of the error page that http://www.oiseaux-birds.com/ returns when a bird
# cannot be found there (2282, as observed by the author); fetch() compares the
# length of each downloaded page against it to detect missing birds.
ERROR_PAGE_LEN = 2282

In [7]:
# try to get information for one specific bird

url = "http://www.oiseaux-birds.com/card-antipodean-albatross.html"
# send a browser-like User-Agent, otherwise the connection will be rejected.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# get the content of the webpage as bytes; the context manager closes the
# HTTP connection once the body has been read
with urlopen(req) as response:
    webpage = response.read()
# NOTE: str() of a bytes object is its repr ("b'...'" with newlines and
# non-ASCII bytes written as escape sequences), not decoded text. The regular
# expressions in the following cells search this repr string.
webpage_str = str(webpage)
# initialize a beautiful soup with lxml engine
soup = bs(webpage, "lxml")

In [8]:
# use a regular expression to get the height; if it doesn't exist, leave it blank (NaN).
# float("nan") is used because the pd.np alias no longer exists in pandas >= 2.0.
height_match = re.search('Height:(.*?)<br />', webpage_str)
height = height_match.group(1).strip() if height_match else float("nan")

In [9]:
# use a regular expression to get the length; if it doesn't exist, leave it blank (NaN).
# float("nan") is used because the pd.np alias no longer exists in pandas >= 2.0.
length_match = re.search('Length:(.*?)<br />', webpage_str)
length = length_match.group(1).strip() if length_match else float("nan")

In [10]:
# use a regular expression to get the weight; if it doesn't exist, leave it blank (NaN).
# float("nan") is used because the pd.np alias no longer exists in pandas >= 2.0.
weight_match = re.search('Weight:(.*?)</p>', webpage_str)
weight = weight_match.group(1).strip() if weight_match else float("nan")

In [11]:
# find all img tags in the html file
img_tags = soup.find_all("img")
# build the complete link of every image; tags without a src attribute are
# skipped instead of raising KeyError
imgs = [link_prefix + img['src'] for img in img_tags if img.get('src')]
if not imgs:
    # no usable image on the page: leave it blank (NaN)
    imgs = float("nan")

In [12]:
# find the voice link with regular expressions; both values default to blank (NaN)
# so they are always defined, even when 'VOICE' is present but no link follows it
voice = float("nan")
voice_text = float("nan")

# text between the 'VOICE' heading and the end of the first following anchor
voice_match = re.search('VOICE(.*?)</a>', webpage_str)
if voice_match:
    href_match = re.search('<a href="(.*?)"', voice_match.group(1).strip())
    if href_match:
        voice = href_match.group(1).strip()

        # the text around the voice link is read from three levels above the
        # text node that contains 'VOICE'
        for elem in soup(text=re.compile(r'VOICE')):
            voice_text = elem.parent.parent.parent.get_text()
            # drop the first 6 characters (the heading)
            voice_text = voice_text[6:].strip()

In [13]:
# extract habitat by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `habitat` when the marker is in the HTML but in no text node.
habitat = float("nan")
if re.search('HABITAT', webpage_str):
    for elem in soup(text=re.compile(r'HABITAT')):
        # the section text is read from three levels above the heading's text node
        habitat = elem.parent.parent.parent.get_text()
        # drop the first 8 characters (the heading)
        habitat = habitat[8:].strip()
        print(habitat)

The Antipodean  Albatross is a pelagic seabird that comes to land only for breeding on  windswept subantarctic islands. The nest is often built among dense vegetation  such as tussock grass and shrubs. It usually avoids areas with tall vegetation,  and exposed tops of hills or ridges. 
    It forages  over the shelf edge and deep water where it can find abundant food.


In [14]:
# extract description by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `description` when a marker is in the HTML but in no text node.
description = float("nan")
if re.search('DESCRIPTION OF THE BIRD', webpage_str):
    for elem in soup(text=re.compile(r'DESCRIPTION OF THE BIRD')):
        # with the long heading the text is read from four levels above the text node
        description = elem.parent.parent.parent.parent.get_text()
        description = description[25:].strip()
        print(description)
elif re.search('DESCRIPTION', webpage_str):
    # fall back to the short heading, read from three levels above the text node
    for elem in soup(text=re.compile(r'DESCRIPTION')):
        description = elem.parent.parent.parent.get_text()
        description = description[12:].strip()
        print(description)

# NaN is the only value that is not equal to itself, so this branch only runs
# when a description was actually extracted
if description == description:
    # remove the leading "Biometrics ..." paragraph (up to the last measurement unit);
    # raw string so that \d and \( reach the regex engine unchanged
    find_biometrics = re.search(r'Biometrics((?!to).|\n)*\d *(k?gr? *(\(.*\))?|cm *(\(.*\))?)\.?', description)
    if find_biometrics:
        description = description[:find_biometrics.span()[0]] + description[find_biometrics.span()[1]:]
        description = description.strip()
    # drop one leading character if it is not an ASCII letter
    if len(description) > 0 and description[0] not in string.ascii_letters:
        description = description[1:]
    description = description.strip()

Biometrics:  
    Length:  110-117 cm
    Wingspan:  280-330 cm
    Weight:  M: 7240 g – F: 5790 g
    Race “gibsoni”: M: 5500-8600 g – F: 4600-7300  g    
The Antipodean  Albatross adult male of nominate race has mostly white body with variable, but  usually dense, dark brown vermiculations. The upperwing is almost plain dark  brown with whitish shafts on outer primaries. The black uppertail is tipped  white. 
    The underwing  is white with narrow trailing edge. 
    The head  is white, including forehead, chin and throat. The crown is often dark brown. There  is a variable amount of brownish blotching on the side of the rear part of head  and neck. 
    The bill  is pale pink, with the hooked tip of the upper mandible tinged yellowish to  horn. The eyes are dark brown. Legs and webbed feet are greyish to pink.


In [15]:
# extract movements by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `movements` when the marker is in the HTML but in no text node.
movements = float("nan")
if re.search('MOVEMENTS', webpage_str):
    for elem in soup(text=re.compile(r'MOVEMENTS')):
        # the section text is read from three levels above the heading's text node
        movements = elem.parent.parent.parent.get_text()
        # drop the first 10 characters (the heading)
        movements = movements[10:].strip()
        print(movements)

In [16]:
# extract reproduction by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `reproduction` when the marker is in the HTML but in no text node.
reproduction = float("nan")
if re.search('REPRODUCTION', webpage_str):
    for elem in soup(text=re.compile(r'REPRODUCTION')):
        # the section text is read from three levels above the heading's text node
        reproduction = elem.parent.parent.parent.get_text()
        # drop the first 13 characters; NOTE: for the sample page this leaves the
        # rest of the heading ("OF THIS SPECIES:") at the start of the text
        reproduction = reproduction[13:].strip()
        print(reproduction)

OF THIS SPECIES: 
    The Antipodean  Albatross usually breeds biennially if successful. The laying occurs around January  on Antipodes Islands and around February on Chatham Islands. 
    The race  “gibsoni” breeds mostly in late December  on the Auckland Islands. 
    The nest  is a low truncated cone made with soil, twigs and roots, and the shallow  depression at top is sometimes lined with grass. It is often placed in open or  in scattered tussock grass or shrubs, and from coastline to inland areas such  as slopes or plateau.


In [17]:
# extract diet by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `diet` when the marker is in the HTML but in no text node.
diet = float("nan")
if re.search('DIET', webpage_str):
    for elem in soup(text=re.compile(r'DIET')):
        # the section text is read from three levels above the heading's text node
        diet = elem.parent.parent.parent.get_text()
        # drop the first 5 characters (the heading)
        diet = diet[5:].strip()
        print(diet)

In [18]:
# extract protection by beautiful soup; if it cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `protection` when the marker is in the HTML but in no text node.
protection = float("nan")
if re.search('PROTECTION', webpage_str):
    for elem in soup(text=re.compile(r'PROTECTION')):
        # the section text is read from three levels above the heading's text node
        protection = elem.parent.parent.parent.get_text()
        # drop the first 31 characters (the heading)
        protection = protection[31:].strip()
        print(protection)

The Antipodean  Albatross is affected by introduced predators such as pigs, cats, rats and mice  on some breeding islands. However, the main threat is at sea where the birds  take bait from hooks. Numerous birds are killed by long-line fisheries,  involving important declines over several following years. The global warming  also changes the ocean conditions and reduces prey abundance.
    In 2009,  the global population was estimated to number 44,500 mature individuals. There were  4,565 breeding pairs on Antipodes Islands in 2007/2009, and 3,277 pairs in the  Auckland group between 2006 and 2009. 
    The Antipodean  Albatross has reduced breeding range on some subantarctic islands. It is currently  classified as Vulnerable, but following declines, it could be reclassified as  Endangered or even Critically Endangered.


In [19]:
# extract sources by beautiful soup; if they cannot be found, leave it blank (NaN).
# Starting from NaN guarantees that the regex match object is never left in
# `sources` when the marker is in the HTML but in no text node.
sources = float("nan")
if re.search('Sources', webpage_str):
    for elem in soup(text=re.compile(r'Sources')):
        # the text is read from four levels above the text node containing 'Sources'
        sources = elem.parent.parent.parent.parent.get_text()
        # keep what lies between the 'Sources' heading and the 'Home page' link
        start = sources.find("Sources") + 9
        end = sources.find("Home page")
        if end == -1:
            # no 'Home page' text: keep everything up to the end instead of
            # letting the -1 slice bound cut off the last character
            end = len(sources)
        sources = sources[start:end].strip()
        print(sources)

HANDBOOK  OF THE BIRDS OF THE WORLD vol 1 by Josep del Hoyo-Andrew Elliot-Jordi Sargatal - Lynx Edicions - ISBN:  8487334105  
A  Complete Guide to Antarctic Wildlife by Hadoram Shirihai and Illustrated by Brett  Jarrett - Edited by Guy M. Kirwan - ALUL.A Press Oy, Finland - ISBN 9519894705
Avibase (Denis Lepage) 
BirdLife International 
HBW Alive 
ARKive (Christopher Parsons)  
New Zealand Birds Online  
Report  prepared for Department of Conservation by Graeme Elliott and Kath Walker -  November 2014
Antipodean wandering albatross –  population study 
NSW Government – Office of Environment  & Heritage  
Birds in Danger – Australia’s  Threatened Birds  
New Zealand bird status between 2008  and 2012 
Department of Sustainability,  Environment, Water, Population and Communities 
Ocean  Wanderers "Ride the Wave"


In [20]:
# Birds whose page is the site's error page are appended here by fetch().
# NOTE: when fetch() runs in a multiprocessing worker, the append happens in the
# worker's own copy of this list and is not visible in the parent process.
error_list = []

# blank value for everything that cannot be extracted (pd.np no longer exists in pandas >= 2.0)
_NAN = float("nan")

# raw string so that \d and \( reach the regex engine unchanged
_BIOMETRICS_RE = re.compile(r'Biometrics((?!to).|\n)*\d *(k?gr? *(\(.*\))?|cm *(\(.*\))?)\.?')


def _first_group(pattern, text):
    """Return the stripped first capture group of `pattern` in `text`, or NaN if there is no match."""
    # DOTALL lets '.' cross line breaks, which the decoded page text contains
    found = re.search(pattern, text, flags=re.DOTALL)
    return found.group(1).strip() if found else _NAN


def _section_text(soup, page_text, marker, skip, levels=3):
    """Return the text of the section introduced by `marker`, or NaN if it is absent.

    The text is read `levels` parents above the last text node containing
    `marker`, and its first `skip` characters (the heading) are dropped.
    """
    if marker not in page_text:
        return _NAN
    # stays NaN when the marker is in the HTML but in no text node
    section = _NAN
    for elem in soup(text=re.compile(marker)):
        container = elem
        for _ in range(levels):
            container = container.parent
        section = container.get_text()[skip:].strip()
    return section


def _sources_text(soup, page_text):
    """Return the text between the 'Sources' heading and 'Home page', or NaN if absent."""
    if 'Sources' not in page_text:
        return _NAN
    sources = _NAN
    for elem in soup(text=re.compile(r'Sources')):
        block = elem.parent.parent.parent.parent.get_text()
        start = block.find("Sources") + 9
        end = block.find("Home page")
        if end == -1:
            # no 'Home page' text: keep everything up to the end instead of
            # letting the -1 slice bound cut off the last character
            end = len(block)
        sources = block[start:end].strip()
    return sources


def _clean_description(description):
    """Remove the leading 'Biometrics ...' paragraph and one leading non-letter character."""
    if not isinstance(description, str):
        # NaN (nothing extracted): nothing to clean
        return description
    found = _BIOMETRICS_RE.search(description)
    if found:
        start, end = found.span()
        description = (description[:start] + description[end:]).strip()
    if len(description) > 0 and description[0] not in string.ascii_letters:
        description = description[1:]
    return description.strip()


# define a fetch function to get all the information for a multiprocessing pool
def fetch(idx):
    """Download and parse the oiseaux-birds page of the bird at label `idx` of `birds_list`.

    Returns NaN when the downloaded page has the length of the site's error
    page (the index is also appended to `error_list`). Otherwise returns a
    tuple `(row, imgs_dict)`: `row` is a copy of the bird's row extended with
    the entries height, length, weight, voice, habitat, description,
    movements, reproduction, diet, protection and sources (NaN where missing),
    and `imgs_dict` maps `idx` to the list of image links (or NaN).

    The row is built on a copy instead of being written into the global
    `birds_list`: writes made inside a pool worker never reach the parent
    process, and the returned row is what the callers consume.
    """
    url = birds_list.loc[idx, 'link']
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # the context manager closes the HTTP connection once the body has been read
    with urlopen(req) as response:
        webpage = response.read()

    # check for the error page before spending time on parsing
    if len(webpage) == ERROR_PAGE_LEN:
        error_list.append(idx)
        return _NAN

    soup = bs(webpage, "lxml")
    # decode the bytes into real text; str(bytes) would give the repr with
    # \xNN and \r\n escape sequences, which then end up in the extracted values
    page_text = webpage.decode(soup.original_encoding or 'utf-8', errors='replace')

    height = _first_group('Height:(.*?)<br />', page_text)
    length = _first_group('Length:(.*?)<br />', page_text)
    weight = _first_group('Weight:(.*?)</p>', page_text)

    # complete links of all images; tags without a src attribute are skipped
    imgs = [link_prefix + img['src'] for img in soup.find_all("img") if img.get('src')]
    if not imgs:
        imgs = _NAN

    # link of the first anchor that follows the 'VOICE' heading
    voice = _NAN
    voice_block = re.search('VOICE(.*?)</a>', page_text, flags=re.DOTALL)
    if voice_block:
        voice = _first_group('<a href="(.*?)"', voice_block.group(1).strip())

    habitat = _section_text(soup, page_text, 'HABITAT', 8)

    # the long heading variant is read one level higher than the short one
    if 'DESCRIPTION OF THE BIRD' in page_text:
        description = _section_text(soup, page_text, 'DESCRIPTION OF THE BIRD', 25, levels=4)
    else:
        description = _section_text(soup, page_text, 'DESCRIPTION', 12)
    description = _clean_description(description)

    movements = _section_text(soup, page_text, 'MOVEMENTS', 10)
    reproduction = _section_text(soup, page_text, 'REPRODUCTION', 13)
    diet = _section_text(soup, page_text, 'DIET', 5)
    protection = _section_text(soup, page_text, 'PROTECTION', 31)
    sources = _sources_text(soup, page_text)

    row = birds_list.loc[idx].copy()
    row['height'] = height
    row['length'] = length
    row['weight'] = weight
    row['voice'] = voice
    row['habitat'] = habitat
    row['description'] = description
    row['movements'] = movements
    row['reproduction'] = reproduction
    row['diet'] = diet
    row['protection'] = protection
    row['sources'] = sources

    return (row, {idx: imgs})

#### Even with 20 processes at a time, checking about 900 birds still takes a very long time, so the work is separated into 5 parts; otherwise it may crash or time out while executing these tasks.

In [21]:
# Extract all info for birds 0-199 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block is left, even if a
# fetch raises, so no worker processes are leaked.
with Pool(processes=20) as multi_pool:
    new_bird_list = multi_pool.map(fetch, range(200))

In [22]:
# Extract all info for birds 200-399 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block is left, even if a
# fetch raises, so no worker processes are leaked.
with Pool(processes=20) as multi_pool:
    new_bird_list1 = multi_pool.map(fetch, range(200, 400))

In [23]:
# Extract all info for birds 400-599 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block is left, even if a
# fetch raises, so no worker processes are leaked.
with Pool(processes=20) as multi_pool:
    new_bird_list2 = multi_pool.map(fetch, range(400, 600))

In [24]:
# Extract all info for birds 600-799 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block is left, even if a
# fetch raises, so no worker processes are leaked.
with Pool(processes=20) as multi_pool:
    new_bird_list3 = multi_pool.map(fetch, range(600, 800))

In [25]:
# Extract all info for the remaining birds (800 to the end of the list) with a
# pool of 20 worker processes. The context manager shuts the pool down when the
# block is left, even if a fetch raises, so no worker processes are leaked.
with Pool(processes=20) as multi_pool:
    new_bird_list4 = multi_pool.map(fetch, range(800, len(birds_list)))

In [26]:
# merge the 5 batches of results into one list, in batch order, and drop the
# float entries (fetch returns a float NaN for a bird it could not find)
fetched_parts = (new_bird_list, new_bird_list1, new_bird_list2, new_bird_list3, new_bird_list4)
birds_list_cleaned = [
    bird
    for part in fetched_parts
    for bird in part
    if type(bird) is not float
]

In [27]:
# check how many birds were found on oiseaux-birds (entries left after the
# not-found results have been filtered out)
len(birds_list_cleaned)

235

In [28]:
# split every fetched (row, images) pair: the image dicts go to imgs_list,
# the rows replace birds_list_cleaned
imgs_list = []
bird_rows = []
for fetched in birds_list_cleaned:
    imgs_list.append(fetched[1])
    bird_rows.append(fetched[0])
birds_list_cleaned = bird_rows

In [29]:
# build a dataframe from the rows fetched from oiseaux-birds, indexed by the
# bird id (the 'Unnamed: 0' column of the original list)
birds_new_df = (
    pd.DataFrame(birds_list_cleaned)
    .rename(columns={'Unnamed: 0': 'id'})
    .set_index('id')
)
birds_new_df.columns

Index(['Common name', 'Binomial', 'Category', 'Order', 'Family',
       'Common_name_lowercase', 'link', 'height', 'length', 'weight', 'voice',
       'habitat', 'description', 'movements', 'reproduction', 'diet',
       'protection', 'sources'],
      dtype='object')

In [30]:
# merge the per-bird image dicts into one dict, then build a dataframe with one
# column of image links per bird
imgs_dict = {}
for imgs in imgs_list:
    for bird_idx, img_links in imgs.items():
        imgs_dict[bird_idx] = img_links

# wrapping every value in a Series lets columns of different lengths coexist
imgs_df = pd.DataFrame({k: pd.Series(v) for k, v in imgs_dict.items()})

In [31]:
# write the bird data and the image links to their csv files
for frame, csv_name in ((birds_new_df, "birds.csv"), (imgs_df, "imgs.csv")):
    frame.to_csv(csv_name)

In [32]:
# collect the birds that were not found on oiseaux (index label absent from
# birds_new_df) and save them to a csv file
found_mask = birds_list.index.isin(birds_new_df.index)
incomplete_df = birds_list[~found_mask]
incomplete_df.to_csv("can_not_find_in_oiseaux.csv")