In [1]:
# import pandas
import pandas as pd
# import urllib to deal with url connection
from urllib.request import Request, urlopen
# import beautiful soup to extract information from a webpage
from bs4 import BeautifulSoup as bs
# import regular expression lib
import re
# import multiprocessing lib to accelerate fetching data, windows does not support this lib
from multiprocessing import Pool

In [2]:
# read the birds list from the local CSV file into a dataframe
birds_list = pd.read_csv("birds_list.csv")

In [3]:
# preview the first rows of the birds list to check the loaded columns
birds_list.head()

Unnamed: 0.1,Unnamed: 0,Common name,Binomial,Notes,Category,Order,Family,colour,size
0,0,Southern cassowary,Casuarius casuarius,,Cassowaries,Casuariformes,Casuariidae,,
1,1,Emu,Dromaius novaehollandiae,,Emus,Casuariformes,Dromaiidae,,
2,2,King Island emu,Dromaius ater,extinct,Emus,Casuariformes,Dromaiidae,,
3,3,Kangaroo Island emu,Dromaius baudinianus,extinct,Emus,Casuariformes,Dromaiidae,,
4,4,Australian brushturkey,Alectura lathami,,Mound-builders,Galliformes,Megapodidae,,


In [4]:
# build a URL slug from the common name: lowercase, with spaces replaced by
# hyphens, like 'australian-brushturkey'
birds_list['Common_name_lowercase'] = (
    birds_list['Common name'].str.lower().str.replace(' ', '-')
)

In [5]:
# generate the link to the http://www.oiseaux-birds.com/ card page of every bird,
# in the form <link_prefix>card-<common-name-slug>.html
link_prefix = 'http://www.oiseaux-birds.com/'
birds_list['link'] = link_prefix + 'card-' + birds_list['Common_name_lowercase'] + '.html'

In [6]:
# if a bird cannot be found on http://www.oiseaux-birds.com/, the site returns an error page
# whose raw content is 2282 bytes long; that length is used to detect missing birds.
ERROR_PAGE_LEN = 2282

In [7]:
# try to get the information for one specific bird

url = "http://www.oiseaux-birds.com/card-southern-cassowary.html"
# send a browser-like User-Agent header; per the original note, the connection is rejected without one.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# download the raw content (bytes) of the webpage
webpage = urlopen(req).read()
# convert to string. NOTE: str() on bytes yields the "b'...'" repr, in which line breaks and
# non-ASCII bytes appear as escape sequences; the regexes in the following cells run on that repr.
webpage_str = str(webpage)
# parse the page with beautiful soup using the lxml parser
soup = bs(webpage, "lxml")

In [8]:
# use a regular expression to get the height; if it doesn't exist, leave it blank (NaN).
# float("nan") is used instead of pd.np.nan because the pandas.np alias no longer exists in pandas 2.x.
height = re.search('Height:(.*?)<br />', webpage_str)
if height:
    height = height.group(1).strip()
else:
    height = float("nan")

In [9]:
# use a regular expression to get the length; if it doesn't exist, leave it blank (NaN).
# float("nan") is used instead of pd.np.nan because the pandas.np alias no longer exists in pandas 2.x.
length = re.search('Length:(.*?)<br />', webpage_str)
if length:
    length = length.group(1).strip()
else:
    length = float("nan")

In [10]:
# use a regular expression to get the weight; if it doesn't exist, leave it blank (NaN).
# float("nan") is used instead of pd.np.nan because the pandas.np alias no longer exists in pandas 2.x.
weight = re.search('Weight:(.*?)</p>', webpage_str)
if weight:
    weight = weight.group(1).strip()
else:
    weight = float("nan")

In [11]:
# find all img tags in the html file that actually carry a src attribute
# (src=True skips <img> tags without one, which would otherwise raise KeyError below)
imgs = soup.find_all("img", src=True)
if imgs:
    # list comprehension to build the complete link of every img file
    imgs = [link_prefix + img['src'] for img in imgs]
else:
    # float("nan") replaces pd.np.nan: the pandas.np alias no longer exists in pandas 2.x
    imgs = float("nan")

In [12]:
# find the position of the voice link by regular expression
voice = re.search('VOICE(.*?)</a>', webpage_str)
# default to NaN up front so voice_text is always defined, even when 'VOICE'
# is present but no link follows it
voice_text = float("nan")
if voice:
    # narrow the match down to the href of the first link after 'VOICE'
    voice = re.search('<a href="(.*?)"', voice.group(1).strip())
if voice:
    voice = voice.group(1).strip()

    # The parent of parent of parent of the text node containing 'VOICE' is where the voice text is
    for elem in soup(text=re.compile(r'VOICE')):
        voice_text = elem.parent.parent.parent.get_text()
        # drop the first 6 characters (presumably the 'VOICE' heading plus a separator)
        voice_text = voice_text[6:].strip()
else:
    # either 'VOICE' or the link is missing: leave the link blank instead of None
    voice = float("nan")

In [13]:
# extract habitat by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
habitat = float("nan")
if re.search('HABITAT', webpage_str):
    for elem in soup(text=re.compile(r'HABITAT')):
        # the section text lives three levels above the heading's text node;
        # the first 8 characters (presumably the heading itself) are dropped
        habitat = elem.parent.parent.parent.get_text()[8:].strip()
        print(habitat)

Southern Cassowary frequents mainly the rainforest.  This species prefers virgin forests where it can live free from disturbances. 
    It is often found at medium elevation, in dense  tropical rainforest, but also in galleries and swamp forests.


In [14]:
# extract description by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
description = float("nan")
if re.search('DESCRIPTION OF THE BIRD', webpage_str):
    # long heading variant: text is four levels up, first 25 characters dropped
    for elem in soup(text=re.compile(r'DESCRIPTION OF THE BIRD')):
        description = elem.parent.parent.parent.parent.get_text()[25:].strip()
        print(description)
elif re.search('DESCRIPTION', webpage_str):
    # short heading variant: text is three levels up, first 12 characters dropped
    for elem in soup(text=re.compile(r'DESCRIPTION')):
        description = elem.parent.parent.parent.get_text()[12:].strip()
        print(description)

Southern Cassowary is the second largest and tallest  bird after the Ostrich. This species differs in general facts by the female.  She has the criterions of most of the other species’ males, such as brighter  colours, really much larger size, and she does not share the nesting duties  with the male.


In [15]:
# extract location by beautiful soup; if it cannot be found, leave it blank (NaN)
# sadly only a few birds have location information in Australia, most of the time it is in other countries

# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
location = float("nan")
if re.search('RANGE', webpage_str):
    for elem in soup(text=re.compile(r'RANGE')):
        # section text is three levels above the heading; first 6 characters dropped
        location = elem.parent.parent.parent.get_text()[6:].strip()
        print(location)

Southern Cassowary lives in Aru and Seram Islands  of Indonesia, in New Guinea and NE Australia.


In [16]:
# extract movements by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
movements = float("nan")
if re.search('MOVEMENTS', webpage_str):
    for elem in soup(text=re.compile(r'MOVEMENTS')):
        # section text is three levels above the heading; first 10 characters dropped
        movements = elem.parent.parent.parent.get_text()[10:].strip()
        print(movements)

Southern Cassowary does not fly. Its powerful legs and  feet allow it to run at speeds of up to 50 km per hour. It is able to jump up to 5 metres. It also can  swim, and cross lakes and wide rivers without any difficulty.


In [17]:
# extract reproduction by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
reproduction = float("nan")
if re.search('REPRODUCTION', webpage_str):
    for elem in soup(text=re.compile(r'REPRODUCTION')):
        # section text is three levels above the heading; first 13 characters dropped
        reproduction = elem.parent.parent.parent.get_text()[13:].strip()
        print(reproduction)

Breeding season starts at the end of the dry season in  New Guinea, and from June to  October in Queensland. 
    The female copulates with several males, and  disappears as soon as the eggs are laid. 
    The nest is a shallow depression scraped in the  ground, with a lining of leaves and grasses. It is very well camouflaged in the  vegetation. 
    She lays 3 to 5 greenish eggs. Incubation by the male  lasts about 50 days. The male rears the chicks during about 9 months.


In [18]:
# extract diet by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
diet = float("nan")
if re.search('DIET', webpage_str):
    for elem in soup(text=re.compile(r'DIET')):
        # section text is three levels above the heading; first 5 characters dropped
        diet = elem.parent.parent.parent.get_text()[5:].strip()
        print(diet)

Southern Cassowary feeds mainly on fallen fruits of  different plant’s species. It also consumes invertebrates, small vertebrates  and sometimes carrion.


In [19]:
# extract protection by beautiful soup; if it cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
protection = float("nan")
if re.search('PROTECTION', webpage_str):
    for elem in soup(text=re.compile(r'PROTECTION')):
        # section text is three levels above the heading; the first 31 characters
        # (presumably the full heading line) are dropped
        protection = elem.parent.parent.parent.get_text()[31:].strip()
        print(protection)

Southern Cassowary is fairly widespread in New Guinea, but  declines due to habitat loss and hunting occurred. Disturbances are also an  important fact in local declines.


In [20]:
# extract sources by beautiful soup; if they cannot be found, leave it blank (NaN)
# starting from NaN guarantees the variable never ends up holding a regex Match
# object when the text search below finds no node
sources = float("nan")
if re.search('Sources', webpage_str):
    for elem in soup(text=re.compile(r'Sources')):
        sources = elem.parent.parent.parent.parent.get_text()
        # keep the text between the 'Sources' heading and the 'Home page' marker;
        # str.find returns -1 when the marker is missing, in which case slice to the
        # end of the text instead of silently dropping its last character
        end = sources.find("Home page")
        sources = sources[sources.find("Sources") + 9:end if end != -1 else None].strip()
        print(sources)

HANDBOOK  OF THE BIRDS OF THE WORLD vol 1 by Josep  del  Hoyo-Andrew Elliot-Jordi Sargatal - Lynx  Edicions - ISBN: 8487334105
L’ENCYCLOPEDIE MONDIALE DES  OISEAUX - Dr  Christopher M. Perrins -  BORDAS -  ISBN: 2040185607
Avibase (Lepage Denis)

Birds in backyards (Birds Australia and Australian Museum) 
ARKive (Christopher Parsons) 
El Zoológico Electrónico (Damisela)


In [21]:
# add one empty (blank string) column per scraped field, in the order they are filled in later
for new_column in (
    'Height', 'Length', 'Weight', 'Image', 'Voice', 'Habitat', 'Description',
    'Range', 'Movements', 'Reproduction', 'Diet', 'Protection', 'Sources',
):
    birds_list[new_column] = ''

In [22]:
# define an error_list; if a bird cannot be found on the website, its index is added to this list.
# NOTE: when fetch runs inside a multiprocessing Pool worker, the append happens in the worker's
# own copy of this list and is not visible in the parent process.
error_list = []

# marker for "value not found"; a plain Python float, so `type(x) is float` checks keep working
_NAN = float("nan")


def _first_group(pattern, text):
    """Return the stripped first capture group of `pattern` in `text`, or NaN when there is no match."""
    # DOTALL lets a field span line breaks in the decoded page text
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else _NAN


def _ancestor_text(soup, heading, levels=3):
    """Return the text of the `levels`-th ancestor of the last text node matching `heading`, or None."""
    found = None
    for node in soup(text=re.compile(heading)):
        container = node
        for _ in range(levels):
            container = container.parent
            if container is None:
                # node sits too close to the document root; ignore it
                break
        if container is not None:
            found = container.get_text()
    return found


def _section_text(soup, heading, skip, levels=3):
    """Return the section text around `heading` minus its first `skip` characters, or NaN if absent."""
    raw = _ancestor_text(soup, heading, levels)
    return _NAN if raw is None else raw[skip:].strip()


def _sources_text(soup):
    """Return the text between the 'Sources' heading and the 'Home page' marker, or NaN if absent."""
    raw = _ancestor_text(soup, r'Sources', levels=4)
    if raw is None:
        return _NAN
    start = raw.find("Sources") + 9
    end = raw.find("Home page")
    # str.find returns -1 when the marker is missing; slice to the end of the text
    # in that case instead of silently dropping the last character
    return raw[start:end if end != -1 else None].strip()


# define a fetch function to get all the information for a multiprocessing pool
def fetch(idx):
    """Scrape the oiseaux-birds card page of one bird and fill in its row of ``birds_list``.

    Parameters
    ----------
    idx : int
        Row label in ``birds_list`` of the bird to fetch.

    Returns
    -------
    float or tuple
        NaN when the site answered with its error page (bird not found). Otherwise a
        tuple ``(row, imgs_dict)``: ``row`` is the updated ``birds_list`` row and
        ``imgs_dict`` maps ``idx`` to the list of image links (NaN when the page has none).

    Notes
    -----
    The row is written into the ``birds_list`` of the current process. Inside a Pool
    worker that is a per-process copy, so callers must use the returned row.
    """
    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0
    url = birds_list.loc[idx, 'link']
    # browser-like User-Agent; per the original note the site rejects requests without one
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # the context manager closes the HTTP response even if read() fails
    with urlopen(req) as response:
        webpage = response.read()

    # check for the error page before paying for the HTML parse
    if len(webpage) == ERROR_PAGE_LEN:
        error_list.append(idx)
        return _NAN

    soup = bs(webpage, "lxml")
    # decode with the encoding Beautiful Soup detected, rather than str(bytes), whose
    # "b'...'" repr would leak escape sequences into the extracted values
    webpage_str = webpage.decode(soup.original_encoding or "utf-8", errors="replace")

    # src=True skips <img> tags without a src attribute (img['src'] would raise KeyError)
    img_tags = soup.find_all("img", src=True)
    imgs = [link_prefix + img['src'] for img in img_tags] if img_tags else _NAN

    # the voice link is the href of the first <a> following 'VOICE'; NaN if either is missing
    voice_block = _first_group('VOICE(.*?)</a>', webpage_str)
    if isinstance(voice_block, str):
        voice = _first_group('<a href="(.*?)"', voice_block)
    else:
        voice = _NAN

    # prefer the long heading variant, fall back to the short one
    description = _section_text(soup, r'DESCRIPTION OF THE BIRD', 25, levels=4)
    if isinstance(description, float):
        description = _section_text(soup, r'DESCRIPTION', 12)

    scraped = {
        'Height': _first_group('Height:(.*?)<br />', webpage_str),
        'Length': _first_group('Length:(.*?)<br />', webpage_str),
        'Weight': _first_group('Weight:(.*?)</p>', webpage_str),
        # only the first image link goes into the main table
        'Image': imgs if isinstance(imgs, float) else imgs[0],
        'Voice': voice,
        'Habitat': _section_text(soup, r'HABITAT', 8),
        'Description': description,
        'Range': _section_text(soup, r'RANGE', 6),
        'Movements': _section_text(soup, r'MOVEMENTS', 10),
        'Reproduction': _section_text(soup, r'REPRODUCTION', 13),
        'Diet': _section_text(soup, r'DIET', 5),
        'Protection': _section_text(soup, r'PROTECTION', 31),
        'Sources': _sources_text(soup),
    }
    for column, value in scraped.items():
        birds_list.loc[idx, column] = value

    return (birds_list.loc[idx], {idx: imgs})

#### Even with 20 processes at a time, checking about 900 birds still takes a very long time. The work is separated into 5 parts; otherwise it may crash or time out while executing these tasks.

In [24]:
# Extract all info for birds 0-199 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block exits, so the workers
# are not leaked if map() raises (e.g. a network error inside fetch).
with Pool(processes=20) as multi_pool:
    new_bird_list = multi_pool.map(fetch, range(200))

In [25]:
# Extract all info for birds 200-399 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block exits, so the workers
# are not leaked if map() raises (e.g. a network error inside fetch).
with Pool(processes=20) as multi_pool:
    new_bird_list1 = multi_pool.map(fetch, range(200, 400))

In [26]:
# Extract all info for birds 400-599 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block exits, so the workers
# are not leaked if map() raises (e.g. a network error inside fetch).
with Pool(processes=20) as multi_pool:
    new_bird_list2 = multi_pool.map(fetch, range(400, 600))

In [27]:
# Extract all info for birds 600-799 with a pool of 20 worker processes.
# The context manager shuts the pool down when the block exits, so the workers
# are not leaked if map() raises (e.g. a network error inside fetch).
with Pool(processes=20) as multi_pool:
    new_bird_list3 = multi_pool.map(fetch, range(600, 800))

In [28]:
# Extract all info for the remaining birds (800 to the end) with a pool of 20 worker processes.
# The context manager shuts the pool down when the block exits, so the workers
# are not leaked if map() raises (e.g. a network error inside fetch).
with Pool(processes=20) as multi_pool:
    new_bird_list4 = multi_pool.map(fetch, range(800, len(birds_list)))

In [30]:
# walk the 5 partial result lists in order and keep only real results;
# fetch returns a bare float (NaN) for birds that were not found, so those are dropped
birds_list_cleaned = [
    result
    for part in (new_bird_list, new_bird_list1, new_bird_list2, new_bird_list3, new_bird_list4)
    for result in part
    if type(result) is not float
]

In [31]:
# check how many birds were found on oiseaux-birds
len(birds_list_cleaned)

235

In [32]:
# split the (row, imgs_dict) tuples returned by fetch into two parallel lists.
# NOTE: this rebinds birds_list_cleaned, so the cell must not be run twice in a row.
imgs_list, birds_list_cleaned = (
    [pair[1] for pair in birds_list_cleaned],
    [pair[0] for pair in birds_list_cleaned],
)

In [33]:
# build a dataframe from the rows scraped from oiseaux-birds and give the
# 'Unnamed: 0' column a meaningful name
birds_new_df = pd.DataFrame(birds_list_cleaned).rename(columns={'Unnamed: 0': 'Idx'})
birds_new_df.columns

Index(['Idx', 'Common name', 'Binomial', 'Notes', 'Category', 'Order',
       'Family', 'colour', 'size', 'Common_name_lowercase', 'link', 'Height',
       'Length', 'Weight', 'Image', 'Voice', 'Habitat', 'Description', 'Range',
       'Movements', 'Reproduction', 'Diet', 'Protection', 'Sources'],
      dtype='object')

In [34]:
# merge the per-bird {index: image links} dicts into a single mapping
imgs_dict = {}
for imgs in imgs_list:
    imgs_dict.update(imgs)

# one column per bird; wrapping each value in a Series lets birds with different
# numbers of images share one dataframe (shorter columns are padded with NaN)
imgs_df = pd.DataFrame({bird_idx: pd.Series(links) for bird_idx, links in imgs_dict.items()})

In [35]:
# save the scraped bird data and the image links to csv files
birds_new_df.to_csv("birds.csv")
imgs_df.to_csv("imgs.csv")

In [36]:
# collect the birds that could not be found on oiseaux-birds (rows of birds_list whose
# index does not appear in birds_new_df) and save them to a csv file
incomplete_df = birds_list[~birds_list.index.isin(birds_new_df.index)]
incomplete_df.to_csv("can_not_find_in_oiseaux.csv")