In [1]:
import pandas as pd
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs
import re
from multiprocessing import Pool

In [2]:
# Load the bird table that the rest of the notebook extends with scraped columns.
birds_list = pd.read_csv("birds_list.csv")

In [3]:
# Preview the first rows to see which columns were loaded.
birds_list.head()

Unnamed: 0.1,Unnamed: 0,Common name,Binomial,Notes,Category,Order,Family,colour,size
0,0,Southern cassowary,Casuarius casuarius,,Cassowaries,Casuariformes,Casuariidae,,
1,1,Emu,Dromaius novaehollandiae,,Emus,Casuariformes,Dromaiidae,,
2,2,King Island emu,Dromaius ater,extinct,Emus,Casuariformes,Dromaiidae,,
3,3,Kangaroo Island emu,Dromaius baudinianus,extinct,Emus,Casuariformes,Dromaiidae,,
4,4,Australian brushturkey,Alectura lathami,,Mound-builders,Galliformes,Megapodidae,,


In [4]:
# Turn each common name into the lowercase, hyphen-separated slug used to build page links.
birds_list['Common_name_lowercase'] = (
    birds_list['Common name']
    .str.lower()
    .str.replace(' ', '-')
)

In [5]:
# Site root; also prepended to image paths when pages are scraped.
link_prefix = 'http://www.oiseaux-birds.com/'
# Page file name for every bird: "card-<slug>.html".
card_pages = 'card-' + birds_list['Common_name_lowercase'] + '.html'
birds_list['link'] = link_prefix + card_pages

In [6]:
# Byte length of the response that fetch() treats as an error page and skips.
# NOTE(review): presumably the site's "page not found" document — confirm it is still 2282 bytes.
ERROR_PAGE_LEN = 2282

In [7]:
# Download one known page to prototype the extraction steps on.
url = "http://www.oiseaux-birds.com/card-pink-eared-duck.html"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Read inside a with-block so the HTTP response is closed as soon as the body is in memory.
with urlopen(req) as response:
    webpage = response.read()
# str() of a bytes object yields its escaped b'...' repr; the regex cells search that form.
webpage_str = str(webpage)
soup = bs(webpage, "lxml")

In [8]:
# Text between "Height:" and the next <br />; NaN marks a page without that field.
# float("nan") is used because pd.np is no longer available in current pandas releases.
height_match = re.search('Height:(.*?)<br />', webpage_str)
height = height_match.group(1).strip() if height_match else float("nan")

In [9]:
# Text between "Length:" and the next <br />; NaN marks a page without that field.
# float("nan") is used because pd.np is no longer available in current pandas releases.
length_match = re.search('Length:(.*?)<br />', webpage_str)
length = length_match.group(1).strip() if length_match else float("nan")

In [10]:
# Text between "Weight:" and the closing </p>; NaN marks a page without that field.
# float("nan") is used because pd.np is no longer available in current pandas releases.
weight_match = re.search('Weight:(.*?)</p>', webpage_str)
weight = weight_match.group(1).strip() if weight_match else float("nan")

In [11]:
# Absolute URLs of every image on the page, or NaN when the page has none.
# src=True skips <img> tags without a src attribute, which would otherwise raise KeyError.
img_tags = soup.find_all("img", src=True)
imgs = [link_prefix + tag['src'] for tag in img_tags] if img_tags else float("nan")

In [12]:
# Default both results to NaN so they are always defined, including when the VOICE
# block exists but holds no link (previously voice became None and voice_text was unset).
voice = float("nan")
voice_text = float("nan")

# The link to the recording is the first href between "VOICE" and the next </a>.
voice_block = re.search('VOICE(.*?)</a>', webpage_str)
if voice_block:
    voice_href = re.search('<a href="(.*?)"', voice_block.group(1).strip())
    if voice_href:
        voice = voice_href.group(1).strip()

        # Text of the block holding the heading; the last matching node wins and the
        # first 6 characters (the "VOICE:" heading) are dropped.
        voice_nodes = soup(text=re.compile(r'VOICE'))
        if voice_nodes:
            voice_text = voice_nodes[-1].parent.parent.parent.get_text()[6:].strip()

In [13]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# habitat is always either text or NaN, never a leftover regex match object.
habitat_nodes = soup(text=re.compile(r'HABITAT'))
if habitat_nodes:
    # The last matching node wins; drop the 8 leading characters (the "HABITAT:" heading).
    habitat = habitat_nodes[-1].parent.parent.parent.get_text()[8:].strip()
    print(habitat)
else:
    habitat = float("nan")

The Pink-eared  Duck frequents inland wetlands, shallow lakes and ponds, stagnant water with  rich aquatic life (including sewage ponds), often saline and brackish waters of  inland continent. It usually avoids the fast-moving, clear or deep waters, and  the coastal wetlands with high rainfall. Large flocks can be seen in extensive  open wetlands.


In [14]:
# Try the long heading first, then the short one; each entry carries the number of
# leading characters (heading plus colon) to drop from the block text.
description = float("nan")
for heading, prefix_len in (('DESCRIPTION OF THE BIRD', 24), ('DESCRIPTION', 12)):
    description_nodes = soup(text=re.compile(heading))
    if description_nodes:
        # The last matching node wins.
        description = description_nodes[-1].parent.parent.parent.get_text()[prefix_len:].strip()
        print(description)
        break

Biometrics:
    Length:  36-45 cm
    Wingspan:  57-70 cm
    Weight:  M: 290-480 g – F: 272-423 g


In [15]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# location is always either text or NaN, never a leftover regex match object.
range_nodes = soup(text=re.compile(r'RANGE'))
if range_nodes:
    # The last matching node wins; drop the 6 leading characters (the "RANGE:" heading).
    location = range_nodes[-1].parent.parent.parent.get_text()[6:].strip()
    print(location)
else:
    location = float("nan")

Southern Cassowary lives in Aru and Seram Islands  of Indonesia, in New Guinea and NE Australia.


In [16]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# movements is always either text or NaN, never a leftover regex match object.
movement_nodes = soup(text=re.compile(r'MOVEMENTS'))
if movement_nodes:
    # The last matching node wins; drop the 10 leading characters (the "MOVEMENTS:" heading).
    movements = movement_nodes[-1].parent.parent.parent.get_text()[10:].strip()
    print(movements)
else:
    movements = float("nan")

Southern Cassowary does not fly. Its powerful legs and  feet allow it to run at speeds of up to 50 km per hour. It is able to jump up to 5 metres. It also can  swim, and cross lakes and wide rivers without any difficulty.


In [17]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# reproduction is always either text or NaN, never a leftover regex match object.
reproduction_nodes = soup(text=re.compile(r'REPRODUCTION'))
if reproduction_nodes:
    # The last matching node wins; drop the 13 leading characters (the "REPRODUCTION:" heading).
    reproduction = reproduction_nodes[-1].parent.parent.parent.get_text()[13:].strip()
    print(reproduction)
else:
    reproduction = float("nan")

Breeding season starts at the end of the dry season in  New Guinea, and from June to  October in Queensland. 
    The female copulates with several males, and  disappears as soon as the eggs are laid. 
    The nest is a shallow depression scraped in the  ground, with a lining of leaves and grasses. It is very well camouflaged in the  vegetation. 
    She lays 3 to 5 greenish eggs. Incubation by the male  lasts about 50 days. The male rears the chicks during about 9 months.


In [18]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# diet is always either text or NaN, never a leftover regex match object.
diet_nodes = soup(text=re.compile(r'DIET'))
if diet_nodes:
    # The last matching node wins; drop the 5 leading characters (the "DIET:" heading).
    diet = diet_nodes[-1].parent.parent.parent.get_text()[5:].strip()
    print(diet)
else:
    diet = float("nan")

Southern Cassowary feeds mainly on fallen fruits of  different plant’s species. It also consumes invertebrates, small vertebrates  and sometimes carrion.


In [19]:
# Find the text node(s) carrying the heading; deciding on the soup result alone means
# protection is always either text or NaN, never a leftover regex match object.
protection_nodes = soup(text=re.compile(r'PROTECTION'))
if protection_nodes:
    # The last matching node wins; drop the 31 leading characters.
    # NOTE(review): 31 presumably covers a longer heading that starts with "PROTECTION" — confirm.
    protection = protection_nodes[-1].parent.parent.parent.get_text()[31:].strip()
    print(protection)
else:
    protection = float("nan")

Southern Cassowary is fairly widespread in New Guinea, but  declines due to habitat loss and hunting occurred. Disturbances are also an  important fact in local declines.


In [20]:
# The sources sit between the "Sources" marker and the "Home page" footer text of the
# enclosing block (four levels above the text node; the last matching node wins).
source_nodes = soup(text=re.compile(r'Sources'))
if source_nodes:
    sources_block = source_nodes[-1].parent.parent.parent.parent.get_text()
    sources_end = sources_block.find("Home page")
    if sources_end == -1:
        # No footer marker: keep everything to the end. Passing -1 straight into the
        # slice would silently cut off the final character.
        sources_end = len(sources_block)
    # +9 skips the 7-character marker plus the two characters that follow it.
    sources = sources_block[sources_block.find("Sources") + 9:sources_end].strip()
    print(sources)
else:
    sources = float("nan")

HANDBOOK  OF THE BIRDS OF THE WORLD vol 1 by Josep  del  Hoyo-Andrew Elliot-Jordi Sargatal - Lynx  Edicions - ISBN: 8487334105
L’ENCYCLOPEDIE MONDIALE DES  OISEAUX - Dr  Christopher M. Perrins -  BORDAS -  ISBN: 2040185607
Avibase (Lepage Denis)

Birds in backyards (Birds Australia and Australian Museum) 
ARKive (Christopher Parsons) 
El Zoológico Electrónico (Damisela)


In [21]:
# Debug check: position of the "Sources" marker in the current value (-1 means absent).
sources.find("Sources")

-1

In [22]:
# Debug check: re-applies the extraction slice to the current value of `sources`.
# When a marker is missing, find() returns -1, so the start becomes 8 and/or the
# end index -1 drops the final character.
sources[sources.find("Sources") + 9:sources.find("Home page")].strip()

'OF THE BIRDS OF THE WORLD vol 1 by Josep  del  Hoyo-Andrew Elliot-Jordi Sargatal - Lynx  Edicions - ISBN: 8487334105\nL’ENCYCLOPEDIE MONDIALE DES  OISEAUX - Dr  Christopher M. Perrins - \xa0BORDAS -  ISBN: 2040185607\nAvibase (Lepage Denis)\n\nBirds in backyards (Birds Australia and Australian Museum) \nARKive (Christopher Parsons) \nEl Zoológico Electrónico (Damisela'

In [23]:
# Add one empty-string column per scraped field, in the order the fields are written.
for scraped_column in (
    'Height',
    'Length',
    'Weight',
    'Image',
    'Voice',
    'Habitat',
    'Description',
    'Range',
    'Movements',
    'Reproduction',
    'Diet',
    'Protection',
    'Sources',
):
    birds_list[scraped_column] = ''

In [24]:
# Indices whose page turned out to be the site's error page.
# NOTE: when fetch runs inside a multiprocessing Pool, the append happens in the worker
# process, so the notebook's own copy of this list is not updated.
error_list = []


def _first_group(pattern, text):
    """Return the stripped first capture group of `pattern` in `text`, or NaN if absent."""
    # DOTALL lets the lazy `.*?` span line breaks in the decoded page.
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else float("nan")


def _section_text(soup, *candidates):
    """Return the text of the block that holds a section heading, or NaN.

    Each candidate is a `(heading, prefix_len)` pair tried in order; `prefix_len` is the
    number of leading characters (the heading itself) to drop from the block's text.
    The block is the third ancestor of the last text node that contains the heading.
    """
    for heading, prefix_len in candidates:
        nodes = soup(text=re.compile(re.escape(heading)))
        if nodes:
            return nodes[-1].parent.parent.parent.get_text()[prefix_len:].strip()
    return float("nan")


def _sources_text(soup):
    """Return the text between the "Sources" marker and the "Home page" footer, or NaN."""
    nodes = soup(text=re.compile(r'Sources'))
    if not nodes:
        return float("nan")
    block = nodes[-1].parent.parent.parent.parent.get_text()
    start = block.find("Sources")
    if start == -1:
        return float("nan")
    end = block.find("Home page")
    if end == -1:
        # No footer marker: keep everything to the end instead of letting a -1 slice
        # bound silently cut off the last character.
        end = len(block)
    # +9 skips the 7-character marker plus the two characters that follow it.
    return block[start + 9:end].strip()


def fetch(idx):
    """Scrape the page for row `idx` of `birds_list` and store the extracted fields.

    Parameters
    ----------
    idx : index label of the row in `birds_list`; its 'link' value is downloaded.

    Returns
    -------
    float NaN when the response has the length of the site's error page (the index is
    also appended to `error_list`); otherwise a tuple `(row, imgs_dict)` where `row` is
    the updated `birds_list` row and `imgs_dict` maps `idx` to the list of image URLs
    found on the page (or NaN when the page has no images).

    Raises
    ------
    Whatever `urlopen` raises for network or HTTP failures; these are not caught here.
    """
    nan = float("nan")

    # .loc replaces the removed .ix indexer; both are label-based for this lookup.
    url = birds_list.loc[idx, 'link']
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as response:
        webpage = response.read()

    # Bail out before parsing: there is nothing to extract from the error page.
    if len(webpage) == ERROR_PAGE_LEN:
        error_list.append(idx)
        return nan

    soup = bs(webpage, "lxml")
    # Decode with the encoding BeautifulSoup detected. str(bytes) would give the escaped
    # b'...' repr, leaking sequences such as \xe2\x80\x93 or \' into the extracted values.
    webpage_str = webpage.decode(soup.original_encoding or 'utf-8', errors='replace')

    height = _first_group('Height:(.*?)<br />', webpage_str)
    length = _first_group('Length:(.*?)<br />', webpage_str)
    weight = _first_group('Weight:(.*?)</p>', webpage_str)

    # src=True skips <img> tags without a src attribute (they would raise KeyError).
    img_tags = soup.find_all("img", src=True)
    imgs = [link_prefix + tag['src'] for tag in img_tags] if img_tags else nan

    # The recording link is the first href between "VOICE" and the next </a>.
    # It stays NaN (not None) when the block exists but contains no link.
    voice = nan
    voice_block = re.search('VOICE(.*?)</a>', webpage_str, re.DOTALL)
    if voice_block:
        voice_href = re.search('<a href="(.*?)"', voice_block.group(1).strip(), re.DOTALL)
        if voice_href:
            voice = voice_href.group(1).strip()

    extracted = {
        'Height': height,
        'Length': length,
        'Weight': weight,
        # Only the first image goes into the table; the full list is returned separately.
        'Image': imgs if isinstance(imgs, float) else imgs[0],
        'Voice': voice,
        'Habitat': _section_text(soup, ('HABITAT', 8)),
        'Description': _section_text(soup, ('DESCRIPTION OF THE BIRD', 24), ('DESCRIPTION', 12)),
        'Range': _section_text(soup, ('RANGE', 6)),
        'Movements': _section_text(soup, ('MOVEMENTS', 10)),
        'Reproduction': _section_text(soup, ('REPRODUCTION', 13)),
        'Diet': _section_text(soup, ('DIET', 5)),
        'Protection': _section_text(soup, ('PROTECTION', 31)),
        'Sources': _sources_text(soup),
    }
    for column, value in extracted.items():
        birds_list.loc[idx, column] = value

    imgs_dict = {idx: imgs}
    return (birds_list.loc[idx], imgs_dict)

In [25]:
import time
# Wall-clock reference point for timing the scraping batches that follow.
start_time = time.time()

In [26]:
# Scrape rows 0-199 with 20 worker processes. The with-block shuts the pool down once
# map() has returned every result, and also when a fetch raises.
with Pool(processes=20) as multi_pool:
    new_bird_list = multi_pool.map(fetch, range(200))

In [27]:
# Scrape rows 200-399 with 20 worker processes. The with-block shuts the pool down once
# map() has returned every result, and also when a fetch raises.
with Pool(processes=20) as multi_pool:
    new_bird_list1 = multi_pool.map(fetch, range(200, 400))

In [28]:
# Scrape rows 400-599 with 20 worker processes. The with-block shuts the pool down once
# map() has returned every result, and also when a fetch raises.
with Pool(processes=20) as multi_pool:
    new_bird_list2 = multi_pool.map(fetch, range(400, 600))

In [29]:
# Scrape rows 600-799 with 20 worker processes. The with-block shuts the pool down once
# map() has returned every result, and also when a fetch raises.
with Pool(processes=20) as multi_pool:
    new_bird_list3 = multi_pool.map(fetch, range(600, 800))

In [30]:
# Scrape the remaining rows (800 onward) with 20 worker processes. The with-block shuts
# the pool down once map() has returned every result, and also when a fetch raises.
with Pool(processes=20) as multi_pool:
    new_bird_list4 = multi_pool.map(fetch, range(800, len(birds_list)))

In [31]:
# Seconds elapsed since start_time was recorded.
print(time.time() - start_time)

102.71105003356934


In [32]:
# Flatten the five batches in order and keep only real results; fetch returns a bare
# float (NaN) for pages it skipped.
birds_list_cleaned = [
    bird
    for batch in (new_bird_list, new_bird_list1, new_bird_list2, new_bird_list3, new_bird_list4)
    for bird in batch
    if type(bird) is not float
]

In [33]:
# Number of fetch results left after dropping the NaN placeholders.
len(birds_list_cleaned)

235

In [34]:
# Each remaining result is the (row, imgs_dict) tuple returned by fetch.
# Order matters: imgs_list must be built before birds_list_cleaned is overwritten with
# the rows alone, so this cell cannot be re-run on its own output.
imgs_list = [bird[1] for bird in birds_list_cleaned]
birds_list_cleaned = [bird[0] for bird in birds_list_cleaned]

In [35]:
# Stack the scraped rows into a frame and give the saved-index column a readable name.
birds_new_df = pd.DataFrame(birds_list_cleaned).rename(columns={'Unnamed: 0': 'Idx'})
birds_new_df.columns

Index(['Idx', 'Common name', 'Binomial', 'Notes', 'Category', 'Order',
       'Family', 'colour', 'size', 'Common_name_lowercase', 'link', 'Height',
       'Length', 'Weight', 'Image', 'Voice', 'Habitat', 'Description', 'Range',
       'Movements', 'Reproduction', 'Diet', 'Protection', 'Sources'],
      dtype='object')

In [36]:
# Merge the per-bird {index: image URLs} dicts into one mapping (later entries win).
imgs_dict = {}
for per_bird_imgs in imgs_list:
    for bird_idx, bird_imgs in per_bird_imgs.items():
        imgs_dict[bird_idx] = bird_imgs

# One column per bird; wrapping each list in a Series lets columns of different
# lengths be padded with NaN.
imgs_df = pd.DataFrame({bird_idx: pd.Series(bird_imgs) for bird_idx, bird_imgs in imgs_dict.items()})

In [37]:
# Persist the scraped rows and the per-bird image URL table.
birds_new_df.to_csv("birds.csv")
imgs_df.to_csv("imgs.csv")

In [48]:
# Rows of birds_list whose index label is not among the scraped rows.
incomplete_df = birds_list[~birds_list.index.isin(birds_new_df.index)]

In [50]:
# Save the rows that were not scraped.
incomplete_df.to_csv("incomplete.csv")