In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import warnings
import tqdm as tqdm

warnings.filterwarnings('ignore', message='Unverified HTTPS request')

In [2]:
url = 'https://www.nhm.ac.uk/discover/dino-directory/name/name-az-all.html'

In [3]:
response = requests.get(url)

print('Status Code: ', response.status_code)
html = response.text

print('\nFirst part of HML document fetched as string:\n')
print(html[:700])

Status Code:  200

First part of HML document fetched as string:

<!DOCTYPE html>
<!--[if lte IE 8 ]>
<html lang="en" class="no-js oldie">
<![endif]-->
<!--[if IE 9 ]>
<html lang="en" class="no-js ie9">
<![endif]-->
<!--[if (gt IE 9)|!(IE)]><!-->
<html lang="en" class="no-js">
    <!--<![endif]-->





<head>
	<meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <meta http-equiv="content-type" content="text/html; charset=UTF-8">
    
    <title>The Dino Directory | Natural History Museum</title>
    <meta name="description" content="">
    <meta name="keywords" content="">

    
    
    <meta property="og:title" content="The Dino Directory | Natural History Museum">
    <meta property="


In [4]:
def get_page_contents(url):
    page = requests.get(url, verify = False, headers={"Accept-Language": "en-US"})
    return BeautifulSoup(page.text, "html.parser")
soup = get_page_contents(url)

In [5]:
names_url = soup.findAll('li', class_= 'dinosaurfilter--dinosaur dinosaurfilter--all-list')

In [6]:
names_url[2]

<li class="dinosaurfilter--dinosaur dinosaurfilter--all-list">
<a href="https://www.nhm.ac.uk/discover/dino-directory/achelousaurus.html">
<p class="dinosaurfilter--name dinosaurfilter--name-unhyphenated">
							Achelousaurus
						</p>
</a>
</li>

In [7]:
import re
regex = re.compile(r'[\n\r\t]')
dinosaur_name = [regex.sub("", name.find('p').text) for name in names_url]

In [8]:
dinosaur_name[3]

'Achillobator'

In [9]:
dinosaur_url = [link.find('a')['href'] for link in names_url]

In [10]:
dinosaur_url[2]

'https://www.nhm.ac.uk/discover/dino-directory/achelousaurus.html'

In [11]:
from tqdm import tqdm

In [12]:
test_name = []


for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    test_name.append(new_soup.find('h1', class_= 'dinosaur--name dinosaur--name-unhyphenated').text)

100%|██████████| 309/309 [00:53<00:00,  5.83it/s]


In [14]:
test_name[2]

'Achelousaurus'

In [15]:
dinosaur_name = test_name

In [16]:
pronunciation = []

for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    pronunciation.append(new_soup.find('dd', class_='dinosaur--pronunciation').text)

100%|██████████| 309/309 [00:54<00:00,  5.69it/s]


In [17]:
pronunciation[2]

'ah-KEL-oo-SORE-us'

In [18]:
name_meaning = []

for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    name_meaning.append(new_soup.find('dd', class_='dinosaur--meaning').text)

100%|██████████| 309/309 [00:52<00:00,  5.90it/s]


In [19]:
name_meaning[2]

"'Achelous' lizard'"

In [20]:
#get all

dino_img = []
length = []
dino_type = []
diet_type = []
living_period = []
living_area = []
taxonomy = []
named_by = []
species_type = []

for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    #get image
    try:
        image = new_soup.find('img', class_='dinosaur--image')['src']
    except:
        image = ''
    dino_img.append(image)
        
    
    #dino length
    try:
        length_data = new_soup.find('dl', class_='dinosaur--description dinosaur--list').find_all('dd')[1].text
    except IndexError:
        length_data = ''
    length.append(length_data)
    
    #get type
    try:
        type = new_soup.find('dl', class_='dinosaur--description dinosaur--list').find('dd').find('a').text.strip()
    except:
        type = ''
    dino_type.append(type)
    
    #get diet type
    try:
        diet = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[0].text.strip()
    except IndexError:
        diet = ''
    diet_type.append(diet)
    
    #get taxonomy
    try:
        taxo = new_soup.find('dl', class_='dinosaur--taxonomy dinosaur--list').find_all('dd')[0].text.strip()
    except IndexError:
        taxo = ''
    taxonomy.append(taxo)
    
    #get named_by
    try:
        named = new_soup.find('dl', class_='dinosaur--taxonomy dinosaur--list').find_all('dd')[1].text.strip()
    except IndexError:
        named = ''
    named_by.append(named)
    
    #get species type
    try:
        species = new_soup.find('dl', class_='dinosaur--taxonomy dinosaur--list').find_all('dd')[2].text.strip()
    except IndexError:
        species = ''
    species_type.append(species)
    

100%|██████████| 309/309 [00:50<00:00,  6.16it/s]


In [21]:
#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        period = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[1].text.strip()
    except IndexError:
        period = ''
    living_period.append(period)
    

100%|██████████| 309/309 [01:06<00:00,  4.67it/s]


In [26]:
living_period

['Early Jurassic, 199-189 million years ago',
 'Late Cretaceous, 74-70 million years ago',
 'Late Cretaceous, 83-70 million years ago',
 'Late Cretaceous, 99-84 million years ago',
 'Early Cretaceous, 115-105 million years ago',
 'Late Cretaceous, 98-93 million years ago',
 'Early Cretaceous, 132-121 million years ago',
 'Late Jurassic, 169-159 million years ago',
 'Late Cretaceous, 70-65 million years ago',
 'Late Cretaceous, 80-75 million years ago',
 'saw-edged, flesh-slicing teeth',
 'Late Cretaceous, 90-70 million years ago',
 'Late Cretaceous, 71-65 million years ago',
 'dagger-like with serrated edges',
 'Late Cretaceous, 89-85 million years ago',
 'teeth for grinding and chewing',
 'Early Jurassic, 195-180 million years ago',
 'Late Cretaceous, 71-65 million years ago',
 'Mid Jurassic, 177-169 million years ago',
 'Late Cretaceous, 74-70 million years ago',
 'Early Jurassic, 190 million years ago',
 'grinding teeth with a horny beak at the front',
 'Late Cretaceous, 84-65 milli

In [33]:
living_period.index('Mid Jurassic, 159-154 million years ago',)

303

In [55]:
dinosaur_url[108]

'https://www.nhm.ac.uk/discover/dino-directory/gallimimus.html'

In [51]:
new_soup = get_page_contents('https://www.nhm.ac.uk/discover/dino-directory/yinlong.html')

print(new_soup.find('dl', class_='dinosaur--info dinosaur--list').findAll('dd')[1].text.strip())

Mid Jurassic, 159-154 million years ago


In [53]:
new_soup = get_page_contents('https://www.nhm.ac.uk/discover/dino-directory/gallimimus.html')

try:
    if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[4].text.strip() == 'When it lived:':
        when = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[4].text.strip()
except IndexError:
    if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[1].text.strip() == 'When it lived:':
        when = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[1].text.strip()
except IndexError:
    when = ''

In [54]:
when

'Late Cretaceous, 74-70 million years ago'

In [67]:
living_period = []

#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[4].text.strip() == 'When it lived:':
            when = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[4].text.strip()
    except IndexError:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[1].text.strip() == 'When it lived:':
            when = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[1].text.strip()
    except IndexError:
        when = ''
    living_period.append(when)

100%|██████████| 309/309 [09:31<00:00,  1.85s/it]


In [69]:
living_area = []

#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[5].text.strip() == 'Found in:':
            where = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[5].text.strip()
    except IndexError:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[1].text.strip() == 'Found in:':
            where = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[1].text.strip()
    except IndexError:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[2].text.strip() == 'Found in:':
            where = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[2].text.strip()
    except IndexError:
        where = ''
    living_area.append(where)

100%|██████████| 309/309 [00:26<00:00, 11.79it/s]


In [72]:
living_area = [regex.sub("", area) for area in living_area]

In [73]:
living_area

['Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Mongolia',
 'Canada',
 'Canada',
 'Canada',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Portugal,USA',
 'Canada,USA',
 'Canada,USA',
 'Canada,USA',
 'Canada,USA',
 'Canada,USA',
 'Canada,USA',
 'Canada,USA',
 'Germany',
 'Germany',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'Argentina',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'England,Spain,United Kingdom',
 'USA',
 'England,United Kingdom,USA',
 'England,United Kingdom,USA',
 'England,United Kingdom,USA',
 'Engl

In [81]:
teeth = []

#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[1].text.strip() == 'Teeth:':
            teeth_info = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[1].text.strip()
        else:
            teeth_info = ''
    except IndexError:
        teeth_info = ''
    teeth.append(teeth_info)

100%|██████████| 309/309 [07:18<00:00,  1.42s/it]


In [83]:
food = []

#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[2].text.strip() == 'Food:':
            food_info = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[2].text.strip()
        else:
            food_info = ''
    except IndexError:
        food_info = ''
    food.append(food_info)

100%|██████████| 309/309 [00:24<00:00, 12.39it/s]


In [85]:
movement = []

#get living period test
for unique_url in tqdm(dinosaur_url):
    new_soup = get_page_contents(unique_url)
    
    try:
        if new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dt')[3].text.strip() == 'How it moved:':
            move = new_soup.find('dl', class_='dinosaur--info dinosaur--list').find_all('dd')[3].text.strip()
        else:
            move = ''
    except IndexError:
        move = ''
    movement.append(move)

100%|██████████| 309/309 [00:24<00:00, 12.51it/s]


In [87]:
df_dict = {'Reference': dinosaur_url, 'Name': dinosaur_name, 'Pronunciation': pronunciation,
          'Image_Reference': dino_img, 'Length': length, 'Type': dino_type, 'Diet': diet_type,
           'Period': living_period, 'Area': living_area, 'Taxonomy': taxonomy, 'Named_By': named_by,
           'Species': species_type, 'Teeth': teeth, 'Food': food, 'Movement': movement
          }

In [88]:
df = pd.DataFrame(df_dict)

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Reference        309 non-null    object
 1   Name             309 non-null    object
 2   Pronunciation    309 non-null    object
 3   Image_Reference  309 non-null    object
 4   Length           309 non-null    object
 5   Type             309 non-null    object
 6   Diet             309 non-null    object
 7   Period           309 non-null    object
 8   Area             309 non-null    object
 9   Taxonomy         309 non-null    object
 10  Named_By         309 non-null    object
 11  Species          309 non-null    object
 12  Teeth            309 non-null    object
 13  Food             309 non-null    object
 14  Movement         309 non-null    object
dtypes: object(15)
memory usage: 36.3+ KB


In [90]:
df.to_csv('dinosaur_data.csv')