# Fetch All Animal and Plant Links from Vietnamese Red List

This notebook fetches all species links from the Vietnamese Red List website across all categories (animals and plants), including all pages with pagination.

## 1. Import Required Libraries

Import necessary libraries for web scraping and data handling.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import List, Dict
import re

## 2. Define Target URL and Setup

Set up the base URL and configuration for scraping.

In [11]:
# Base URL for the Vietnamese Red List
BASE_URL = "http://vnredlist.vast.vn"

# List of all categories to scrape
target_urls = [
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/",
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-chim/",
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-bo-sat/",
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-luong-cu/",
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-ca-xuong/",
    "http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-ca-mang-tam/",

    "http://vnredlist.vast.vn/dong-vat/nganh-chan-khop/lop-con-trung/",
    "http://vnredlist.vast.vn/dong-vat/nganh-chan-khop/lop-hinh-nhen/",
    "http://vnredlist.vast.vn/dong-vat/nganh-chan-khop/lop-giap-xac-lon/",
    "http://vnredlist.vast.vn/dong-vat/nganh-chan-khop/lop-mieng-dot/",
    
    "http://vnredlist.vast.vn/dong-vat/nganh-than-mem/lop-than-mem-chan-bung/",
    "http://vnredlist.vast.vn/dong-vat/nganh-than-mem/lop-than-mem-hai-manh-vo/",
    "http://vnredlist.vast.vn/dong-vat/nganh-than-mem/lop-chan-dau/",

    "http://vnredlist.vast.vn/dong-vat/nganh-da-gai/lop-cau-gai/",
    "http://vnredlist.vast.vn/dong-vat/nganh-da-gai/lop-hai-sam/",

    "http://vnredlist.vast.vn/dong-vat/nganh-san-ho/lop-octocorallia/",
    "http://vnredlist.vast.vn/dong-vat/nganh-san-ho/lop-hexacorallia/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-moc-lan/lop-moc-lan/",
    "http://vnredlist.vast.vn/thuc-vat/nganh-moc-lan/lop-hanh/",
    
    "http://vnredlist.vast.vn/thuc-vat/nganh-thong/lop-thong/",
    "http://vnredlist.vast.vn/thuc-vat/nganh-thong/lop-tue/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-duong-xi/lop-duong-xi/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-thong-dat/lop-thong-dat/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-khuyet-la-thong/lop-khuyet-la-thong/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-rong-luc/lop-ulvophyceae/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-rong-nau/lop-phaeophyceae/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-rong-do/lop-bangiophyceae/",
    "http://vnredlist.vast.vn/thuc-vat/nganh-rong-do/lop-florideophyceae/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-reu-tan/lop-jungermanniopsida/",

    "http://vnredlist.vast.vn/thuc-vat/nganh-nam-nang/lop-nam-dia/",
    
    "http://vnredlist.vast.vn/thuc-vat/nganh-nam-dam/lop-nam-tan/",
    "http://vnredlist.vast.vn/thuc-vat/nganh-nam-dam/lop-nam-ngan-nhi/"
]

# Headers to mimic a browser request
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

print(f"Total categories to scrape: {len(target_urls)}")

Total categories to scrape: 32


## 3. Create Helper Functions

Define functions to fetch pages and extract animal links.

In [12]:
def fetch_page(url: str) -> BeautifulSoup:
    """
    Fetch a web page and return BeautifulSoup object.
    
    Args:
        url: The URL to fetch
    
    Returns:
        BeautifulSoup object of the page content
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        print(f"✓ Successfully fetched: {url}")
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching {url}: {e}")
        return None

In [13]:
def extract_animal_links(soup: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Extract animal links from a page.
    
    Args:
        soup: BeautifulSoup object of the page
    
    Returns:
        List of dictionaries containing species information
    """
    animals = []
    
    # Find all article elements or h2 headers that contain species links
    # The structure shows species links in h2 tags with anchor tags
    headers = soup.find_all('h2')
    
    for header in headers:
        link_tag = header.find('a')
        if link_tag and link_tag.get('href'):
            url = link_tag.get('href')
            
            # Only process species links (not navigation links)
            if url.startswith('http://vnredlist.vast.vn/') and url.count('/') > 3:
                scientific_name = link_tag.get_text(strip=True)
                
                # Try to get Vietnamese common name (usually in the next element)
                common_name = ""
                next_text = header.find_next_sibling()
                if next_text:
                    common_name = next_text.get_text(strip=True)
                
                animals.append({
                    'scientific_name': scientific_name,
                    'common_name_vi': common_name,
                    'url': url
                })
    
    return animals

In [14]:
def get_total_pages(soup: BeautifulSoup) -> int:
    """
    Determine the total number of pages from pagination.
    
    Args:
        soup: BeautifulSoup object of the first page
    
    Returns:
        Total number of pages
    """
    max_page = 1
    
    # Look for pagination links
    pagination_links = soup.find_all('a', href=True)
    
    for link in pagination_links:
        href = link.get('href', '')
        # Look for pattern like /page/2/, /page/3/, etc.
        match = re.search(r'/page/(\d+)/', href)
        if match:
            page_num = int(match.group(1))
            max_page = max(max_page, page_num)
    
    return max_page

In [15]:
def get_category_name(url: str) -> str:
    """
    Extract category name from URL for labeling.
    
    Args:
        url: The category URL
    
    Returns:
        A readable category name
    """
    # Extract the last part of the URL path
    parts = url.rstrip('/').split('/')
    if len(parts) >= 2:
        return f"{parts[-2]}/{parts[-1]}"
    return url

## 4. Process All Categories

Iterate through all target URLs and collect species links from each category.

In [16]:
# Store all species from all categories
all_species = []
category_stats = []

print(f"Starting to process {len(target_urls)} categories...\n")
print("=" * 80)

for category_idx, target_url in enumerate(target_urls, 1):
    category_name = get_category_name(target_url)
    print(f"\n[{category_idx}/{len(target_urls)}] Processing: {category_name}")
    print("-" * 80)
    
    # Fetch the first page of this category
    first_page_soup = fetch_page(target_url)
    
    if not first_page_soup:
        print(f"✗ Failed to fetch first page of {category_name}")
        category_stats.append({
            'category': category_name,
            'url': target_url,
            'total_pages': 0,
            'species_count': 0,
            'status': 'Failed'
        })
        time.sleep(2)  # Wait before next category
        continue
    
    # Determine total pages for this category
    total_pages = get_total_pages(first_page_soup)
    print(f"  Total pages in this category: {total_pages}")
    
    category_species = []
    
    # Process first page
    species_page1 = extract_animal_links(first_page_soup)
    category_species.extend(species_page1)
    print(f"  Page 1: Found {len(species_page1)} species")
    
    # Process remaining pages if any
    for page_num in range(2, total_pages + 1):
        page_url = f"{target_url}page/{page_num}/"
        
        # Wait 1-2 seconds between requests
        time.sleep(1.5)
        
        soup = fetch_page(page_url)
        if soup:
            species = extract_animal_links(soup)
            category_species.extend(species)
            print(f"  Page {page_num}: Found {len(species)} species")
        else:
            print(f"  Page {page_num}: Failed to fetch")
    
    # Add category information to each species
    for species in category_species:
        species['category'] = category_name
        species['category_url'] = target_url
    
    all_species.extend(category_species)
    
    # Record statistics
    category_stats.append({
        'category': category_name,
        'url': target_url,
        'total_pages': total_pages,
        'species_count': len(category_species),
        'status': 'Success'
    })
    
    print(f"  ✓ Total species in {category_name}: {len(category_species)}")
    
    # Wait 2 seconds before moving to next category
    if category_idx < len(target_urls):
        print(f"\n  Waiting 2 seconds before next category...")
        time.sleep(2)

print("\n" + "=" * 80)
print(f"✓ COMPLETE! Total species collected: {len(all_species)}")
print("=" * 80)

Starting to process 32 categories...


[1/32] Processing: dong-vat-co-day-song/lop-thu
--------------------------------------------------------------------------------
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/
  Total pages in this category: 8
  Page 1: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/
  Total pages in this category: 8
  Page 1: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/page/2/
  Page 2: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/page/2/
  Page 2: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/page/3/
  Page 3: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/page/3/
  Page 3: Found 16 species
✓ Successfully fetched: http://vnredlist.vast.

## 5. Display Results and Statistics

Show the collected data and statistics for each category.

In [17]:
# Create DataFrame from all species
df_all_species = pd.DataFrame(all_species)

# Create DataFrame for category statistics
df_stats = pd.DataFrame(category_stats)

print("=" * 80)
print("CATEGORY STATISTICS")
print("=" * 80)
print(df_stats.to_string(index=False))

print("\n" + "=" * 80)
print("SPECIES DATA PREVIEW")
print("=" * 80)

# Display first 10 species
print("\nFirst 10 species:")
print(df_all_species.head(10).to_string(index=False))

# Display last 10 species
print("\nLast 10 species:")
print(df_all_species.tail(10).to_string(index=False))

# Display overall statistics
print("\n" + "=" * 80)
print("OVERALL STATISTICS")
print("=" * 80)
print(f"Total categories processed: {len(category_stats)}")
print(f"Total species collected: {len(df_all_species)}")
print(f"Columns: {df_all_species.columns.tolist()}")
print(f"Successful categories: {df_stats[df_stats['status'] == 'Success'].shape[0]}")
print(f"Failed categories: {df_stats[df_stats['status'] == 'Failed'].shape[0]}")

CATEGORY STATISTICS
                                 category                                                                          url  total_pages  species_count  status
             dong-vat-co-day-song/lop-thu              http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-thu/            8            121 Success
            dong-vat-co-day-song/lop-chim             http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-chim/            7            112 Success
          dong-vat-co-day-song/lop-bo-sat           http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-bo-sat/            7            103 Success
        dong-vat-co-day-song/lop-luong-cu         http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-luong-cu/            5             67 Success
        dong-vat-co-day-song/lop-ca-xuong         http://vnredlist.vast.vn/dong-vat/dong-vat-co-day-song/lop-ca-xuong/            5             73 Success
     dong-vat-co-day-song/lop-ca-mang-tam      htt

## 6. Save Results to CSV Files

Export the collected data to CSV files.

In [18]:
# Save all species to CSV
species_filename = 'vnredlist_all_species_links.csv'
df_all_species.to_csv(species_filename, index=False, encoding='utf-8-sig')
print(f"✓ All species data saved to: {species_filename}")
print(f"  Total records: {len(df_all_species)}")

# Save category statistics to CSV
stats_filename = 'vnredlist_category_statistics.csv'
df_stats.to_csv(stats_filename, index=False, encoding='utf-8-sig')
print(f"\n✓ Category statistics saved to: {stats_filename}")
print(f"  Total categories: {len(df_stats)}")

✓ All species data saved to: vnredlist_all_species_links.csv
  Total records: 1357

✓ Category statistics saved to: vnredlist_category_statistics.csv
  Total categories: 32


## 7. Optional: Display Sample Links by Category

Show sample links from different categories to verify the data.

In [None]:
# Display sample species from different categories
print("\nSample species from different categories:")
print("=" * 80)

# Get unique categories
unique_categories = df_all_species['category'].unique()

# Show 2 species from each of the first 5 categories
for i, category in enumerate(unique_categories[:5], 1):
    print(f"\n{i}. Category: {category}")
    print("-" * 80)
    category_species = df_all_species[df_all_species['category'] == category]
    
    for j, (idx, species) in enumerate(category_species.head(2).iterrows(), 1):
        print(f"  {j}. {species['scientific_name']}")
        print(f"     Vietnamese name: {species['common_name_vi']}")
        print(f"     URL: {species['url']}")
        if j < 2:
            print()