# Extract Detailed Species Information from Vietnamese Red List

This notebook fetches detailed information for each species from their individual pages on the Vietnamese Red List website.

## 1. Import Required Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from typing import Dict, Optional, List
import re



## 2. Configuration

In [2]:
# Headers to mimic a browser request
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Input file containing species links
INPUT_CSV = 'vnredlist_all_species_links.csv'

# Output file for detailed species data
OUTPUT_JSON = 'vnredlist_species_details.json'

## 3. Helper Functions to Extract Data

In [3]:
def fetch_species_page(url: str) -> Optional[BeautifulSoup]:
    """
    Fetch a species page and return BeautifulSoup object.
    
    Args:
        url: The species page URL
    
    Returns:
        BeautifulSoup object or None if failed
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching {url}: {e}")
        return None

In [4]:
def extract_text_after_heading(soup: BeautifulSoup, heading_text: str) -> str:
    """
    Extract text content after a specific heading.
    
    Args:
        soup: BeautifulSoup object
        heading_text: The heading text to search for
    
    Returns:
        Extracted text or empty string
    """
    # Find the heading (h3, h4, etc.)
    headings = soup.find_all(['h3', 'h4'])
    
    for heading in headings:
        heading_content = heading.get_text(strip=True)
        if heading_text.lower() in heading_content.lower():
            # Get the next sibling element
            next_elem = heading.find_next_sibling()
            if next_elem:
                # Get text with proper spacing
                text = next_elem.get_text(separator=' ', strip=True)
                # Clean up multiple spaces
                text = ' '.join(text.split())
                return text
    
    return ""

In [5]:
def debug_page_structure(soup: BeautifulSoup):
    """
    Debug function to inspect the page structure.
    """
    print("\n=== Page Structure Analysis ===")
    
    # Find all h3 and h4 headings
    print("\nAll H3/H4 headings found:")
    for heading in soup.find_all(['h3', 'h4'])[:20]:  # Limit to first 20
        text = heading.get_text(strip=True)
        next_elem = heading.find_next_sibling()
        next_text = next_elem.get_text(strip=True)[:100] if next_elem else "None"
        print(f"  • {text}")
        print(f"    → {next_text}")
    
    print("\n=== End of Structure Analysis ===")

In [6]:
def extract_taxonomic_info(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Extract taxonomic classification information.
    
    Returns:
        Dictionary with taxonomic ranks
    """
    taxonomy = {
        'kingdom_latin': '',
        'kingdom_vi': '',
        'phylum_latin': '',
        'phylum_vi': '',
        'class_latin': '',
        'class_vi': '',
        'order_latin': '',
        'order_vi': '',
        'family_latin': '',
        'family_vi': ''
    }
    
    # Map Vietnamese headings to taxonomy keys (exact matches)
    taxonomy_map = {
        'Giới': 'kingdom',
        'Ngành': 'phylum',
        'Lớp': 'class',
        'Bộ': 'order',
        'Họ': 'family'
    }
    
    # Find the taxonomy section (usually under "Phân loại")
    headings = soup.find_all(['h3', 'h4'])
    
    for heading in headings:
        heading_text = heading.get_text(strip=True)
        
        # Check if this heading matches one of our taxonomy levels
        for vn_name, en_key in taxonomy_map.items():
            if heading_text == vn_name:  # Exact match only
                # Get the next sibling element
                next_elem = heading.find_next_sibling()
                if next_elem:
                    value = next_elem.get_text(separator=' ', strip=True)
                    value = ' '.join(value.split())  # Clean multiple spaces
                    taxonomy[f'{en_key}_latin'] = value.upper()
                break
    
    return taxonomy

In [7]:
def extract_species_details(url: str, soup: BeautifulSoup) -> Dict:
    """
    Extract all species details from a species page.
    
    Args:
        url: The species page URL
        soup: BeautifulSoup object of the page
    
    Returns:
        Dictionary with species information
    """
    species_data = {
        'scientific_name': {'value': '', 'note': ''},
        'common_name': {'value': '', 'note': ''},
        'kingdom_latin': '',
        'kingdom_vi': '',
        'phylum_latin': '',
        'phylum_vi': '',
        'class_latin': '',
        'class_vi': '',
        'order_latin': '',
        'order_vi': '',
        'family_latin': '',
        'family_vi': '',
        'note': '',
        'laws': []
    }
    
    # Extract scientific name (from page title or heading)
    title = soup.find('h1')
    if title:
        sci_name = title.get_text(strip=True)
        # Clean scientific name - remove author and year if present in title
        # Keep only the first 2-3 words (genus, species, and possibly subspecies)
        parts = sci_name.split()
        if len(parts) >= 2:
            # Check if third word starts with lowercase (subspecies) or uppercase (author)
            if len(parts) >= 3 and parts[2][0].islower():
                sci_name = ' '.join(parts[:3])
            else:
                sci_name = ' '.join(parts[:2])
        species_data['scientific_name']['value'] = sci_name
    
    # Extract Vietnamese common name
    common_name = extract_text_after_heading(soup, 'Tên việt nam')
    if common_name:
        species_data['common_name']['value'] = common_name
    
    # Extract taxonomic information
    taxonomy = extract_taxonomic_info(soup)
    species_data.update(taxonomy)
    
    # Extract conservation status from Vietnamese Red List
    # Try multiple possible headings
    status = extract_text_after_heading(soup, 'Phân hạng bảo tồn')
    if not status:
        status = extract_text_after_heading(soup, 'Phân hạng')
    
    # Extract just the status code (CR, EN, VU, etc.)
    if status:
        # Look for status codes in the text
        status_match = re.search(r'\b(CR|EN|VU|NT|LC|DD|EW|EX|NE)\b', status)
        if status_match:
            status_code = status_match.group(1)
            species_data['laws'].append({
                'name': {
                    'vi': 'Danh lục Đỏ Việt Nam',
                    'en': 'Vietnam Red List'
                },
                'value': status_code,
                'note': url
            })
    
    return species_data

## 4. Load Species Links from CSV

In [8]:
# Load the CSV file with species links
try:
    df_species = pd.read_csv(INPUT_CSV)
    print(f"✓ Loaded {len(df_species)} species links from {INPUT_CSV}")
    print(f"Columns: {df_species.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df_species.head())
except FileNotFoundError:
    print(f"✗ File {INPUT_CSV} not found. Please run the fetch_animal_links.ipynb notebook first.")
    df_species = pd.DataFrame()

✓ Loaded 1357 species links from vnredlist_all_species_links.csv
Columns: ['scientific_name', 'common_name_vi', 'url', 'category', 'category_url']

First few rows:
                scientific_name  common_name_vi  \
0               Elephas maximus      Voi châu á   
1        Galeopterus variegatus        Chồn dơi   
2        Nycticebus bengalensis       Cu li lớn   
3  Xanthonycticebus intermedius  Cu li miền bắc   
4     Xanthonycticebus pygmaeus       Cu li nhỏ   

                                                 url  \
0          http://vnredlist.vast.vn/elephas-maximus/   
1   http://vnredlist.vast.vn/galeopterus-variegatus/   
2   http://vnredlist.vast.vn/nycticebus-bengalensis/   
3  http://vnredlist.vast.vn/xanthonycticebus-inte...   
4  http://vnredlist.vast.vn/xanthonycticebus-pygm...   

                       category  \
0  dong-vat-co-day-song/lop-thu   
1  dong-vat-co-day-song/lop-thu   
2  dong-vat-co-day-song/lop-thu   
3  dong-vat-co-day-song/lop-thu   
4  dong-vat-co-da

## 5. Test with a Single Species

In [9]:
# Test with the first species
if not df_species.empty:
    test_url = df_species.iloc[3]['url']
    print(f"Testing with: {test_url}")
    print("=" * 80)
    
    test_soup = fetch_species_page(test_url)
    if test_soup:
        # Extract data
        test_data = extract_species_details(test_url, test_soup)
        print("\nExtracted data:")
        print(json.dumps(test_data, indent=2, ensure_ascii=False))
        
        # Verify the extraction
        print("\n" + "=" * 80)
        print("Verification:")
        print(f"✓ Scientific name: {test_data['scientific_name']['value']}")
        print(f"✓ Common name: {test_data['common_name']['value']}")
        print(f"✓ Kingdom: {test_data['kingdom_latin']}")
        print(f"✓ Phylum: {test_data['phylum_latin']}")
        print(f"✓ Class: {test_data['class_latin']}")
        print(f"✓ Order: {test_data['order_latin']}")
        print(f"✓ Family: {test_data['family_latin']}")
        if test_data['laws']:
            print(f"✓ Conservation status: {test_data['laws'][0]['value']}")
        print("=" * 80)
    else:
        print("Failed to fetch test page")

Testing with: http://vnredlist.vast.vn/xanthonycticebus-intermedius/

Extracted data:
{
  "scientific_name": {
    "value": "Xanthonycticebus intermedius",
    "note": ""
  },
  "common_name": {
    "value": "Cu li miền bắc",
    "note": ""
  },
  "kingdom_latin": "ANIMALIA",
  "kingdom_vi": "",
  "phylum_latin": "CHORDATA",
  "phylum_vi": "",
  "class_latin": "MAMMALIA",
  "class_vi": "",
  "order_latin": "PRIMATES",
  "order_vi": "",
  "family_latin": "LORISIDAE",
  "family_vi": "",
  "note": "",
  "laws": [
    {
      "name": {
        "vi": "Danh lục Đỏ Việt Nam",
        "en": "Vietnam Red List"
      },
      "value": "EN",
      "note": "http://vnredlist.vast.vn/xanthonycticebus-intermedius/"
    }
  ]
}

Verification:
✓ Scientific name: Xanthonycticebus intermedius
✓ Common name: Cu li miền bắc
✓ Kingdom: ANIMALIA
✓ Phylum: CHORDATA
✓ Class: MAMMALIA
✓ Order: PRIMATES
✓ Family: LORISIDAE
✓ Conservation status: EN


## 6. Process All Species

Extract detailed information for all species with proper delays between requests.

In [None]:
all_species_details = []
failed_urls = []

if not df_species.empty:
    total_species = len(df_species)
    print(f"Starting to process {total_species} species...")
    print("=" * 80)
    
    for idx, row in df_species.iterrows():
        species_url = row['url']
        scientific_name = row.get('scientific_name', 'Unknown')
        
        # Progress indicator
        if (idx + 1) % 10 == 0 or idx == 0:
            print(f"\n[{idx + 1}/{total_species}] Processing: {scientific_name}")
        
        # Fetch the species page
        soup = fetch_species_page(species_url)
        
        if soup:
            try:
                # Extract species details
                species_data = extract_species_details(species_url, soup)
                all_species_details.append(species_data)
                
                if (idx + 1) % 10 == 0:
                    print(f"  ✓ Successfully extracted data")
            except Exception as e:
                print(f"  ✗ Error extracting data from {species_url}: {e}")
                failed_urls.append(species_url)
        else:
            print(f"  ✗ Failed to fetch: {species_url}")
            failed_urls.append(species_url)
        
        # Wait 1-2 seconds between requests
        time.sleep(2)
        
        # Save intermediate results every 10 species
        if (idx + 1) % 10 == 0:
            intermediate_file = f'vnredlist_status.json'
            with open(intermediate_file, 'w', encoding='utf-8') as f:
                json.dump(all_species_details, f, ensure_ascii=False, indent=2)
            print(f"\n  💾 Saved intermediate results to {intermediate_file}")
    
    print("\n" + "=" * 80)
    print(f"✓ Processing complete!")
    print(f"  Successfully extracted: {len(all_species_details)} species")
    print(f"  Failed: {len(failed_urls)} species")
    print("=" * 80)
else:
    print("No species data to process.")

Starting to process 1357 species...

[1/1357] Processing: Elephas maximus

[10/1357] Processing: Macaca mulatta
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[20/1357] Processing: Trachypithecus margarita
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[30/1357] Processing: Neohylomys hainanensis
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[40/1357] Processing: Rhinolophus rex
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[50/1357] Processing: Harpiola isodon
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[60/1357] Processing: Scotomanes ornatus
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[70/1357] Processing: Neofelis nebulosa
  ✓ Successfully extracted data

  💾 Saved intermediate results to vnredlist_status.json

[80/1357]

## 7. Save Results to JSON

In [None]:
if all_species_details:
    # Save to JSON file
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(all_species_details, f, ensure_ascii=False, indent=2)
    
    print(f"✓ All species details saved to: {OUTPUT_JSON}")
    print(f"  Total records: {len(all_species_details)}")
    
    # Save failed URLs for retry
    if failed_urls:
        failed_file = 'failed_species_urls.txt'
        with open(failed_file, 'w') as f:
            f.write('\n'.join(failed_urls))
        print(f"\n✓ Failed URLs saved to: {failed_file}")
        print(f"  Total failed: {len(failed_urls)}")
else:
    print("No data to save.")

## 8. Display Sample Results

In [None]:
if all_species_details:
    print("\nSample extracted species data:")
    print("=" * 80)
    
    # Show first 3 species
    for i, species in enumerate(all_species_details[:3], 1):
        print(f"\n{i}. {species['scientific_name']['value']}")
        print(f"   Common name: {species['common_name']['value']}")
        print(f"   Kingdom: {species['kingdom_latin']}")
        print(f"   Phylum: {species['phylum_latin']}")
        print(f"   Class: {species['class_latin']}")
        print(f"   Order: {species['order_latin']}")
        print(f"   Family: {species['family_latin']}")
        if species['laws']:
            print(f"   Conservation Status: {species['laws'][0]['value']}")
        print()

## 9. Statistics Summary

In [None]:
if all_species_details:
    print("\nExtraction Statistics:")
    print("=" * 80)
    
    # Count species with complete taxonomic information
    complete_taxonomy = sum(1 for s in all_species_details 
                           if s['kingdom_latin'] and s['phylum_latin'] 
                           and s['class_latin'] and s['order_latin'] 
                           and s['family_latin'])
    
    # Count species with conservation status
    with_status = sum(1 for s in all_species_details if s['laws'])
    
    # Count species with common names
    with_common_name = sum(1 for s in all_species_details 
                          if s['common_name']['value'])
    
    print(f"Total species extracted: {len(all_species_details)}")
    print(f"Species with complete taxonomy: {complete_taxonomy} ({complete_taxonomy/len(all_species_details)*100:.1f}%)")
    print(f"Species with conservation status: {with_status} ({with_status/len(all_species_details)*100:.1f}%)")
    print(f"Species with common names: {with_common_name} ({with_common_name/len(all_species_details)*100:.1f}%)")
    print(f"Failed extractions: {len(failed_urls)}")
    print("=" * 80)

In [16]:
len(all_species_details)

1355

In [15]:
# Find missing species details
all_scientific_names = set(df_species['scientific_name'].dropna().str.strip())
extracted_scientific_names = set(s['scientific_name']['value'] for s in all_species_details if s['scientific_name']['value'])
missing_species = all_scientific_names - extracted_scientific_names
print(f"Total species in original list: {len(all_scientific_names)}")
print(f"Total species extracted: {len(extracted_scientific_names)}")
print(f"Total missing species: {len(missing_species)}")

Total species in original list: 1356
Total species extracted: 1354
Total missing species: 5


In [14]:
# Run the code on the failed URLs to retry fetching them
failed_species_details = []
if failed_urls:
    print(f"\nRetrying {len(failed_urls)} failed URLs...")
    print("=" * 80)
    
    for idx, species_url in enumerate(failed_urls):
        print(f"\nRetrying [{idx + 1}/{len(failed_urls)}]: {species_url}")
        
        # Fetch the species page
        soup = fetch_species_page(species_url)
        
        if soup:
            try:
                # Extract species details
                species_data = extract_species_details(species_url, soup)
                failed_species_details.append(species_data)
                print(f"  ✓ Successfully extracted data on retry")
            except Exception as e:
                print(f"  ✗ Error extracting data from {species_url} on retry: {e}")
        else:
            print(f"  ✗ Failed to fetch again: {species_url}")
        
        # Wait 1-2 seconds between requests
        time.sleep(2)
    
    print("\n" + "=" * 80)
    print("Retry complete.")
    print(f"Total records after retry: {len(failed_species_details)}")
    
    # Save updated results
    with open("vnredlist_status_failed.json", 'w', encoding='utf-8') as f:
        json.dump(failed_species_details, f, ensure_ascii=False, indent=2)



Retrying 2 failed URLs...

Retrying [1/2]: http://vnredlist.vast.vn/heliotropium-arboreum-blanco-mabb-2017/
  ✓ Successfully extracted data on retry

Retrying [2/2]: http://vnredlist.vast.vn/dioscorea-membranacea-craib-1914/
  ✓ Successfully extracted data on retry

Retry complete.
Total records after retry: 2


In [17]:
# Save updated results
with open("vnredlist_status_new.json", 'w', encoding='utf-8') as f:
    json.dump(all_species_details, f, ensure_ascii=False, indent=2)