# BioPortal Data Extraction Notebook

This notebook demonstrates how to extract and work with biomedical ontology data from the BioPortal API. BioPortal is the world's most comprehensive repository of biomedical ontologies and provides REST APIs for accessing ontological data.

## BioPortal API Overview
- **Base URL**: https://data.bioontology.org/
- **Authentication**: API key required (free registration)
- **Rate Limiting**: Reasonable usage recommended
- **Coverage**: 800+ ontologies including SNOMED CT, Gene Ontology, ICD, UMLS

## What we'll cover:
1. API setup and authentication
2. Browsing available ontologies
3. Extracting concept/class data
4. Searching and mapping terms
5. Working with hierarchical relationships
6. Annotations and metadata extraction
7. Integration with Knowledge Fabric

## 1. Import Required Libraries

In [None]:
# Essential libraries for BioPortal API interaction
import requests
import pandas as pd
import json
import time
from typing import Dict, List, Optional, Any, Union
from urllib.parse import urljoin, quote, unquote
import warnings
warnings.filterwarnings('ignore')

# For data visualization and analysis
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np
    print("Visualization libraries loaded successfully")
except ImportError:
    print("Visualization libraries not available - install matplotlib, seaborn, numpy for plots")

# For working with nested JSON data
try:
    from collections import defaultdict, Counter
    import xml.etree.ElementTree as ET
    print("Data processing libraries loaded")
except ImportError:
    print("Some data processing libraries not available")

print("All core libraries imported successfully!")

## 2. BioPortal API Configuration

In [None]:
# BioPortal API Configuration
import os

class BioPortalConfig:
    """Settings used by this notebook's BioPortal REST calls.

    Everything is a plain class-level value. ``API_KEY`` may be overridden on
    an instance, and ``headers`` reflects whichever key is set when it is read.
    """

    # Root URL that request endpoints are joined onto
    BASE_URL = "https://data.bioontology.org"

    # Personal API key. Create a free account at
    # https://bioportal.bioontology.org/login and copy the key from the
    # account profile. Set it here, or leave it as None and assign it later
    # (for example from an environment variable).
    API_KEY = None

    # Pause between requests, in seconds - be respectful to the API
    REQUEST_DELAY = 0.5

    # Query parameters that are common to the notebook's requests
    DEFAULT_PARAMS = dict(
        format="json",
        display_links="false",
        display_context="false",
    )

    @property
    def headers(self):
        """Return the HTTP request headers, built fresh on every access.

        An ``Authorization`` entry is included only when ``API_KEY`` is truthy.
        """
        base_headers = {
            "User-Agent": "KnowledgeFabric/1.0 BioPortal Integration",
            "Accept": "application/json",
            "Content-Type": "application/json",
        }
        if not self.API_KEY:
            return base_headers
        return {**base_headers, "Authorization": f"apikey token={self.API_KEY}"}

# Create the configuration object that the request helpers read from
config = BioPortalConfig()

# When no key was set on the class, fall back to the environment variable
if not config.API_KEY:
    config.API_KEY = os.getenv('BIOPORTAL_API_KEY')

# Alternatively, hard-code the key here (not recommended for production):
# config.API_KEY = "your-api-key-here"

# Summarise the effective settings
key_status = 'Yes' if config.API_KEY else 'No (required for most endpoints)'
print(f"BioPortal API Base URL: {config.BASE_URL}")
print(f"API Key configured: {key_status}")
print(f"Request delay: {config.REQUEST_DELAY} seconds")

# Without a key, explain how to obtain one
if not config.API_KEY:
    for help_line in (
        "\n⚠️  WARNING: No API key found!",
        "To get an API key:",
        "1. Go to https://bioportal.bioontology.org/login",
        "2. Create a free account",
        "3. Get your API key from your profile",
        "4. Set it in the cell above or as environment variable BIOPORTAL_API_KEY",
    ):
        print(help_line)

## 3. Core API Request Functions

In [None]:
def make_bioportal_request(endpoint: str, params: Optional[Dict] = None, delay: bool = True,
                           timeout: float = 60.0) -> Optional[Any]:
    """
    Make a GET request to the BioPortal API with error handling and rate limiting.

    Args:
        endpoint: API endpoint relative to config.BASE_URL
            (e.g., 'ontologies', 'ontologies/SNOMEDCT/classes')
        params: Query parameters; they override config.DEFAULT_PARAMS on key
            clashes. The dictionary passed in is left unmodified.
        delay: Whether to sleep config.REQUEST_DELAY seconds before the request
        timeout: Seconds to wait on the server before the request is abandoned

    Returns:
        The decoded JSON body (a dict or a list, depending on the endpoint),
        or None when the request fails; failures are reported with print().
    """
    # Merge into a fresh dict so the caller's params are never mutated
    final_params = {**config.DEFAULT_PARAMS, **(params or {})}

    # Add API key to the query string if available
    if config.API_KEY:
        final_params['apikey'] = config.API_KEY

    # Build URL
    url = urljoin(config.BASE_URL, endpoint)

    # Rate limiting
    if delay:
        time.sleep(config.REQUEST_DELAY)

    try:
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(url, headers=config.headers, params=final_params, timeout=timeout)

        # Check for API key issues
        if response.status_code == 401:
            print("❌ Authentication failed. Check your API key.")
            return None
        if response.status_code == 403:
            print("❌ Forbidden. Check API key permissions.")
            return None

        response.raise_for_status()

    except requests.exceptions.RequestException as e:
        # The exception text can contain the full request URL, query string
        # included, so mask the key before it lands in saved notebook output
        detail = str(e)
        if config.API_KEY:
            detail = detail.replace(str(config.API_KEY), "***")
        print(f"Error making request to {url}: {detail}")

        failed_response = getattr(e, 'response', None)
        if failed_response is not None:
            print(f"Status code: {failed_response.status_code}")
            print(f"Response: {failed_response.text[:200]}")
        return None

    try:
        return response.json()
    except ValueError:
        # Depending on the installed requests version, a JSON decode failure
        # may not be a RequestException, but it is always a ValueError
        print(f"Error making request to {url}: response body is not valid JSON")
        return None

def test_bioportal_connection():
    """
    Check that the BioPortal API answers by requesting a one-item ontology list.

    Returns:
        True when the reply is a non-empty list, False in every other case.
        The outcome is also printed.
    """
    print("Testing BioPortal API connection...")

    # A single-item ontology listing serves as the probe request
    reply = make_bioportal_request("ontologies", {"pagesize": 1})

    # None or an empty payload: nothing usable came back
    if not reply:
        print("❌ API connection failed!")
        return False

    if isinstance(reply, list) and len(reply) > 0:
        print("✅ API connection successful!")
        first_ontology = reply[0]
        print(f"Sample ontology: {first_ontology.get('name', 'Unknown')} ({first_ontology.get('acronym', 'N/A')})")
        return True

    # Some other payload that mentions an error somewhere in its text form
    if 'error' in str(reply):
        print(f"❌ API error: {reply}")
        return False

    print("❌ API connection failed!")
    return False

# Test the connection now: prints the outcome and evaluates to True/False
test_bioportal_connection()

## 4. Ontology Discovery and Metadata

In [None]:
def _entry_field(entries: Any, field: str, index: int = 0) -> Optional[Any]:
    """Return ``field`` from ``entries[index]``, or None when that entry is not a usable dict."""
    if not isinstance(entries, list) or not entries:
        return None
    entry = entries[index]
    return entry.get(field) if isinstance(entry, dict) else None


def _ontology_record(onto: Dict) -> Dict:
    """Flatten one ontology object from the API response into a single table row."""
    # Only mark the description as truncated when something was actually cut
    description = onto.get("description") or ""
    if len(description) > 200:
        description = description[:200] + "..."

    administrivia = onto.get("administrivia")
    submissions = onto.get("submissions") or []
    categories = onto.get("categories") or []
    contacts = onto.get("contact")

    return {
        "acronym": onto.get("acronym"),
        "name": onto.get("name"),
        "description": description,
        "status": administrivia.get("status") if isinstance(administrivia, dict) else None,
        "group": _entry_field(onto.get("group"), "name"),
        "domain": _entry_field(onto.get("hasDomain"), "name"),
        "language": _entry_field(onto.get("naturalLanguage"), "name"),
        "format": onto.get("format"),
        "submissions_count": len(submissions),
        # NOTE(review): this column is filled from "viewingRestriction", which
        # does not look like a count; the name is kept for compatibility
        "views_count": onto.get("viewingRestriction"),
        "latest_submission": _entry_field(submissions, "submissionId", index=-1),
        "categories": [cat.get("name") for cat in categories if isinstance(cat, dict)],
        "homepage": onto.get("homepage"),
        "contact_name": _entry_field(contacts, "name"),
        "contact_email": _entry_field(contacts, "email"),
    }


def get_all_ontologies() -> pd.DataFrame:
    """
    Retrieve all available ontologies from BioPortal.

    Nested fields that are missing, null, or not shaped as a list of
    dictionaries yield None (or an empty list for ``categories``) instead of
    raising, so one irregular record does not abort the whole listing.

    Returns:
        DataFrame with one row of metadata per ontology; empty when the
        request fails or the response is not a list.
    """
    print("Fetching all available ontologies...")

    response = make_bioportal_request("ontologies")

    # Anything other than a non-empty list (None, an error object) is a failure
    if not response or not isinstance(response, list):
        print("Failed to fetch ontologies")
        return pd.DataFrame()

    ontologies_data = [_ontology_record(onto) for onto in response if isinstance(onto, dict)]

    df = pd.DataFrame(ontologies_data)
    print(f"Retrieved {len(df)} ontologies")

    return df

def get_ontology_details(acronym: str) -> Dict:
    """
    Fetch the record BioPortal returns for a single ontology.

    Args:
        acronym: Ontology acronym (e.g., 'SNOMEDCT', 'GO', 'ICD10CM')

    Returns:
        The API response for ``ontologies/<acronym>``, or an empty dict
        when the request fails or returns nothing
    """
    print(f"Fetching details for ontology: {acronym}")

    details = make_bioportal_request(f"ontologies/{acronym}")
    if details:
        return details

    print(f"Failed to fetch details for {acronym}")
    return {}

def get_ontology_metrics(acronym: str) -> Dict:
    """
    Fetch the metrics BioPortal reports for a single ontology.

    Args:
        acronym: Ontology acronym

    Returns:
        The API response for ``ontologies/<acronym>/metrics``, or an empty
        dict when the request fails or returns nothing
    """
    print(f"Fetching metrics for ontology: {acronym}")

    metrics = make_bioportal_request(f"ontologies/{acronym}/metrics")
    if metrics:
        return metrics

    print(f"Failed to fetch metrics for {acronym}")
    return {}

# Example 1: Get all ontologies
print("Example 1: Fetching all ontologies...")
all_ontologies = get_all_ontologies()

if not all_ontologies.empty:
    # Sort before taking the head so the listing really is ordered by name,
    # as the heading promises (the API order is otherwise used as-is)
    print(f"\nTop 10 ontologies by name:")
    print(
        all_ontologies.sort_values("name")[
            ["acronym", "name", "group", "format", "submissions_count"]
        ].head(10)
    )

    # Show some popular biomedical ontologies.
    # The keywords are matched as case-insensitive substrings, so short ones
    # such as 'GO' also hit unrelated names that merely contain those letters.
    biomedical_keywords = ['SNOMED', 'ICD', 'UMLS', 'GO', 'LOINC', 'MESH', 'HPO']
    keyword_pattern = '|'.join(biomedical_keywords)  # built once, used for both columns
    biomedical_ontos = all_ontologies[
        all_ontologies['acronym'].str.contains(keyword_pattern, na=False, case=False) |
        all_ontologies['name'].str.contains(keyword_pattern, na=False, case=False)
    ]

    print(f"\nPopular biomedical ontologies found ({len(biomedical_ontos)}):")
    print(biomedical_ontos[["acronym", "name", "status"]].head())
else:
    print("No ontologies retrieved. Check your API key configuration.")

## 5. Extracting Classes/Concepts from Ontologies

In [None]:
def get_ontology_classes(acronym: str, limit: int = 100, page: int = 1) -> pd.DataFrame:
    """
    Fetch one page of classes/concepts from an ontology as a table.

    Args:
        acronym: Ontology acronym
        limit: Page size sent to the API (classes per page)
        page: Page number to request

    Returns:
        DataFrame with one row per class. The columns ``has_children``,
        ``parents_count`` and ``children_count`` are filled only for classes
        whose ``properties`` entry is non-empty. An empty DataFrame is
        returned when the request fails or has no ``collection`` key.
    """
    print(f"Fetching classes from ontology: {acronym} (page {page}, limit {limit})")

    query = dict(
        pagesize=limit,
        page=page,
        display="prefLabel,definition,synonym,properties",
    )
    payload = make_bioportal_request(f"ontologies/{acronym}/classes", query)

    if not payload or 'collection' not in payload:
        print(f"Failed to fetch classes for {acronym}")
        return pd.DataFrame()

    rows = []
    for item in payload['collection']:
        definitions = item.get("definition")

        # Core fields of the class
        row = {
            "class_id": item.get("@id"),
            "pref_label": item.get("prefLabel"),
            "definition": definitions[0] if definitions else None,
            "synonyms": item.get("synonym", []),
            "cui": item.get("cui"),  # UMLS CUI if available
            "notation": item.get("notation"),
            "type": item.get("@type"),
            "obsolete": item.get("obsolete", False),
        }

        # Hierarchy hints, added only when a properties object is present
        props = item.get("properties", {})
        if props:
            row["has_children"] = props.get("hasChildren", False)
            row["parents_count"] = len(props.get("parents", []))
            row["children_count"] = len(props.get("children", []))

        rows.append(row)

    df = pd.DataFrame(rows)
    print(f"Retrieved {len(df)} classes from {acronym}")

    return df

def get_class_details(acronym: str, class_id: str) -> Dict:
    """
    Fetch the record for a single class/concept of an ontology.

    Args:
        acronym: Ontology acronym
        class_id: Class identifier (e.g. the full class IRI). It is always
            percent-encoded here with no characters treated as safe, so a
            value that is already encoded gets encoded a second time.

    Returns:
        The API response for the class, or an empty dict when the request
        fails or returns nothing
    """
    # Encode every reserved character so the ID fits in one path segment
    path_id = quote(class_id, safe='')

    print(f"Fetching details for class: {class_id}")

    details = make_bioportal_request(f"ontologies/{acronym}/classes/{path_id}")
    if details:
        return details

    print(f"Failed to fetch details for class {class_id}")
    return {}

def get_class_hierarchy(acronym: str, class_id: str) -> Dict:
    """
    Fetch the direct hierarchy (parents and children) of a class.

    Two requests are made, parents first and then children.

    Args:
        acronym: Ontology acronym
        class_id: Class identifier (e.g. the full class IRI); it is
            percent-encoded here with no characters treated as safe

    Returns:
        Dictionary with the keys "parents" and "children", each holding the
        raw API response, or an empty list when that request fails or
        returns nothing
    """
    class_path = f"ontologies/{acronym}/classes/{quote(class_id, safe='')}"

    parents = make_bioportal_request(f"{class_path}/parents")
    children = make_bioportal_request(f"{class_path}/children")

    return {
        "parents": parents or [],
        "children": children or [],
    }

def search_classes(query: str, ontologies: Optional[List[str]] = None, limit: int = 50) -> pd.DataFrame:
    """
    Search for classes across ontologies.

    Args:
        query: Search query string
        ontologies: List of ontology acronyms to search in (None or empty for all)
        limit: Maximum number of results (sent as the page size)

    Returns:
        DataFrame with one row per search hit; empty when the request fails
        or the response has no ``collection`` key. The ``ontology`` column is
        the last path segment of the hit's ontology link.
    """
    print(f"Searching for: '{query}' (limit: {limit})")

    params = {
        "q": query,
        "pagesize": limit,
        "suggest": "true",
        # The "ontology" column below is read from each hit's "links" object.
        # The notebook-wide default is display_links=false, which would leave
        # that column permanently empty, so links are re-enabled for search.
        "display_links": "true",
    }
    # Only send the filter when there is one, instead of an empty value
    if ontologies:
        params["ontologies"] = ",".join(ontologies)

    response = make_bioportal_request("search", params)

    if not response or 'collection' not in response:
        print("Search failed")
        return pd.DataFrame()

    search_data = []

    for result in response['collection']:
        definitions = result.get("definition")

        # Tolerate a missing links object or a null ontology link
        links = result.get("links")
        if links and isinstance(links, dict):
            ontology = (links.get("ontology") or "").split("/")[-1]
        else:
            ontology = None

        search_data.append({
            "class_id": result.get("@id"),
            "pref_label": result.get("prefLabel"),
            "definition": definitions[0] if definitions else None,
            "ontology": ontology,
            "match_type": result.get("matchType"),
            "cui": result.get("cui"),
            "synonyms": result.get("synonym", []),
        })

    df = pd.DataFrame(search_data)
    print(f"Found {len(df)} results")

    return df

# Example 2: Extract classes from the first available target ontology
print("\nExample 2: Extracting classes from ontologies...")

# Candidates in priority order: the earliest one present in the listing is used
target_ontologies = ["GO", "MESH", "ICD10CM", "HPO"]  # Try these in order

sample_classes = pd.DataFrame()
selected_ontology = None

if 'all_ontologies' in locals() and not all_ontologies.empty:
    # Find available ontologies from our target list
    available_targets = all_ontologies[all_ontologies['acronym'].isin(target_ontologies)]

    if not available_targets.empty:
        # Honour the priority order of target_ontologies; taking the first row
        # of available_targets would follow the API listing order instead
        listed_acronyms = set(available_targets['acronym'])
        selected_ontology = next(
            acronym for acronym in target_ontologies if acronym in listed_acronyms
        )
        print(f"Using ontology: {selected_ontology}")

        # Get a small sample of classes
        sample_classes = get_ontology_classes(selected_ontology, limit=20)

        if not sample_classes.empty:
            print(f"\nSample classes from {selected_ontology}:")
            print(sample_classes[["pref_label", "definition", "obsolete"]].head())
        else:
            print(f"No classes retrieved from {selected_ontology}")
    else:
        print("None of the target ontologies found in the available list")
else:
    print("No ontologies available for class extraction")