If you are developing a tool that will require ICD-11 codes and the structure as shown in the ICD-11 Release then in the API you need to use the linearization endpoints with the linearizationname set to mms.

In [None]:
import os
import json
import time
import requests
import collections
import pandas as pd
from tqdm import tqdm

: 

# Crawlers 
DEPRECATED. See [ICDparser.py](ICDparser.py) for final version.

In [45]:
# ========== CONFIGURATION ==========
CLIENT_ID = "6afc7b11-86c2-40f5-82d7-39c6c5869dec_38c25903-b150-4472-80cf-58648427fd58"
CLIENT_SECRET = "4kfkQUPOf4fBozRuZIMYsfff4n22mxUg/7dz85YXMqM="
TOKEN_ENDPOINT = 'https://icdaccessmanagement.who.int/connect/token'
ROOT_ENTITY_URI = 'https://id.who.int/icd/release/11/2023-01/mms'
MMS_BASE_URI = 'https://id.who.int/icd/release/11/2023-01/mms/'
OUTPUT_DIR = 'crawling_results/icd11_mms_crawled_entities_recursive'
SLEEP_TIME = 0.2  # To avoid rate limiting
MAX_CHAPTERS = 1
MAX_CHILDREN = 100
# ===================================

def get_access_token():
    payload = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'scope': 'icdapi_access',
        'grant_type': 'client_credentials'
    }
    response = requests.post(TOKEN_ENDPOINT, data=payload)
    response.raise_for_status()
    token_data = response.json()
 
    return token_data['access_token']

def fetch_entity(entity_id, headers, visited, output_dir):
    if entity_id in visited:
        return
    visited.add(entity_id)

    uri = f'{MMS_BASE_URI}{entity_id}'
    response = requests.get(uri, headers=headers)
    if response.status_code != 200:
        print(f"⚠️ Failed to fetch {uri}")
        return

    data = response.json()
   
    with open(os.path.join(output_dir, f"{entity_id}.json"), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Recurse into children (limit to x children per node for quick test)
    child_uris = data.get('child', [])[:MAX_CHILDREN]
    for child_uri in child_uris:
        child_id = child_uri.split("/")[-1]
        time.sleep(SLEEP_TIME)
        fetch_entity(child_id, headers, visited, output_dir)

def crawl_icd11():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("🔐 Getting access token...")
    token = get_access_token()

    headers = {
        'Authorization': f'Bearer {token}',
        'Accept': 'application/json',
        'Accept-Language': 'en',
        'API-Version': 'v2'
    }

    response = requests.get(ROOT_ENTITY_URI, headers=headers)
    root_data = response.json()
    root_children = root_data.get('child', [])

    root_ids = [uri.split('/')[-1] for uri in root_children[:MAX_CHAPTERS]]
    with open("MMS_ROOTS.json", 'w') as f:
        json.dump(root_ids, f)

    visited = set()
    for entity_id in tqdm(root_ids):
        fetch_entity(entity_id, headers, visited, OUTPUT_DIR)

    print(f"✅ Done! MMS linearized entities saved to '{OUTPUT_DIR}'")
    
crawl_icd11()

🔐 Getting access token...


  0%|          | 0/1 [00:00<?, ?it/s]

⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/other
⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/unspecified


100%|██████████| 1/1 [07:26<00:00, 446.68s/it]

✅ Done! MMS linearized entities saved to 'icd11_mms_crawled_entities_recursive'





In [None]:
# ========== CONFIGURATION ==========
CLIENT_ID = "6afc7b11-86c2-40f5-82d7-39c6c5869dec_38c25903-b150-4472-80cf-58648427fd58"
CLIENT_SECRET = "4kfkQUPOf4fBozRuZIMYsfff4n22mxUg/7dz85YXMqM="
TOKEN_ENDPOINT = 'https://icdaccessmanagement.who.int/connect/token'
ROOT_ENTITY_URI = 'https://id.who.int/icd/release/11/2023-01/mms'
MMS_BASE_URI = 'https://id.who.int/icd/release/11/2023-01/mms/'

SLEEP_TIME = 0.2  # To avoid rate limiting
EXTRACT_CHAPTER_NUMBER = 1
MAX_CHILDREN = 1

OUTPUT_DIR = f'crawling_results/icd11_crawled_entities_iterative_CH_{str(EXTRACT_CHAPTER_NUMBER)}'
# ===================================

def get_access_token():
    payload = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'scope': 'icdapi_access',
        'grant_type': 'client_credentials'
    }
    response = requests.post(TOKEN_ENDPOINT, data=payload)
    response.raise_for_status()
    token_data = response.json()
 
    return token_data['access_token']

def crawl_icd11():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("🔐 Getting access token...")
    token = get_access_token()

    headers = {
        'Authorization': f'Bearer {token}',
        'Accept': 'application/json',
        'Accept-Language': 'en',
        'API-Version': 'v2'
    }

    # Fetch root structure
    response = requests.get(ROOT_ENTITY_URI, headers=headers)
    root_data = response.json()
    
    root_children = root_data.get('child', [])

    # Get root IDs and save them
    root_ids = [uri.split('/')[-1] for uri in root_children[:MAX_CHAPTERS]]
    with open("MMS_ROOTS.json", 'w') as f:
        json.dump(root_ids, f)

    # Initialize queue with root IDs
    chapter_to_process = root_ids[EXTRACT_CHAPTER_NUMBER-1]
    queue = collections.deque(chapter_to_process)
    visited = set(chapter_to_process)  # Track visited entities
    total_processed = 0
    
    # Iterative BFS traversal
    with tqdm(desc="Fetching entities", unit=" entity") as pbar:
        while queue:
            entity_id = queue.popleft()
            
            # Fetch entity data
            uri = f'{MMS_BASE_URI}{entity_id}'
            try:
                response = requests.get(uri, headers=headers)
                if response.status_code != 200:
                    print(f"⚠️ Failed to fetch {uri}")
                    continue
                
                data = response.json()
                
                # Save entity data
                with open(os.path.join(OUTPUT_DIR, f"{entity_id}.json"), 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                
                # Queue children (limited by MAX_CHILDREN)
                child_uris = data.get('child', [])[:MAX_CHILDREN]
                for child_uri in child_uris:
                    child_id = child_uri.split("/")[-1]
                    if child_id not in visited:
                        queue.append(child_id)
                        visited.add(child_id)
                
                # Update progress
                total_processed += 1
                pbar.update(1)
                
                # Sleep to avoid rate limiting
                time.sleep(SLEEP_TIME)
                
            except Exception as e:
                print(f"⚠️ Error processing {uri}: {str(e)}")
    
    print(f"✅ Done! {total_processed} MMS linearized entities saved to folder '{OUTPUT_DIR}'")

🔐 Getting access token...


Fetching entities: 23 entity [00:11,  2.27 entity/s]

⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/unspecified


Fetching entities: 39 entity [00:20,  1.73 entity/s]

⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/other


Fetching entities: 745 entity [07:02,  1.76 entity/s]

✅ Done! 745 MMS linearized entities saved to folder 'icd11_mms_crawled_entities_iterative'





# JSON to PD

In [6]:
def extract_icd_text_data(json_folder):
    records = []

    for filename in os.listdir(json_folder):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(json_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        # Initialize record with basic fields that might be present
        record = {}
        
        # Core identifiers (always try to get these)
        record["id"] = data.get("@id", "").split("/")[-1]
        record["code"] = data.get("code", "")
        
        # Optional basic metadata
        if "title" in data:
            record["title"] = data.get("title", {}).get("@value", "")
        if "browserUrl" in data:
            record["browser_url"] = data.get("browserUrl", "")
        if "classKind" in data:
            record["class_kind"] = data.get("classKind", "")
        
        # Optional detailed information
        if "definition" in data:
            record["definition"] = data.get("definition", {}).get("@value", "")
        if "fullySpecifiedName" in data:
            record["fully_specified_name"] = data.get("fullySpecifiedName", {}).get("@value", "")

        # Hierarchical relationships (if present)
        if "parent" in data:
            record["parent"] = "; ".join([p.split("/")[-1] for p in data.get("parent", [])])
        if "child" in data:
            record["children"] = "; ".join([c.split("/")[-1] for c in data.get("child", [])])

        # Inclusions (if present)
        if "inclusion" in data:
            inclusions = []
            for inclusion in data["inclusion"]:
                label = inclusion.get("label", {}).get("@value", "")
                if label:
                    inclusions.append(label)
            record["inclusions"] = "; ".join(inclusions)

        # Exclusions (if present)
        if "exclusion" in data:
            exclusions = []
            exclusion_refs = []
            for excl in data["exclusion"]:
                label = excl.get("label", {}).get("@value", "")
                if label:
                    exclusions.append(label)
                    if "foundationReference" in excl:
                        exclusion_refs.append(f"{label}: {excl['foundationReference']}")
            record["exclusions"] = "; ".join(exclusions)
            if exclusion_refs:
                record["exclusion_references"] = "; ".join(exclusion_refs)

        # Foundation children (if present)
        if "foundationChildElsewhere" in data:
            foundation_children = []
            foundation_child_refs = []
            for child in data["foundationChildElsewhere"]:
                label = child.get("label", {}).get("@value", "")
                if label:
                    foundation_children.append(label)
                    if "foundationReference" in child:
                        foundation_child_refs.append(f"{label}: {child['foundationReference']}")
            record["foundation_children"] = "; ".join(foundation_children)
            if foundation_child_refs:
                record["foundation_child_references"] = "; ".join(foundation_child_refs)

        # Index terms (if present)
        if "indexTerm" in data:
            index_terms = []
            index_term_refs = []
            for term in data["indexTerm"]:
                label = term.get("label", {}).get("@value", "")
                if label:
                    index_terms.append(label)
                    if "foundationReference" in term:
                        index_term_refs.append(f"{label}: {term['foundationReference']}")
            record["index_terms"] = "; ".join(index_terms)
            if index_term_refs:
                record["index_term_references"] = "; ".join(index_term_refs)

        # Postcoordination scales (if present)
        if "postcoordinationScale" in data:
            postcoord_scales = []
            for scale in data["postcoordinationScale"]:
                scale_info = {
                    "axis_name": scale.get("axisName", "").split("/")[-1],
                    "required": scale.get("requiredPostcoordination", ""),
                    "allow_multiple": scale.get("allowMultipleValues", ""),
                    "entities": "; ".join([e.split("/")[-1] for e in scale.get("scaleEntity", [])])
                }
                postcoord_scales.append(str(scale_info))
            record["postcoordination_scales"] = " || ".join(postcoord_scales)

        # Related entities (if present)
        if "relatedEntitiesInPerinatalChapter" in data:
            record["related_entities"] = "; ".join([e.split("/")[-1] for e in data["relatedEntitiesInPerinatalChapter"]])

        # Construct full text only from available fields
        full_text_parts = []
        for field in ["title", "definition", "fully_specified_name", "inclusions", 
                     "exclusions", "foundation_children", "index_terms"]:
            if field in record and record[field]:
                full_text_parts.append(record[field])
        record["full_text"] = " ".join(full_text_parts)

        records.append(record)

    # Create DataFrame with all possible columns, filling missing values with empty strings
    df = pd.DataFrame(records)
    df = df.fillna("")
    return df

In [8]:
df3 = extract_icd_text_data("crawling_results/icd11_crawled_entities_iterative_CH_3").sort_values(by="code").reset_index(drop=True)
df3[df3['id']=='224336967']

Unnamed: 0,id,code,title,browser_url,class_kind,definition,parent,index_terms,index_term_references,full_text,related_entities,children,exclusions,exclusion_references,foundation_children,foundation_child_references,postcoordination_scales,inclusions
11,224336967,,Anaemias or other erythrocyte disorders,https://icd.who.int/browse/2023-01/mms/en#2243...,block,,1766440644,,,Anaemias or other erythrocyte disorders Anaemi...,1148519290,963670118; 162762794; 330259189; 975559344; 19...,,,"Anaemia complicating pregnancy, childbirth or ...","Anaemia complicating pregnancy, childbirth or ...",,"Anaemia, unspecified"
