# TODO:
- farlo funzionare con tutti i nodi
- includere le informazioni di passi intermedi
- includere le informazioni dei collegamenti attraverso i nodi
 

If you are developing a tool that will require ICD-11 codes and the structure as shown in the ICD-11 Release then in the API you need to use the linearization endpoints with the linearizationname set to mms.

In [1]:
import requests
import json
import time
import os
import collections

# ========== CONFIGURATION ==========
CLIENT_ID = "6afc7b11-86c2-40f5-82d7-39c6c5869dec_38c25903-b150-4472-80cf-58648427fd58"
CLIENT_SECRET = "4kfkQUPOf4fBozRuZIMYsfff4n22mxUg/7dz85YXMqM="
TOKEN_ENDPOINT = 'https://icdaccessmanagement.who.int/connect/token'
ROOT_ENTITY_URI = 'https://id.who.int/icd/release/11/2023-01/mms'
MMS_BASE_URI = 'https://id.who.int/icd/release/11/2023-01/mms/'
OUTPUT_DIR = 'icd11_mms_crawled_entities'
SLEEP_TIME = 0.3  # To avoid rate limiting
MAX_ENTITIES = 26
# ===================================

def get_access_token():
    payload = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'scope': 'icdapi_access',
        'grant_type': 'client_credentials'
    }
    response = requests.post(TOKEN_ENDPOINT, data=payload)
    response.raise_for_status()
    token_data = response.json()
 
    return token_data['access_token']

def fetch_entity(entity_id, headers, visited, output_dir):
    if entity_id in visited:
        return
    visited.add(entity_id)

    uri = f'{MMS_BASE_URI}{entity_id}'
    response = requests.get(uri, headers=headers)
    if response.status_code != 200:
        print(f"⚠️ Failed to fetch {uri}")
        return

    data = response.json()
   
    with open(os.path.join(output_dir, f"{entity_id}.json"), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Recurse into children (limit to x children per node for quick test)
    child_per_node_limit = 2
    child_uris = data.get('child', [])[:child_per_node_limit]
    for child_uri in child_uris:
        child_id = child_uri.split("/")[-1]
        time.sleep(SLEEP_TIME)
        fetch_entity(child_id, headers, visited, output_dir)

def crawl_icd11():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("🔐 Getting access token...")
    token = get_access_token()

    headers = {
        'Authorization': f'Bearer {token}',
        'Accept': 'application/json',
        'Accept-Language': 'en',
        'API-Version': 'v2'
    }

    
    response = requests.get(ROOT_ENTITY_URI, headers=headers)
    root_data = response.json()
    root_children = root_data.get('child', [])

    root_ids = [uri.split('/')[-1] for uri in root_children[:MAX_ENTITIES]]
    with open("MMS_ROOTS.json", 'w') as f:
        json.dump(root_ids, f)

    visited = set()
    for entity_id in root_ids:
        fetch_entity(entity_id, headers, visited, OUTPUT_DIR)

    print(f"✅ Done! MMS linearized entities saved to '{OUTPUT_DIR}'")
    
crawl_icd11()

🔐 Getting access token...
⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/other
⚠️ Failed to fetch https://id.who.int/icd/release/11/2023-01/mms/unspecified
✅ Done! MMS linearized entities saved to 'icd11_mms_crawled_entities'


In [2]:
import os
import json
import pandas as pd

def extract_icd_text_data(json_folder):
    records = []

    for filename in os.listdir(json_folder):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(json_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        code = data.get("code", "")
        id_ = data.get("@id", "").split("/")[-1]
        title = data.get("title", {}).get("@value", "")
        definition = data.get("definition", {}).get("@value", "")
        
        # Collect synonyms if available
        synonyms = []
        for s in data.get("synonym", []):
            label = s.get("label", {}).get("@value")
            if label:
                synonyms.append(label)
        
        full_text = f"{title}. {definition} {' '.join(synonyms)}"

        records.append({
            "id": id_,
            "code": code,
            "title": title,
            "definition": definition,
            "synonyms": "; ".join(synonyms),
            "full_text": full_text
        })

    return pd.DataFrame(records)

# Usage
df = extract_icd_text_data("icd11_mms_crawled_entities")
#df.to_csv("icd11_text_data.csv", index=False)


In [22]:
df.sort_values(by="code", inplace=True)
df[df['code'] != ''].iloc[-50:]

Unnamed: 0,id,code,title,definition,synonyms,full_text
929,1704767204,NA21.0,Laceration without foreign body of neck,,,Laceration without foreign body of neck.
587,1193245831,NA21.1,Laceration with foreign body of neck,,,Laceration with foreign body of neck.
295,1114138453,PA00,Unintentional land transport traffic event inj...,,,Unintentional land transport traffic event inj...
441,1460349350,PA01,Unintentional land transport traffic event inj...,,,Unintentional land transport traffic event inj...
287,1595005935,PA10,Unintentional land transport nontraffic event ...,,,Unintentional land transport nontraffic event ...
939,2133070765,PA11,Unintentional land transport nontraffic event ...,,,Unintentional land transport nontraffic event ...
427,415990995,PA60,Unintentional fall on the same level or from l...,,,Unintentional fall on the same level or from l...
728,1729453916,PA61,Unintentional fall from a height of 1 metre or...,,,Unintentional fall from a height of 1 metre or...
318,1494862045,PB80,Intentional self-harm by land transport road t...,,,Intentional self-harm by land transport road t...
1223,110743840,PB81,Intentional self-harm by land transport off-ro...,,,Intentional self-harm by land transport off-ro...
