In [1]:
import pandas as pd

In [2]:
# Read the raw data CSV into the working DataFrame `d`
d = pd.read_csv('icd11_data_raw.csv')

In [3]:
# Display the column names of the loaded frame
d.columns

Index(['id', 'code', 'title', 'browser_url', 'class_kind', 'definition',
       'parent', 'inclusions', 'foundation_children',
       'foundation_child_references', 'index_terms', 'related_entities',
       'full_text', 'children', 'postcoordination_scales',
       'index_term_references', 'exclusions', 'exclusion_references',
       'fully_specified_name'],
      dtype='object')

# Generated descriptions

In [107]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables
# (get_medical_definition below reads NEBIUS_API_KEY from the environment;
# presumably it is supplied through a local .env file — confirm)
load_dotenv()

# Define the system message
# Sent as the "system" role message on every request made by get_medical_definition.
# NOTE(review): the prompt text contains a few typos ("an expert and experienced from",
# "description descriptions", "a diseases"). It is left untouched here because editing it
# changes the model input; fix it only when all descriptions are regenerated together.
SYSTEM_MESSAGE = """
Your name is Llama3-OpenBioLLM-70B. You are an expert and experienced from the healthcare and biomedical domain with extensive medical knowledge.
Your mission is to provide comprehensive, technical, and accurate medical description descriptions of diseases and disease categories.
The user will input the name of a diseases or the name of a category of diseases.
You will provide the description of the query. 
Always structure the sentences of your response in this order: overview, causes, symptoms, transmission, diagnosis. 
Write full sentences, using a concise and clear language.
"""

# Clients already built, keyed by API key. Reusing one client lets repeated
# (and threaded) calls share its connection pool instead of creating a new
# client for every single request.
_NEBIUS_CLIENTS = {}


def _get_nebius_client(api_key):
    """Return the shared Nebius client for `api_key`, creating it on first use."""
    client = _NEBIUS_CLIENTS.get(api_key)
    if client is None:
        # setdefault keeps whichever client was stored first if two threads
        # reach this point at the same time.
        client = _NEBIUS_CLIENTS.setdefault(
            api_key,
            OpenAI(
                base_url="https://api.studio.nebius.com/v1/",
                api_key=api_key),
        )
    return client


def get_medical_definition(query, temperature=0.2):
    """
    Get a medical definition using the Nebius API and Llama3-OpenBioLLM-70B model.

    Args:
        query (str): The medical condition or term to define
        temperature (float, optional): Controls randomness in the response.
                                     Lower values make the output more focused and deterministic.
                                     Defaults to 0.2.

    Returns:
        str: The model's response containing the medical definition

    Raises:
        ValueError: If the API key is not found in environment variables
        Exception: Any error raised by the API call is printed and re-raised unchanged
    """
    # Check if API key exists
    api_key = os.getenv('NEBIUS_API_KEY')
    if not api_key:
        raise ValueError("NEBIUS_API_KEY not found in environment variables")

    # Reuse the client created for this key on an earlier call, if any
    client_nebius = _get_nebius_client(api_key)

    try:
        response = client_nebius.chat.completions.create(
            model="aaditya/Llama3-OpenBioLLM-70B",
            temperature=temperature,
            messages=[
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": f"Describe {query}?"}
            ],
            max_completion_tokens=800 # set by looking at the length of the 3rd longest definition, first and second longest need cleaning
        )

        # Return the response text
        return response.choices[0].message.content

    except Exception as e:
        # Log for the notebook user, then let the caller decide how to handle it
        print(f"Error getting medical definition: {str(e)}")
        raise

In [6]:
from tqdm import tqdm
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np

# Create a new column for generated descriptions
# (placeholder values; each row is filled in further down in this cell as worker results arrive)
d['generated_description'] = None

def process_row(row_data):
    """
    Generate a description for a single (index, row) pair.

    Args:
        row_data: Tuple ``(idx, row)`` where ``row`` supports ``row['title']``
                  (e.g. a pandas Series as yielded by ``DataFrame.iterrows()``).

    Returns:
        tuple: ``(idx, description)``; ``description`` is None when the title
               could not be read or the generation call failed.
    """
    idx, row = row_data
    # Bound before the try block so the error message below can always be
    # formatted, even when reading row['title'] is the step that failed.
    disease_title = None
    try:
        disease_title = row['title']
        generated_desc = get_medical_definition(disease_title)
        return idx, generated_desc
    except Exception as e:
        # Best effort: report the failure and leave this row's description empty
        print(f"\nError processing row {idx} for disease {disease_title}: {str(e)}")
        return idx, None

# Number of threads to use
num_threads = 8

# Only the first `max_rows` rows of `d` are processed, and a checkpoint is
# written after every `checkpoint_every` completed rows.
max_rows = 13373
checkpoint_every = 100
output_path = 'icd11_data_with_generated_descriptions.csv'

rows_to_process = d.iloc[:max_rows]

# Create a progress bar; the context manager closes it even if a result raises,
# and the total comes from the actual slice rather than a hard-coded count.
with tqdm(total=len(rows_to_process), desc="Generating descriptions") as pbar:
    # Process rows in parallel using threads
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all rows for processing
        future_to_row = {
            executor.submit(process_row, (idx, row)): idx
            for idx, row in rows_to_process.iterrows()
        }

        # Process results as they complete
        for completed, future in enumerate(as_completed(future_to_row), start=1):
            idx, desc = future.result()
            d.at[idx, 'generated_description'] = desc
            pbar.update(1)

            # Save progress every `checkpoint_every` completed rows. Counting
            # completions (not index labels) keeps the interval regular whatever
            # order the futures finish in.
            if completed % checkpoint_every == 0:
                d.to_csv(output_path, index=False)

# Final save
d.to_csv(output_path, index=False)

Generating descriptions: 100%|██████████| 13373/13373 [2:58:04<00:00,  1.25it/s]  


In [108]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def generate_descriptions_for_dataframe(df, num_threads=8, batch_size=100):
    """
    Add generated descriptions for disease titles to a dataframe.

    Each title is sent to ``get_medical_definition`` from a pool of worker
    threads. Rows whose generation fails keep a missing description and an
    error line is printed for them.

    Args:
        df: A pandas DataFrame containing at least a 'title' column.
            Its index may be arbitrary (non-contiguous or non-unique).
        num_threads: Number of parallel threads to use (default: 8)
        batch_size: Write an intermediate CSV ('temp_descriptions_progress.csv')
            after every `batch_size` completed rows (default: 100).
            A falsy value (0 or None) disables intermediate saves.

    Returns:
        DataFrame with added 'generated_description' column. The input frame
        is not modified and the index of the result matches the input.

    Raises:
        KeyError: If `df` has no 'title' column
    """
    if 'title' not in df.columns:
        raise KeyError("DataFrame must contain a 'title' column")

    # Create a copy of the dataframe to avoid modifying the original
    result_df = df.copy()

    # Add the new column
    result_df['generated_description'] = None
    # Results are written by position, so rows are filled correctly even when
    # index labels repeat.
    desc_col = result_df.columns.get_loc('generated_description')

    def process_row(row_data):
        """Generate the description for one (position, label, title) triple."""
        pos, idx, disease_title = row_data
        try:
            return pos, get_medical_definition(disease_title)
        except Exception as e:
            # Best effort: report the failure and leave this row empty
            print(f"\nError processing row {idx} for disease {disease_title}: {str(e)}")
            return pos, None

    temp_file_path = 'temp_descriptions_progress.csv'

    # The context manager closes the progress bar even if a result raises
    with tqdm(total=len(result_df), desc="Generating descriptions") as pbar:
        # Process rows in parallel using threads
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Submit all rows for processing
            futures = [
                executor.submit(process_row, (pos, idx, title))
                for pos, (idx, title) in enumerate(result_df['title'].items())
            ]

            # Process results as they complete
            for completed, future in enumerate(as_completed(futures), start=1):
                pos, desc = future.result()
                result_df.iat[pos, desc_col] = desc
                pbar.update(1)

                # Save progress at regular intervals, counted in completed rows
                # so it does not depend on what the index labels happen to be.
                if batch_size and completed % batch_size == 0:
                    result_df.to_csv(temp_file_path, index=False)

    return result_df

# Merging generated descriptions into the ICD-11 2025 release

In [50]:
# 2023 table with its generated descriptions, limited to the first 13373 rows
df23 = pd.read_csv('icd11-23_data_with_generated_descriptions.csv').iloc[:13373]
print(df23.shape)

# Raw 2025 table that still needs descriptions
df25 = pd.read_csv('icd11-25_data_raw.csv')
print(df25.shape)

(13373, 20)
(13960, 19)


In [78]:
# Keep only the join key and the generated text from the 2023 results
df23_ids_gen = df23[['id', 'generated_description']].copy()

# Drop any 'generated_description' column left over from a previous run of this
# cell first; otherwise re-running it yields 'generated_description_x' / '_y'
# columns and the cells below can no longer find 'generated_description'.
df25 = (
    df25
    .drop(columns='generated_description', errors='ignore')
    .merge(df23_ids_gen, on='id', how='left')
)

In [112]:
# Index labels of the 2025 rows that received no description from the merge
missing_mask = df25['generated_description'].isna()
target_ids = df25.loc[missing_mask].index.tolist()

In [115]:
# target_ids holds index labels (taken from df25.index), so select with .loc,
# the same way the write-back into df25 addresses these rows.
df25_missing = df25.loc[target_ids, ['id', 'title']].copy()
df25_sub = generate_descriptions_for_dataframe(df25_missing)

Generating descriptions:   0%|          | 0/618 [00:00<?, ?it/s]

Generating descriptions: 100%|██████████| 618/618 [08:25<00:00,  1.22it/s]


In [None]:
# Write the newly generated descriptions back into the rows that were missing one.
# NOTE(review): this relies on df25_sub carrying the same index labels as target_ids
# so the values line up row by row — confirm if the helper ever resets the index.
df25.loc[target_ids, 'generated_description'] = df25_sub['generated_description']

Unnamed: 0,id,code,title,browser_url,class_kind,definition,parent,inclusions,foundation_children,foundation_child_references,index_terms,related_entities,full_text,children,postcoordination_scales,index_term_references,exclusions,exclusion_references,fully_specified_name,generated_description


In [118]:
# Persist the 2025 table, including the generated_description column, without the index
df25.to_csv('icd11-25_data_with_generated_descriptions.csv', index=False)