# Import and Load

In [30]:
import re
import pandas as pd
from nltk_setup import setup_nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import matplotlib.pyplot as plt

In [31]:
# Run the project's NLTK setup helper (imported from nltk_setup) before any
# tokenizer / stop-word corpus is used; the cell output reports the status of
# punkt, averaged_perceptron_tagger and stopwords.
setup_nltk()

NLTK punkt is already downloaded
NLTK averaged_perceptron_tagger is already downloaded
NLTK stopwords is already downloaded


In [36]:
# Load the dataset that holds both the original definitions and the generated
# descriptions (path is relative to the notebook's working directory).
dg = pd.read_csv('icd11-25_data_clean_with_generated_descriptions.csv')
# Display the column names as a quick schema check.
dg.columns

Index(['id', 'code', 'title', 'browser_url', 'class_kind', 'definition',
       'parent', 'inclusions', 'foundation_children',
       'foundation_child_references', 'index_terms', 'related_entities',
       'full_text', 'children', 'postcoordination_scales',
       'index_term_references', 'exclusions', 'exclusion_references',
       'fully_specified_name', 'generated_description', 'chapter'],
      dtype='object')

In [37]:
# Per-column dtype and non-null counts; shows which text fields are sparsely populated.
dg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13960 entries, 0 to 13959
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           13960 non-null  int64 
 1   code                         13062 non-null  object
 2   title                        13960 non-null  object
 3   browser_url                  13960 non-null  object
 4   class_kind                   13960 non-null  object
 5   definition                   6894 non-null   object
 6   parent                       13960 non-null  int64 
 7   inclusions                   1113 non-null   object
 8   foundation_children          1468 non-null   object
 9   foundation_child_references  1468 non-null   object
 10  index_terms                  10855 non-null  object
 11  related_entities             2287 non-null   object
 12  full_text                    13960 non-null  object
 13  children                     32

# Comparative Analysis

## Pre-processing

In [26]:
# Cache for the stop-word set so the NLTK corpus file is read once, not once
# per processed row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text, 
                   to_lowercase=True, 
                   remove_punctuation=True, 
                   lemmatize=True,
                   remove_stopwords=True):
    """Normalise a piece of text and return it as a space-joined token string.

    Steps, each controlled by its flag and applied in this order:
    lowercase -> replace punctuation with spaces -> tokenize ->
    drop English stop words -> lemmatize.

    Parameters
    ----------
    text : any
        Text to process. Anything that is not a ``str`` (e.g. NaN / None from
        a pandas column) yields an empty string.
    to_lowercase : bool, default True
        Lowercase the text before tokenizing.
    remove_punctuation : bool, default True
        Replace every character that is neither a word character nor
        whitespace with a space. Underscores are word characters for ``\\w``
        and are therefore kept.
    lemmatize : bool, default True
        Lemmatize each token with ``WordNetLemmatizer`` (default POS).
    remove_stopwords : bool, default True
        Remove NLTK English stop words. Matching is case-insensitive so that
        capitalised stop words are also removed when ``to_lowercase=False``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""

    if to_lowercase:
        text = text.lower()

    if remove_punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)

    tokens = word_tokenize(text)

    if remove_stopwords:
        stop_words = _english_stop_words()
        # The NLTK list is lower-case, so compare on the lower-cased token;
        # the token itself keeps its original casing in the output.
        tokens = [token for token in tokens if token.lower() not in stop_words]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

def create_preprocessed_columns(df, original_col='definition',
                                generated_col='generated_description',
                                to_lowercase=True,
                                remove_punctuation=True,
                                lemmatize=True,
                                remove_stopwords=True):
    """Build a frame with the two text columns and their preprocessed versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'id'`` column plus ``original_col`` and ``generated_col``.
    original_col, generated_col : str
        Names of the text columns to preprocess.
    to_lowercase, remove_punctuation, lemmatize, remove_stopwords : bool
        Forwarded unchanged to ``preprocess_text`` for both columns.

    Returns
    -------
    pandas.DataFrame
        A copy holding ``'id'``, the two source columns, and two new columns
        named ``"<original_col>_processed"`` and ``"<generated_col>_processed"``.
        The input frame is not modified.
    """
    preprocessing_flags = {
        'to_lowercase': to_lowercase,
        'remove_punctuation': remove_punctuation,
        'lemmatize': lemmatize,
        'remove_stopwords': remove_stopwords,
    }

    def _clean(value):
        # Same settings for both columns so the results are comparable.
        return preprocess_text(value, **preprocessing_flags)

    # Work on an independent copy restricted to the columns of interest.
    result = df[['id', original_col, generated_col]].copy()

    # Original column first, generated column second (fixes the column order).
    for source_col in (original_col, generated_col):
        result[f"{source_col}_processed"] = result[source_col].apply(_clean)

    return result

## Information Density 

In [27]:
# For the density comparison only lowercase and strip punctuation; stop words
# and inflected forms are kept (lemmatize / remove_stopwords are off).
density_df = create_preprocessed_columns(
    dg,
    to_lowercase=True,
    remove_punctuation=True,
    lemmatize=False,
    remove_stopwords=False
)

### Length

### Vocabulary Richness / Complexity

## Overlap

## Named Entity Recognition

# Hierarchy Analysis

# Preparing for embeddings

In [38]:
# Re-inspect column dtypes and non-null counts before choosing the fields used for the embedding text.
dg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13960 entries, 0 to 13959
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           13960 non-null  int64 
 1   code                         13062 non-null  object
 2   title                        13960 non-null  object
 3   browser_url                  13960 non-null  object
 4   class_kind                   13960 non-null  object
 5   definition                   6894 non-null   object
 6   parent                       13960 non-null  int64 
 7   inclusions                   1113 non-null   object
 8   foundation_children          1468 non-null   object
 9   foundation_child_references  1468 non-null   object
 10  index_terms                  10855 non-null  object
 11  related_entities             2287 non-null   object
 12  full_text                    13960 non-null  object
 13  children                     32

In [50]:
# Keep only the identifier/label columns and the fields that
# create_vectorization_text reads; .copy() detaches the result from dg.
dv = dg[['id', 'chapter', 'code', 'title', 'generated_description', 'inclusions', 'related_entities', 'children', 'exclusions']].copy()

In [70]:
def _split_semicolon_list(raw):
    """Split a ';'-separated string into whitespace-stripped items."""
    return [item.strip() for item in raw.split(';')]


def _drop_already_mentioned(candidates, text):
    """Return the candidates that do not occur in ``text`` (case-insensitive substring test)."""
    haystack = text.lower()  # lower-case once instead of once per candidate
    return [candidate for candidate in candidates if candidate.lower() not in haystack]


def _titles_for_ids(raw_ids, title_lookup):
    """Resolve a ';'-separated id string to the titles known to ``title_lookup``.

    Tokens that are empty or not integers (e.g. 'other', 'unspecified') and ids
    without a usable title are skipped.
    """
    titles = []
    for token in _split_semicolon_list(raw_ids):
        try:
            entity_id = int(token)
        except ValueError:
            continue
        title = title_lookup.get(entity_id, '')
        if isinstance(title, str) and title:
            titles.append(title)
    return titles


def create_vectorization_text(df):
    """Add a ``'vectorization_text'`` column that merges each row's text fields.

    For every row the text starts with ``generated_description`` and, when the
    respective field is present, gains one extra line per field:

    * ``inclusions`` (';'-separated terms) not already contained in the text,
    * ``related_entities`` (';'-separated ids) resolved to titles, minus titles
      already contained in the text,
    * ``children`` (';'-separated ids) resolved to titles,
    * ``exclusions`` (';'-separated terms) not already contained in the text.

    "Already contained" is a case-insensitive substring test against the text
    built so far. Ids are resolved through the ``id`` -> ``title`` mapping of
    ``df`` itself; ids that are not integers or not present in ``df`` are
    ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``title``, ``generated_description``,
        ``inclusions``, ``related_entities``, ``children`` and ``exclusions``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the additional ``'vectorization_text'`` column.
        A row whose ``generated_description`` is missing gets only the extra
        lines (or an empty string) instead of raising.
    """
    df_vector = df.copy()

    # id -> title mapping used to turn referenced ids into readable names.
    title_lookup = dict(zip(df['id'], df['title']))

    def process_row(row):
        description = row['generated_description']
        # A missing description (NaN/None) contributes no line of its own.
        sections = [description] if isinstance(description, str) else []
        condition_name = row['title']

        if pd.notna(row['inclusions']):
            new_terms = _drop_already_mentioned(
                _split_semicolon_list(row['inclusions']), '\n'.join(sections))
            if new_terms:
                sections.append(
                    f"{condition_name} includes the following diagnostic terms and synonyms: {', '.join(new_terms)}.")

        if pd.notna(row['related_entities']):
            new_related_titles = _drop_already_mentioned(
                _titles_for_ids(row['related_entities'], title_lookup), '\n'.join(sections))
            if new_related_titles:
                sections.append(
                    f"{condition_name} is clinically related to: {', '.join(new_related_titles)}.")

        if pd.notna(row['children']):
            # Child titles are listed even if they already appear in the text.
            child_titles = _titles_for_ids(row['children'], title_lookup)
            if child_titles:
                sections.append(
                    f"{condition_name} is a parent category that includes the following specific conditions: {', '.join(child_titles)}.")

        if pd.notna(row['exclusions']):
            new_terms = _drop_already_mentioned(
                _split_semicolon_list(row['exclusions']), '\n'.join(sections))
            if new_terms:
                sections.append(
                    f"Terms that are not categorized under {condition_name} are: {', '.join(new_terms)}.")

        return '\n'.join(sections)

    df_vector['vectorization_text'] = df_vector.apply(process_row, axis=1)

    return df_vector

In [73]:
# Build the combined text for every row, then keep only the identifying
# columns plus the new 'vectorization_text' column.
df_with_vectorization = create_vectorization_text(dv)[['id', 'chapter', 'code', 'title', 'vectorization_text']]

In [76]:
# Persist the result next to the notebook.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column; confirm downstream readers expect that, otherwise
# pass index=False.
df_with_vectorization.to_csv('icd11-25_data_vectorization.csv')