### Import The Dependencies

In [4]:
# vaex is the only dependency installed from inside the notebook. `%conda`
# (unlike `!conda`) targets the running kernel's environment and answers the
# confirmation prompt automatically.
# %pip install vaex
%conda install -c conda-forge vaex

import logging
import os
import random
from collections import Counter
from functools import partial
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import vaex


The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/win-64::anaconda-catalogs==0.2.0=py311haa95532_0
  - defaults/win-64::anaconda-navigator==2.5.0=py311haa95532_0
  - defaults/win-64::astropy==5.3.4=py311hd7041d2_0
  - defaults/win-64::black==23.11.0=py311haa95532_0
  - defaults/noarch::bleach==4.1.0=pyhd3eb1b0_0
  - defaults/win-64::bokeh==3.3.4=py311h746a85d_0
  - defaults/win-64::conda==23.7.4=py311haa95532_0
  - defaults/win-64::conda-build==3.26.1=py311haa95532_0
  - defaults/noarch::conda-index==0.4.0=pyhd3eb1b0_0
  - defaults/win-64::conda-libmamba-solver==23.7.0=py311haa95532_0
  - defaults/noarch::conda-token==0.4.0=pyhd3eb1b0_0
  - defaults/win-64::dask==2023.11.0=py311haa95532_0
  - defaults/win-64::dask-core==2023.11.0=py311haa95532_0
  - defaults/win-64::datasets==2.12.0=py311haa95532_0
  - defaults/win-64::datashader==0.16.0=py311haa95532_0
  - defaults/win-64::distributed==2023.11.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Solving environment: ...working... unsuccessful attempt using repodata from current_repodata.json, retrying with next repodata source.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.


ModuleNotFoundError: No module named 'vaex'

### Define the directories 

In [None]:
# Root data directory: the raw `note/` and `hosp/` files are read from here and
# the final feather files are written here. Set the MIMIC_DATA_DIR environment
# variable to point somewhere else; the original location remains the default.
parent_dir = Path(
    os.environ.get(
        "MIMIC_DATA_DIR", "E:/Paid projects/NLP_ICD10/Final Code/final_data"
    )
)

### Utilities

In [None]:
def load_gz_file_into_df(path: Path, dtype: Optional[dict] = None):
    """Load a gzipped CSV as a pandas dataframe, caching it as a feather file.

    The part of the file name before its first "." is used as the stem. When
    `<stem>.feather` already exists next to `path`, that file is read and
    returned. Otherwise `<stem>.csv.gz` is parsed (forwarding `dtype` to
    `pd.read_csv`), written to `<stem>.feather`, and returned.
    """
    folder = path.parents[0]
    stemmed_filename = path.name.split(".")[0]
    feather_path = folder / f"{stemmed_filename}.feather"
    csv_path = folder / f"{stemmed_filename}.csv.gz"

    if not feather_path.is_file():
        # No cache yet: parse the compressed CSV once and store the result.
        logging.info(
            f"Loading data from {stemmed_filename}.csv.gz into a pandas dataframe. This may take a while..."
        )
        frame = pd.read_csv(csv_path, compression="gzip", dtype=dtype)
        frame.to_feather(feather_path)
        return frame

    logging.info(
        f"{stemmed_filename}.feather already exists, loading data from {stemmed_filename}.feather into a pandas dataframe."
    )
    return pd.read_feather(feather_path)

def reformat_icd(code: str, version: int, is_diag: bool) -> str:
    """Format an ICD code with the formatter that matches its ICD version.

    Raises:
        ValueError: If `version` is neither 9 nor 10.
    """
    if version not in (9, 10):
        raise ValueError("version must be 9 or 10")
    if version == 9:
        return reformat_icd9(code, is_diag)
    return reformat_icd10(code, is_diag)

def reformat_icd10(code: str, is_diag: bool) -> str:
    """Put the period back into an ICD-10 code.

    Any period already present is stripped first, so the function is
    idempotent. Procedure codes (`is_diag=False`) are returned without a
    period. Diagnosis codes get a period after the third character, but only
    when there is something after it: a three-character code such as "I10" is
    returned unchanged rather than as "I10.", matching `reformat_icd9`.

    Args:
        code: The ICD-10 code, with or without a period.
        is_diag: True for a diagnosis code, False for a procedure code.

    Returns:
        The formatted code.
    """
    code = code.replace(".", "")
    if not is_diag or len(code) <= 3:
        return code
    return code[:3] + "." + code[3:]

def reformat_icd9(code: str, is_diag: bool) -> str:
    """
    Insert the period into an ICD-9 code that was stored without one.

    Procedure codes get the period after the second character, diagnosis codes
    after the third, and diagnosis codes starting with "E" after the fourth.
    Codes that are not longer than that prefix are returned without a period.
    """
    bare_code = code.replace(".", "")

    # Number of characters that precede the period for this kind of code.
    if not is_diag:
        prefix_length = 2
    elif bare_code.startswith("E"):
        prefix_length = 4
    else:
        prefix_length = 3

    if len(bare_code) <= prefix_length:
        return bare_code
    return bare_code[:prefix_length] + "." + bare_code[prefix_length:]

def reformat_code_dataframe(row: pd.DataFrame, col: str) -> pd.Series:
    """Collapse column `col` of `row` into one sorted list, keyed by `col`."""
    sorted_codes = row[col].sort_values().tolist()
    return pd.Series({col: sorted_codes})

class TextPreprocessor:
    """Configurable clean-up of the `TEXT_COLUMN` column of a vaex dataframe.

    Each constructor flag switches one clean-up step on or off. The steps run
    in the fixed order written in `__call__`; whitespace is always collapsed
    and stripped at the end, whatever the flags are.
    """

    def __init__(
        self,
        lower: bool = True,
        remove_special_characters_mullenbach: bool = True,
        remove_special_characters: bool = False,
        remove_digits: bool = True,
        remove_accents: bool = False,
        remove_brackets: bool = False,
        convert_danish_characters: bool = False,
    ) -> None:
        """Store which clean-up steps to run.

        Args:
            lower: Lower-case the text.
            remove_special_characters_mullenbach: Replace every run of
                characters other than ASCII letters and digits with one space.
            remove_special_characters: Turn newlines, "/" and "-" into spaces,
                then delete every character that is not an ASCII letter, a
                digit or a space.
            remove_digits: Replace runs of whitespace-separated digit-only
                tokens with a single space.
            remove_accents: Map é/è/ê to e, á/à/â to a and ô/ó/ò to o.
            remove_brackets: Delete square-bracketed spans, brackets included.
            convert_danish_characters: Map å to aa, æ to ae and ø to oe.
        """
        self.lower = lower
        self.remove_special_characters_mullenbach = remove_special_characters_mullenbach
        self.remove_digits = remove_digits
        self.remove_accents = remove_accents
        self.remove_special_characters = remove_special_characters
        self.remove_brackets = remove_brackets
        self.convert_danish_characters = convert_danish_characters

    def __call__(self, df: vaex.dataframe.DataFrame) -> vaex.dataframe.DataFrame:
        """Apply the enabled clean-up steps to `df[TEXT_COLUMN]`.

        The column is reassigned on the dataframe that was passed in, and that
        same dataframe is returned.
        """
        # Regex patterns containing backslash classes are raw strings so that
        # Python does not treat "\[", "\s" or "\d" as (invalid) string escapes.
        if self.lower:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.lower()
        if self.convert_danish_characters:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("å", "aa", regex=True)
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("æ", "ae", regex=True)
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("ø", "oe", regex=True)
        if self.remove_accents:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("é|è|ê", "e", regex=True)
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("á|à|â", "a", regex=True)
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("ô|ó|ò", "o", regex=True)
        if self.remove_brackets:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace(r"\[[^]]*\]", "", regex=True)
        if self.remove_special_characters:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace("\n|/|-", " ", regex=True)
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace(
                "[^a-zA-Z0-9 ]", "", regex=True
            )
        if self.remove_special_characters_mullenbach:
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace(
                "[^A-Za-z0-9]+", " ", regex=True
            )
        if self.remove_digits:
            # The pattern needs whitespace on both sides, so a digit-only token
            # at the very start or very end of the text is left in place.
            df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace(r"(\s\d+)+\s", " ", regex=True)

        # Always normalise whitespace last, after the steps above introduced spaces.
        df[TEXT_COLUMN] = df[TEXT_COLUMN].str.replace(r"\s+", " ", regex=True)
        df[TEXT_COLUMN] = df[TEXT_COLUMN].str.strip()
        return df

def preprocess_documents(
    df: pd.DataFrame, preprocessor: TextPreprocessor
) -> pd.DataFrame:
    """Clean the note text and add per-document size columns.

    The pandas dataframe is converted to vaex, run through `preprocessor`,
    and converted back to pandas.

    Args:
        df: Dataframe with a `TEXT_COLUMN` column and a `TARGET_COLUMN` column.
        preprocessor: The text clean-up to apply.

    Returns:
        A pandas dataframe with the cleaned text plus two new columns:
        `num_words` (number of spaces in the cleaned text, plus one) and
        `num_targets` (`len` of each `TARGET_COLUMN` entry).
    """
    with vaex.cache.memory_infinite():  # pylint: disable=not-context-manager
        df = vaex.from_pandas(df)
        df = preprocessor(df)
        # Read the text through TEXT_COLUMN, like the preprocessor does, rather
        # than through a hard-coded `.text` attribute.
        df["num_words"] = df[TEXT_COLUMN].str.count(" ") + 1
        df["num_targets"] = df[TARGET_COLUMN].apply(len)
        return df.to_pandas_df()

In [None]:
def filter_codes(df: pd.DataFrame, columns: list[str], min_count: int) -> pd.DataFrame:
    """Drop codes that appear fewer than `min_count` times.

    Each column is counted and filtered independently. Rows are never removed;
    a row whose codes are all rare ends up with an empty list. The number of
    unique codes before and after filtering is printed for every column.

    Args:
        df (pd.DataFrame): The codes dataframe. It is not modified.
        columns (list[str]): Names of the columns to filter; every entry of
            such a column must be a list of codes.
        min_count (int): The minimum number of times a code must appear

    Returns:
        pd.DataFrame: A copy of `df` with the listed columns filtered.
    """
    # Work on a copy so the caller's dataframe keeps its unfiltered columns.
    df = df.copy()
    for col in columns:
        code_counts = Counter(code for codes in df[col] for code in codes)
        codes_to_keep = {
            code for code, count in code_counts.items() if count >= min_count
        }
        df[col] = df[col].apply(lambda x: [code for code in x if code in codes_to_keep])
        print(f"Number of unique codes in {col} before filtering: {len(code_counts)}")
        print(f"Number of unique codes in {col} after filtering: {len(codes_to_keep)}")

    return df


def parse_codes_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a one-row-per-code table into one row of sorted codes per group.

    Rows without an `icd_code` and repeated (admission, code) pairs are
    dropped, then the codes are gathered into a sorted list for every
    (subject, admission, ICD version) combination.
    """
    renamed = df.rename(
        columns={"hadm_id": ID_COLUMN, "subject_id": SUBJECT_ID_COLUMN}
    )
    coded = renamed.dropna(subset=["icd_code"])
    unique_codes = coded.drop_duplicates(subset=[ID_COLUMN, "icd_code"])

    group_keys = [SUBJECT_ID_COLUMN, ID_COLUMN, "icd_version"]
    collect_codes = partial(reformat_code_dataframe, col="icd_code")
    return unique_codes.groupby(group_keys).apply(collect_codes).reset_index()

def parse_notes_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the notes columns, then drop rows with missing or repeated text.

    A row counts as repeated when both its admission id and its text match an
    earlier row.
    """
    column_names = {
        "hadm_id": ID_COLUMN,
        "subject_id": SUBJECT_ID_COLUMN,
        "text": TEXT_COLUMN,
    }
    return (
        df.rename(columns=column_names)
        .dropna(subset=[TEXT_COLUMN])
        .drop_duplicates(subset=[ID_COLUMN, TEXT_COLUMN])
    )



### Load the data

In [None]:
# Discharge notes come from `note/`, the two code tables from `hosp/`.
# ICD codes are read as strings (presumably so leading zeros are not lost).
mimic_notes = load_gz_file_into_df(path=parent_dir / "note/discharge.csv.gz")
mimic_proc = load_gz_file_into_df(
    path=parent_dir / "hosp/procedures_icd.csv.gz",
    dtype={"icd_code": str},
)
mimic_diag = load_gz_file_into_df(
    path=parent_dir / "hosp/diagnoses_icd.csv.gz",
    dtype={"icd_code": str},
)

### Format the codes by adding decimal points

In [None]:
def _formatted_codes(codes_df: pd.DataFrame, is_diag: bool) -> list:
    """Return the `icd_code` column of `codes_df` with periods inserted.

    Each code is formatted according to the `icd_version` on the same row.
    Iterating the two columns in lock-step avoids building a Series per row,
    which is what `DataFrame.apply(axis=1)` does.
    """
    return [
        reformat_icd(code=code, version=version, is_diag=is_diag)
        for code, version in zip(codes_df["icd_code"], codes_df["icd_version"])
    ]


# Formatting strips any existing period first, so re-running this cell is safe.
mimic_proc["icd_code"] = _formatted_codes(mimic_proc, is_diag=False)
mimic_diag["icd_code"] = _formatted_codes(mimic_diag, is_diag=True)

### Process codes and notes

In [None]:
# Define column names
# The parsing and preprocessing helpers read these constants when they are
# called, so this cell has to run before any of them is used.
ID_COLUMN = "_id"  # name given to the renamed `hadm_id` column
TEXT_COLUMN = "text"  # column holding the note text
TARGET_COLUMN = "target"  # column later filled with each note's combined code list
SUBJECT_ID_COLUMN = "subject_id"

# Process
# Each variable is overwritten with its parsed version, so this cell is not
# safe to run twice without reloading the data first.
mimic_proc = parse_codes_dataframe(mimic_proc)
mimic_diag = parse_codes_dataframe(mimic_diag)
mimic_notes = parse_notes_dataframe(mimic_notes)

### Merge the codes and notes into an ICD-9 and an ICD-10 dataframe

In [None]:
MIN_TARGET_COUNT = 10  # Minimum number of times a code must appear to be included
# Text clean-up configuration. Every option is spelled out explicitly, even
# where the value equals the TextPreprocessor default.
preprocessor = TextPreprocessor(
    lower=True,
    remove_special_characters_mullenbach=True,
    remove_special_characters=False,
    remove_digits=True,
    remove_accents=False,
    remove_brackets=False,
    convert_danish_characters=False,
)

# Seeds Python's global `random` generator. NOTE(review): no later use of
# `random` is visible in this part of the notebook - confirm it is still needed.
random.seed(10)

In [None]:
# Split each code table by ICD version and give the code column a
# version-specific name so the tables can be merged side by side.
mimic_proc_9 = mimic_proc.loc[mimic_proc["icd_version"] == 9].rename(
    columns={"icd_code": "icd9_proc"}
)
mimic_proc_10 = mimic_proc.loc[mimic_proc["icd_version"] == 10].rename(
    columns={"icd_code": "icd10_proc"}
)

mimic_diag_9 = mimic_diag.loc[mimic_diag["icd_version"] == 9].rename(
    columns={"icd_code": "icd9_diag"}
)
mimic_diag_10 = mimic_diag.loc[mimic_diag["icd_version"] == 10].rename(
    columns={"icd_code": "icd10_diag"}
)

# Left-join the codes onto the notes: every note is kept, and a note without
# codes of a given kind gets a missing value in that column.
mimiciv_9 = mimic_notes.merge(
    mimic_proc_9[[ID_COLUMN, "icd9_proc"]], on=ID_COLUMN, how="left"
).merge(mimic_diag_9[[ID_COLUMN, "icd9_diag"]], on=ID_COLUMN, how="left")

mimiciv_10 = mimic_notes.merge(
    mimic_proc_10[[ID_COLUMN, "icd10_proc"]], on=ID_COLUMN, how="left"
).merge(mimic_diag_10[[ID_COLUMN, "icd10_diag"]], on=ID_COLUMN, how="left")



In [None]:
# remove notes with no codes
mimiciv_9 = mimiciv_9.dropna(subset=["icd9_proc", "icd9_diag"], how="all")
mimiciv_10 = mimiciv_10.dropna(subset=["icd10_proc", "icd10_diag"], how="all")


def _as_code_list(value) -> list:
    """Return `value` when it is a list of codes, otherwise an empty list.

    Checking for a list is more reliable than `value is np.nan`: the identity
    test only matches the `np.nan` singleton and misses other missing values.
    """
    return value if isinstance(value, list) else []


# convert NaNs to empty lists
mimiciv_9["icd9_proc"] = mimiciv_9["icd9_proc"].apply(_as_code_list)
mimiciv_9["icd9_diag"] = mimiciv_9["icd9_diag"].apply(_as_code_list)
mimiciv_10["icd10_proc"] = mimiciv_10["icd10_proc"].apply(_as_code_list)
mimiciv_10["icd10_diag"] = mimiciv_10["icd10_diag"].apply(_as_code_list)

# drop codes that occur fewer than MIN_TARGET_COUNT times
mimiciv_9 = filter_codes(mimiciv_9, ["icd9_proc", "icd9_diag"], MIN_TARGET_COUNT)
mimiciv_10 = filter_codes(mimiciv_10, ["icd10_proc", "icd10_diag"], MIN_TARGET_COUNT)

In [None]:
# A note's target is its procedure codes followed by its diagnosis codes.
mimiciv_9[TARGET_COLUMN] = mimiciv_9["icd9_proc"] + mimiciv_9["icd9_diag"]
mimiciv_10[TARGET_COLUMN] = mimiciv_10["icd10_proc"] + mimiciv_10["icd10_diag"]

# Keep only the notes that still have at least one code after filtering.
mimiciv_9 = mimiciv_9.loc[mimiciv_9[TARGET_COLUMN].map(len) > 0]
mimiciv_10 = mimiciv_10.loc[mimiciv_10[TARGET_COLUMN].map(len) > 0]


### Text preprocess the notes

In [None]:
# Clean the note text and add the `num_words` / `num_targets` columns.
mimiciv_9 = preprocess_documents(mimiciv_9, preprocessor)
mimiciv_10 = preprocess_documents(mimiciv_10, preprocessor)

### Save files to disk

In [None]:
# Write one feather file per ICD version into the data directory.
mimiciv_9.to_feather(path=parent_dir / "mimiciv_icd9.feather")
mimiciv_10.to_feather(path=parent_dir / "mimiciv_icd10.feather")