## Setup

In [165]:
import pandas as pd
import pickle
from typing import List, Dict


In [125]:
# --- Inputs -----------------------------------------------------------------
# Train / validation / test pickles and the code dictionary, all produced by the
# Med-BERT pre-training preprocessing step:
# https://github.com/ZhiGroup/Med-BERT/blob/master/Pretraining%20Code/Readme.md
PRETRAINED_TRAIN_PICKLE_PATH = '/sise/home/benshoho/projects/Med-BERT/Pretraining Code/Data Pre-processing Code/temp-try.bencs.train'
PRETRAINED_VALIDATION_PICKLE_PATH = '/sise/home/benshoho/projects/Med-BERT/Pretraining Code/Data Pre-processing Code/temp-try.bencs.valid'
PRETRAINED_TEST_PICKLE_PATH = '/sise/home/benshoho/projects/Med-BERT/Pretraining Code/Data Pre-processing Code/temp-try.bencs.test'
MEDBERT_CODES_DICT_PATH = '/sise/home/benshoho/projects/Med-BERT/Pretraining Code/Data Pre-processing Code/temp-try.types'

# --- Outputs ----------------------------------------------------------------
# Directory that receives the fine-tuning pickles written by this notebook.
MEDBERT_OUTPUT_PICKLES_DIR = '/sise/home/benshoho/projects/Med-BERT/Fine-Tunning-Tutorials/data/mimic-iv'

# --- Prediction target ------------------------------------------------------
# CCS category code(s) of the disease to predict.
TARGET_DISEASE_IDS = {'157'}



In [None]:
import pickle

# Quick look at the raw content of the Med-BERT codes dictionary file.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
with open(MEDBERT_CODES_DICT_PATH, 'rb') as codes_dict_file:
    loaded_data = pickle.load(codes_dict_file)
loaded_data

In [127]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# One object per split, read from the paths configured above.
medbert_train_data = _load_pickle(PRETRAINED_TRAIN_PICKLE_PATH)
medbert_validation_data = _load_pickle(PRETRAINED_VALIDATION_PICKLE_PATH)
medbert_test_data = _load_pickle(PRETRAINED_TEST_PICKLE_PATH)


In [128]:
# Number of records loaded for the train / validation / test splits.
len(medbert_train_data), len(medbert_validation_data), len(medbert_test_data)

(59118, 8445, 16890)

### Convert pickle to df


In [None]:
# Column names assigned to each record of the loaded pickles, and the subset
# this notebook does not use.
_record_columns = ['person_id', 'los', 'time_not_used', 'code', 'visits']
_unused_columns = ['los', 'time_not_used']

# Build one frame per split and keep only person_id / code / visits.
train_df, validation_df, test_df = (
    pd.DataFrame(records, columns=_record_columns).drop(columns=_unused_columns)
    for records in (medbert_train_data, medbert_validation_data, medbert_test_data)
)
train_df

In [None]:
# Load the dictionary used below to translate codes into Med-BERT ids.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
with open(MEDBERT_CODES_DICT_PATH, 'rb') as types_file:
    code_to_id_dict = pickle.load(types_file)
print(code_to_id_dict)

def convert_codes_to_ids(codes: List[str], code_to_id_dict: Dict[str, int]):
    """Translate every code in ``codes`` to its id via ``code_to_id_dict``.

    Each code is passed through ``str()`` before the lookup, so non-string
    codes are accepted as long as their string form is a key of the mapping.

    Returns:
        A list of ids, in the iteration order of ``codes``.

    Raises:
        KeyError: if a code has no entry in ``code_to_id_dict``.
    """
    return [code_to_id_dict[str(code)] for code in codes]


In [132]:
# Translate the configured CCS codes (strings) into Med-BERT ids.
# The conversion rebinds TARGET_DISEASE_IDS, so running this cell twice used to
# feed the already-converted ids back into the code->id lookup (wrong ids or a
# KeyError). Only convert while the collection still holds raw string codes.
if all(isinstance(code, str) for code in TARGET_DISEASE_IDS):
    TARGET_DISEASE_IDS = convert_codes_to_ids(TARGET_DISEASE_IDS, code_to_id_dict)
TARGET_DISEASE_IDS

[61]


## Convert to medbert format

In [None]:
# Inspect the training frame before SEP markers are inserted.
train_df


In [134]:
def add_sep_between_visits(row):
    """Insert a 'SEP' marker after every visit of one patient row.

    ``row.code`` and ``row.visits`` are parallel sequences. A 'SEP' entry is
    added to both outputs wherever two neighbouring positions carry different
    visit values, and once more at the very end (also for empty input).

    Returns:
        ``(codes_with_sep, visits_with_sep)`` - two new lists of equal length.
    """
    codes, visits = row.code, row.visits
    last_position = len(codes) - 1
    codes_with_sep, visits_with_sep = [], []

    for position, code in enumerate(codes):
        codes_with_sep.append(code)
        visits_with_sep.append(visits[position])
        if position == last_position:
            continue  # the closing SEP is appended after the loop
        if visits[position] != visits[position + 1]:
            codes_with_sep.append('SEP')
            visits_with_sep.append('SEP')

    # Terminate the last visit.
    for sequence in (codes_with_sep, visits_with_sep):
        sequence.append('SEP')

    assert len(codes_with_sep) == len(visits_with_sep)
    return codes_with_sep, visits_with_sep

# Rewrite code / visits of every split with the SEP-delimited versions.
# Not idempotent: running this cell a second time inserts additional SEP markers.
for split_df in (train_df, validation_df, test_df):
    split_df[['code', 'visits']] = split_df.apply(add_sep_between_visits, axis=1, result_type='expand')


In [137]:
def count_lists_with_fewer_seps(inner_list):
    """Return True when ``inner_list`` contains fewer than two 'SEP' entries."""
    return inner_list.count('SEP') < 2


def count_lists_with_target_before_sep(inner_list):
    """Return True if a target disease id appears before the first 'SEP'.

    Scans ``inner_list`` from the start and stops at the first 'SEP': only the
    entries preceding it are compared against the module-level
    ``TARGET_DISEASE_IDS``.

    Returns:
        True when one of those leading entries is in ``TARGET_DISEASE_IDS``,
        False otherwise (including for an empty list).
    """
    for item in inner_list:
        if item == 'SEP':
            # First separator reached without a hit; nothing after it matters.
            return False
        if item in TARGET_DISEASE_IDS:
            return True
    return False

# Dry run of the filtering applied later: count training rows that would be
# dropped (target before the first SEP, or fewer than two SEP markers) and
# print how many rows remain.
all_codes = list(train_df['code'])
total_count = sum(
    1
    for codes_list in all_codes
    if count_lists_with_target_before_sep(codes_list) or count_lists_with_fewer_seps(codes_list)
)

print(len(all_codes) - total_count)

18654


In [None]:
def target_disease_in_first_visit(row):
    """Return True if the row has a target disease code recorded in visit 1.

    ``row.code`` and ``row.visits`` are iterated in parallel; a position
    matches when its code is in the module-level ``TARGET_DISEASE_IDS`` and
    its visit value equals ``1``.
    """
    return any(
        code in TARGET_DISEASE_IDS and visit == 1
        for code, visit in zip(row.code, row.visits)
    )

# Filter every split in place (the frames are referenced by name in later cells).
for split_df in (train_df, validation_df, test_df):
    # Patients whose target disease already shows up in visit 1 are removed.
    has_target_in_first_visit = split_df.apply(target_disease_in_first_visit, axis=1)
    split_df.drop(index=split_df[has_target_in_first_visit].index, inplace=True)

    # Remove rows with only one visit.
    has_single_visit = split_df['visits'].apply(lambda visit_list: visit_list.count('SEP') < 2)
    split_df.drop(split_df[has_single_visit].index, inplace=True)

train_df.shape

In [None]:
# (rows, columns) of each split after the filtering above.
train_df.shape, validation_df.shape, test_df.shape

### To medbert pickle format for fine-tuning

In [141]:
def find_start_index_of_visit(visits, visit_num_to_predict):
    """Return the position of the first entry of ``visits`` equal to ``visit_num_to_predict``.

    Raises:
        ValueError: if ``visit_num_to_predict`` does not occur in ``visits``.
    """
    first_position = visits.index(visit_num_to_predict)
    return first_position

def get_history(codes, visits, visit_num_to_predict):
    """Return the part of ``codes`` and ``visits`` that precedes the visit to predict.

    The cut-off is the position where ``visit_num_to_predict`` first appears in
    ``visits``; both sequences are sliced up to (not including) that position.

    Returns:
        ``(history_codes, history_visits)``.
    """
    cutoff = find_start_index_of_visit(visits, visit_num_to_predict)
    history_codes = codes[:cutoff]
    history_visits = visits[:cutoff]
    return history_codes, history_visits

In [142]:
# Manual example: keep everything that precedes the first entry of visit 2.
# NOTE(review): the two example lists have different lengths (9 codes vs 15 visit
# entries), unlike real rows where code and visits are parallel - confirm this is intended.
get_history([1, 2, 4, 'SEP', 5, 'SEP', 6, 10, 'SEP'], [1, 1, 1, 1, 1, 1, 1, 'SEP', 2, 2, 2, 2, 2, 2, 'SEP'], 2)

([1, 2, 4, 'SEP', 5, 'SEP', 6, 10], [1, 1, 1, 1, 1, 1, 1, 'SEP'])

In [143]:
from typing import List
import random

def has_target_disease(target_disease_ids: List[str], codes: List[str]):
    """Return True if any entry of ``codes`` is contained in ``target_disease_ids``.

    Only a boolean is returned (no index). As a debugging aid, ``codes`` is
    printed when the first match sits at position 0.
    """
    first_match = next(
        (position for position, code in enumerate(codes) if code in target_disease_ids),
        None,
    )
    if first_match is None:
        return False
    if first_match == 0:
        print(codes)
    return True

def random_negative_visit(visits: List[str]):
    """Pick a random visit number to predict for a negative (no target) patient.

    Candidates are the distinct values of ``visits`` except the 'SEP' marker
    and visit ``1``; visit 1 is excluded because at least one earlier visit is
    needed as history.

    Returns:
        One of the remaining visit values, chosen with the global ``random`` RNG.

    Raises:
        KeyError: if ``visits`` contains no 'SEP' or no ``1``.
        IndexError: if no candidate visit is left (raised by ``random.choice``).
    """
    candidate_visits = set(visits)
    candidate_visits.remove('SEP')
    candidate_visits.remove(1)  # because we need history of at least one visit.
    # Set iteration order is an implementation detail; sampling from a sorted
    # list makes the pick a pure function of the RNG state, so seeding `random`
    # reproduces the same negative examples.
    return random.choice(sorted(candidate_visits))

def get_positive_visit(codes, visits):
    """Return the visit value at the position of the first target code in ``codes``.

    Target codes are those contained in the module-level ``TARGET_DISEASE_IDS``.
    When ``codes`` is empty, or holds no target code, a message is printed and
    an empty list is returned instead of a visit value.
    """
    if not codes:
        print('Empty list of codes!!!!!!!!!!!!!!!!!!!!!')
        return []

    first_target_position = next(
        (position for position, code in enumerate(codes) if code in TARGET_DISEASE_IDS),
        None,
    )
    if first_target_position is not None:
        # This is the visit to predict; everything before it serves as history.
        return visits[first_target_position]

    print('Index was not found!')
    return []

def preprocess_patient_records(codes, visits, was_target_found: bool):
    """Cut a patient's sequences down to the history preceding the visit to predict.

    The visit to predict is the first one holding a target code when
    ``was_target_found`` is true, otherwise a randomly chosen visit.

    Returns:
        ``(history_codes, history_visits)`` as produced by ``get_history``.
    """
    visit_num = (
        get_positive_visit(codes, visits)
        if was_target_found
        else random_negative_visit(visits)
    )
    return get_history(codes, visits, visit_num)
    
def preprocess_patient_data(row):
    """Turn one patient row into ``(history_codes, history_visits, label)``.

    The label is 1 when any of the row's codes is in the module-level
    ``TARGET_DISEASE_IDS`` and 0 otherwise. Codes and visits are then reduced
    to the history that precedes the visit to predict (see
    ``preprocess_patient_records``).

    Args:
        row: mapping-like row providing 'person_id', 'code' and 'visits'.

    Raises:
        AssertionError: if codes and visits differ in length, before or after
            the history cut; the message names the offending person_id.
    """
    person_id = row['person_id']
    codes = row['code']
    visits = row['visits']
    assert len(codes) == len(visits), (
        f'person_id={person_id}: {len(codes)} codes vs {len(visits)} visit entries'
    )

    classification_binary_label = has_target_disease(TARGET_DISEASE_IDS, codes)
    codes, visits = preprocess_patient_records(codes, visits, classification_binary_label)
    assert len(codes) == len(visits), (
        f'person_id={person_id}: history has {len(codes)} codes vs {len(visits)} visit entries'
    )

    return codes, visits, 1 if classification_binary_label else 0


In [None]:
# Negative patients get a randomly chosen visit to predict, so seed the RNG to
# make the generated datasets reproducible across kernel restarts.
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)

# Replace code / visits with the history before the predicted visit and add the label.
# Not idempotent: re-running this cell would cut the already-cut histories again.
for split_df in (train_df, validation_df, test_df):
    split_df[['code', 'visits', 'label']] = split_df.apply(preprocess_patient_data, axis=1, result_type='expand')
train_df.head()

In [None]:
# Class balance of the training split (label 1 = target disease present, 0 = absent).
train_df['label'].value_counts()

In [None]:
# Negative-to-positive ratio (count of label 0 / count of label 1) in the training split.
train_df['label'].value_counts()[0] / train_df['label'].value_counts()[1]

In [None]:
# Negative-to-positive ratio (count of label 0 / count of label 1) in the validation split.
validation_df['label'].value_counts()[0] / validation_df['label'].value_counts()[1]

In [None]:
# Negative-to-positive ratio (count of label 0 / count of label 1) in the test split.
test_df['label'].value_counts()[0] / test_df['label'].value_counts()[1]

In [None]:
# Training rows labelled positive (label == 1).
train_df[train_df['label'] == 1]

In [None]:
from collections import Counter
# Same label counts as value_counts() above, computed with a plain Counter as a cross-check.
Counter(train_df['label'])

In [None]:
def filter_sep(row):
    """Drop the 'SEP' positions from a row's parallel code / visits sequences.

    A position is removed from both sequences when its entry in ``row.code``
    equals 'SEP'; the visit entry at the same position is dropped with it.

    Returns:
        ``(codes, visits_num)`` - two new lists of equal length without the
        removed positions.

    Raises:
        AssertionError: if ``row.code`` and ``row.visits`` differ in length.
    """
    codes = row.code
    visits_num = row.visits
    assert len(codes) == len(visits_num)

    # Single pass over the paired sequences; the earlier version tested list
    # membership per element, which is quadratic in the sequence length.
    kept_pairs = [(code, visit) for code, visit in zip(codes, visits_num) if code != 'SEP']
    codes = [code for code, _ in kept_pairs]
    visits_num = [visit for _, visit in kept_pairs]
    return codes, visits_num

# Strip the SEP markers from code / visits in every split.
for split_df in (train_df, validation_df, test_df):
    split_df[['code', 'visits']] = split_df.apply(filter_sep, axis=1, result_type='expand')

train_df.head()

In [None]:
# Spot-check the processed row(s) of one specific patient.
# NOTE(review): this prints data of an individual person_id - clear the output before sharing.
train_df[train_df['person_id'] == 14339711]

### To pickles

In [127]:
def write_df_to_pickle(df: pd.DataFrame, pickle_output_dir: str, df_type: str, disease_name: str):
    """Serialize a split as a list of patient records.

    Each row of ``df`` becomes ``[person_id, label, code, visits]``; the list
    of all records is pickled to
    ``{pickle_output_dir}/{disease_name}_{df_type}.pickle``.

    Args:
        df: frame with the columns 'person_id', 'label', 'code' and 'visits'.
        pickle_output_dir: target directory; created if it does not exist.
        df_type: split name used in the file name.
        disease_name: prefix used in the file name.

    Raises:
        AssertionError: if a row's 'code' and 'visits' differ in length.
    """
    import os  # local import: only needed to create the output directory

    # Iterate the four columns in parallel instead of building a Series per row.
    columns = (df['person_id'], df['label'], df['code'], df['visits'])
    patient_records = []
    for pt_id, label, seq_list, segment_list in zip(*(column.tolist() for column in columns)):
        assert len(seq_list) == len(segment_list)
        patient_records.append([pt_id, label, seq_list, segment_list])

    # A missing output directory used to surface as FileNotFoundError from open().
    if pickle_output_dir:
        os.makedirs(pickle_output_dir, exist_ok=True)

    # Write the list of patient records to a pickle file
    output_pickle_path = f'{pickle_output_dir}/{disease_name}_{df_type}.pickle'
    with open(output_pickle_path, 'wb') as file:
        pickle.dump(patient_records, file)


In [128]:
# Write one fine-tuning pickle per split.
splits_by_type = {'train': train_df, 'validation': validation_df, 'test': test_df}
for current_df_type, current_df in splits_by_type.items():
    write_df_to_pickle(current_df, MEDBERT_OUTPUT_PICKLES_DIR, current_df_type, disease_name='chronic_kidney_disease')

In [None]:
# Inspect the training frame as it was written to the pickle.
train_df

In [130]:
# Directory the pickles above were written to.
MEDBERT_OUTPUT_PICKLES_DIR

'/sise/home/benshoho/projects/Med-BERT/Fine-Tunning-Tutorials/data/mimic-iv'

## Represent as ICD-10 description after aggregation instead of the aggregated code

In [131]:
import requests

def get_icd10_description(icd_code, timeout=10):
    """Look up the short description of ``icd_code`` via the icd10api.com web service.

    The lookup is best-effort: any failure (network error, non-success HTTP
    status, unparsable response) is printed and reported as ``None``.

    Args:
        icd_code: code sent as the ``code`` query parameter.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The value of the "Description" field of the JSON response, or ``None``
        if the field is missing or the request failed.
    """
    print(f'sending http request for icd_code={icd_code}')
    base_url = "http://icd10api.com/"
    params = {
        "code": icd_code,
        "desc": "short",
        "r": "json"
    }

    try:
        # requests has no default timeout; without one an unresponsive server
        # would block the notebook indefinitely.
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for unsuccessful responses
        data = response.json()
        description = data.get("Description")
        return description
    except Exception as e:  # deliberately broad: a failed lookup must not abort the run
        print(f"Error occurred during API request: {e}")
        return None

In [132]:
import pandas as pd
import requests

# Invert the code -> id dictionary so a Med-BERT id can be mapped back to its code.
# NOTE(review): if several codes shared one id, only the last one would survive the inversion.
id_to_aggregated_code_dict = {v:k for k,v in code_to_id_dict.items()}

# CCS aggregation table; single quotes are stripped from the column names and from every cell.
icd_aggegator_df = pd.read_csv('/sise/home/benshoho/projects/feature extraction/ccs_dx_icd10cm_2018_1.csv') # ccs aggregations.
icd_aggegator_df.columns = icd_aggegator_df.columns.str.replace("'", "")
# Assumes every cell is a str (x.replace is called on each value) - TODO confirm.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in favour of DataFrame.map.
icd_aggegator_df = icd_aggegator_df.applymap(lambda x: x.replace("'", ""))

# CCS category -> CCS category description; for duplicate categories the last row wins.
icd10_ccs_mapping = icd_aggegator_df.set_index('CCS CATEGORY')['CCS CATEGORY DESCRIPTION'].to_dict() # for example:'1': 'Tuberculosis',

# Descriptions already fetched from the web API, keyed by aggregated code.
_api_description_cache = {}


def from_medbert_code_to_description(code):
    """Translate one Med-BERT id into a text description.

    The id is first mapped back to its aggregated code through
    ``id_to_aggregated_code_dict``. The description comes from
    ``icd10_ccs_mapping`` when the code is listed there, otherwise from
    ``get_icd10_description``.

    Successful API lookups are remembered, so a code that occurs many times
    costs a single HTTP request. Failed lookups (``None``) are not cached and
    are retried on the next call.

    Raises:
        KeyError: if ``code`` is not a key of ``id_to_aggregated_code_dict``.
    """
    aggregated_code = id_to_aggregated_code_dict[code]
    if aggregated_code in icd10_ccs_mapping:
        return icd10_ccs_mapping[aggregated_code]

    # in case of missing description we get the description from icd10api.
    if aggregated_code in _api_description_cache:
        return _api_description_cache[aggregated_code]
    text_description = get_icd10_description(aggregated_code)
    if text_description is not None:
        _api_description_cache[aggregated_code] = text_description
    return text_description

def from_medbert_codes_to_description(codes):
    """Translate a sequence of Med-BERT ids into a list of text descriptions (same order)."""
    descriptions = []
    for code in codes:
        descriptions.append(from_medbert_code_to_description(code))
    return descriptions

In [None]:
# Smoke-test the id -> description translation on a few hand-picked ids.
from_medbert_codes_to_description([1, 44, 33, 89, 44])

In [None]:
# Vocabulary of descriptions: every CCS category description plus the
# description of every code id that occurs in the three splits.
#
# The ids are taken from train_df / validation_df / test_df and translated here,
# so this cell no longer depends on the *_with_description_df frames that are
# only created in a later cell (which made Restart & Run All fail).
all_desc = set(icd10_ccs_mapping.values())

unique_code_ids = set()
for data in (train_df, validation_df, test_df):
    for patient_codes in data['code']:
        unique_code_ids.update(patient_codes)

# Translate each distinct id once instead of once per occurrence.
all_desc.update(from_medbert_code_to_description(code_id) for code_id in unique_code_ids)

import pickle
with open('/sise/home/benshoho/projects/Med-BERT/Pretraining Code/Data Pre-processing Code/mimic_iv_descriptions_set.types', "wb") as pickle_file:
    pickle.dump(all_desc, pickle_file)


In [None]:
# Copies of the three splits in which every code id is replaced by its text description.
train_with_description_df = train_df.copy()
validation_with_description_df = validation_df.copy()
test_with_description_df = test_df.copy()

for described_df in (train_with_description_df, validation_with_description_df, test_with_description_df):
    described_df['code'] = described_df['code'].apply(from_medbert_codes_to_description)
train_with_description_df

### Save new pickles with text descriptions instead of numbers.

In [136]:
# Write one pickle per split, this time holding text descriptions instead of code ids.
described_splits_by_type = {
    'train': train_with_description_df,
    'validation': validation_with_description_df,
    'test': test_with_description_df,
}
for current_df_type, current_df in described_splits_by_type.items():
    write_df_to_pickle(current_df, MEDBERT_OUTPUT_PICKLES_DIR, current_df_type, disease_name='chronic_kidney_disease_descriptions')