## This notebook loads the annotated dataset and prepares it for downstream use.

Input: an EHR note (MIMIC)

Instruction: Please identify 5~10 word tokens from the EHR note. Those 5~10 word tokens should be most important for a patient to understand their clinical conditions, procedures, and treatment plans.

Output: keywords [use either the human-annotated keywords, or just use MIMIC outputs]
[overlapping keywords between notes and discharge instructions; advantage: llama and GPT4]

## ======================================= Process annotation notes

## ========================================== Process with Rankings
### creating 5 fold CV datasets
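- I will split the annotated notes into 5 folds by file id (the original plan assumed 104 notes → 21,21,21,21,20; the run recorded below has 102 notes after filtering → 21,21,20,20,20)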
- save the dataset

In [1]:
import pandas as pd
import random
random.seed(52)

# Worksheets to read from the annotation workbook.
pages = ['victoria', 'jiaping', 'jinying-3A', 'jinying-2B-part1', 'jinying_2B_part2','jinying_3C']

# One DataFrame per worksheet, in the same order as `pages`.
# alternative input: "../data/processed/all_20160531_changed.xlsx"
datas = [
    pd.read_excel("../data/raw/all_20160531_original.xlsx", sheet_name)
    for sheet_name in pages
]

Packets may overlap with one another (the same file/phrase pair can appear more than once), so duplicates are handled during preprocessing.

In [2]:
def preprocess_data(data, topn) :
    """Clean one annotation sheet and keep its top-ranked phrases per note.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``packet``, ``fileid``, ``phrase``, ``color`` and
        ``ranking``. The frame passed in is not modified.
    topn : int
        Cut-off applied to the rankings (see below).

    Returns
    -------
    pandas.DataFrame
        The input columns minus ``packet``, sorted by ``fileid`` then
        ``ranking``, with a fresh 0..n-1 index.

    Steps
    -----
    1. Rows without a ranking are dropped and ``packet`` is removed.
    2. ``fileid`` is lower-cased, stripped, has "heart failure"/"liver failure"
       rewritten with an underscore, loses all remaining spaces and gets a
       ``.txt`` suffix when missing. ``phrase`` is lower-cased and stripped.
       ``color`` is lower-cased and "yellow"/"green" become "y"/"g".
    3. Rows with any remaining missing value are dropped.
    4. Yellow rows (symptoms): the first ``topn`` rows per file by ranking are
       kept, then repeated phrases within a file are removed, so fewer than
       ``topn`` rows may remain.
    5. Green rows (lab tests): every row with ``ranking < topn + 1`` is kept,
       then repeated phrases within a file are removed.
    6. Both parts are concatenated (symptoms first), sorted, and reduced to one
       row per (fileid, ranking) pair, keeping the first row after sorting.
    """
    rank_order = ['fileid', 'ranking']
    phrase_key = ['fileid', 'phrase']

    ranked = data.dropna(subset=['ranking']).drop(columns=['packet'])

    # Normalise the note id so it matches the "<category>.report<N>.txt" form.
    fileid = (
        ranked['fileid']
        .str.lower()
        .str.strip()
        .str.replace('heart failure', 'heart_failure')
        .str.replace('liver failure', 'liver_failure')
        .str.replace(" ", "")
        .apply(lambda name : name if name.endswith('.txt') else name + '.txt')
    )

    # process phrases
    phrase = ranked['phrase'].str.lower().str.strip()

    # process colors (whole-value replacement, not substring replacement)
    color = ranked['color'].str.lower().replace({"yellow" : "y", "green" : "g"})

    cleaned = ranked.assign(fileid=fileid, phrase=phrase, color=color).dropna()

    # symptoms : take the topn best-ranked rows per file, then drop repeated phrases
    yellow = cleaned[cleaned.color == 'y'].sort_values(by=rank_order)
    symptoms = (
        yellow
        .groupby(['fileid', 'color'])
        .head(topn)
        .drop_duplicates(subset=phrase_key)
    )

    # tests : keep everything ranked below topn + 1, then drop repeated phrases
    green = cleaned[cleaned.color == 'g'].sort_values(by=rank_order)
    labtests = (
        green[green.ranking < (topn + 1)]
        .drop_duplicates(subset=phrase_key, keep='first')
    )

    combined = (
        pd.concat([symptoms, labtests], ignore_index=True)
        .sort_values(by=rank_order)
    )

    # At most one row per (fileid, ranking) pair survives.
    return combined.drop_duplicates(subset=rank_order).reset_index(drop=True)

In [3]:
# don't erase this!
# `example_texts` is used further down to filter these file ids out of the ranking dataset.
# presumably these notes are reserved as prompt examples — TODO confirm
example_text = "cancer.report24.txt"
example_texts = ["cancer.report24.txt", "diabetes.report310966.txt", "heart_failure.report121015.txt", "liver_failure.report10101.txt"]

In [4]:
# Distinct raw `fileid` values across all loaded sheets. This runs on the unprocessed
# frames, so ids that differ only in case, spacing or the ".txt" suffix count separately.
fileids = pd.concat(datas).fileid.unique().tolist()

In [5]:
# Number of distinct raw file ids.
len(fileids)

112

In [6]:
def split_cv(ids : list, nfold : int) :
    """Randomly partition ``ids`` into ``nfold`` folds of near-equal size.

    The ids are shuffled with the module-level ``random`` generator (seed it
    beforehand for reproducible folds) and cut into consecutive slices. When
    ``len(ids)`` is not divisible by ``nfold``, the first ``len(ids) % nfold``
    folds receive one extra element.

    Parameters
    ----------
    ids : list
        Items to distribute. The list itself is left untouched; a shuffled
        copy is used internally.
    nfold : int
        Number of folds to produce; must be at least 1.

    Returns
    -------
    list of list
        ``nfold`` lists that together contain every element of ``ids``
        exactly once.

    Raises
    ------
    ValueError
        If ``nfold`` is smaller than 1.
    """
    if nfold < 1 :
        raise ValueError(f"nfold must be a positive integer, got {nfold}")

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(ids)
    random.shuffle(shuffled)

    base, leftover = divmod(len(shuffled), nfold)

    folds = []
    start = 0
    for i in range(nfold) :
        # The first `leftover` folds absorb the remainder, one element each.
        stop = start + base + (1 if i < leftover else 0)
        folds.append(shuffled[start:stop])
        start = stop

    return folds

In [7]:
ids = []  # not used by the active code in this cell
# Clean every annotation sheet, keeping phrases ranked within the top 10.
processed_datasets_top10 = [preprocess_data(sheet, 10) for sheet in datas]

# Stack the cleaned sheets, then remove the notes listed in `example_texts`.
top10_dataset = (
    pd.concat(processed_datasets_top10, ignore_index=True)
    .loc[lambda frame : ~frame.fileid.isin(example_texts)]
    .reset_index(drop=True)
)

In [8]:
# process duplicates: a (fileid, phrase) pair that occurs more than once is
# reduced to a single row holding the mean of its rankings. Only the columns
# fileid, phrase and ranking remain afterwards.
top10_dataset = (
    top10_dataset
    .groupby(['fileid', 'phrase'])['ranking']
    .mean()
    .reset_index()
)

In [9]:
# Display the ranking table (fileid, phrase, ranking).
top10_dataset

Unnamed: 0,fileid,phrase,ranking
0,cancer.report11.txt,asa,4.2
1,cancer.report11.txt,chemo,1.1
2,cancer.report11.txt,colonoscopy,7.3
3,cancer.report11.txt,dexa,7.4
4,cancer.report11.txt,diet-controlled,2.1
...,...,...,...
1303,liver_failure.report60517.txt,omeprazole,3.6
1304,liver_failure.report60517.txt,ranitidine,4.2
1305,liver_failure.report60517.txt,remission,1.1
1306,liver_failure.report60517.txt,sucralfate,4.3


In [11]:
# Load the note texts. read_pickle unpickles the file, so only use it on trusted data.
notes = pd.read_pickle("../data/processed/notes.pkl")
notes.head()

Unnamed: 0,category,noteid,text
0,liver_failure,liver_failure.report37286.txt,This is a 50-year-old male with a history of d...
1,liver_failure,liver_failure.report37775.txt,Dr. name has discussed these results with you....
2,liver_failure,liver_failure.report38874.txt,"F/u on Osteoarthritis, chronic pain, HTN, Depr..."
3,liver_failure,liver_failure.report41972.txt,Very high a1c and glucose please follow up in ...
4,liver_failure,liver_failure.report51432.txt,name is a lovely just turned 65-year-old gentl...


In [12]:
# Note ids present both in `notes` and in the ranking table.
overlap = set(notes.noteid).intersection(top10_dataset.fileid)
len(overlap)

102

In [13]:
# Keep only the rankings whose file id also appears in `notes`.
top10_dataset = top10_dataset[top10_dataset.fileid.isin(overlap)]

In [29]:
# Drop every row whose ranking is below 1 (not only rank == 0), then renumber the index.
msk = top10_dataset.ranking < 1
top10_dataset = top10_dataset[~msk].reset_index(drop=True)

In [30]:
# Split the remaining file ids into 5 folds. split_cv shuffles with the global `random`
# generator, so the folds are reproducible only if random.seed(52) from the first cell
# was run and the cells are executed in the same order.
fileids = top10_dataset.fileid.unique().tolist()
cv5 = split_cv(fileids, 5)

Now merge with notes

In [31]:
# Number of distinct notes left in the ranking table.
top10_dataset.fileid.nunique()

102

In [32]:
import pickle

# 102 notes remain in the recorded run
files = top10_dataset['fileid'].unique()
# file_top5 = top3_dataset['fileid'].unique()

# Keep only the notes that still have ranked phrases.
filtered_notes = notes[notes.noteid.isin(files)].reset_index(drop=True)

# ============== Description!! ============== #
# cv5 : 5 fold files (a list of 5 lists of file ids)
# top 10 dataset : ranking information of the 102 notes
# filtered notes : the notes linked with top 10 dataset
processed_ranking_datasets = (cv5, top10_dataset, filtered_notes)

# Persist the tuple; a later cell unpacks it in this same order.
with open("../data/processed/cv_processed_ranking_datasets_new.pkl", 'wb') as f :
    pickle.dump(processed_ranking_datasets, f)

In [33]:
import json
from utils import format_prompt

def save_to_json(list_of_dicts, name) :
    """Serialize ``list_of_dicts`` as JSON to ``../data/processed/finetune/<name>.json``.

    The path is relative to the current working directory and the target
    directory must already exist. An existing file of the same name is
    overwritten.
    """
    target = f"../data/processed/finetune/{name}.json"
    with open(target, "w") as handle :
        handle.write(json.dumps(list_of_dicts))


In [34]:
# Sanity check: no row with ranking < 1 should remain, so an empty frame is expected.
msk = top10_dataset.ranking < 1
top10_dataset[msk]

Unnamed: 0,fileid,phrase,ranking


In [26]:
# * rule for naming
# * Name should be cv0_rank3, cv0_rank5, ...
# NOTE(review): the rule above mentions per-rank names, but this cell writes one
# file per fold (cv0.json, cv1.json, ...) containing every rank/prompt combination.

for idx, cv in enumerate(cv5) :
    # All prompts generated for this fold, across every combination below.
    data = []
    # NOTE(review): `shot` is never passed to format_prompt, so both iterations
    # make identical calls and each combination ends up in `data` twice.
    # Confirm whether format_prompt should take it.
    for shot in ["zeroshot", "fewshot"] :
        for rank in [3,5,10] :
            for prompt in ["modified", "specific"] :
                data.extend(format_prompt(cv, rank, prompt))

    # Save the accumulated list. Saving the last `output` instead would keep only
    # the final (rank=10, "specific") combination and discard everything else.
    name = f"cv{idx}"
    save_to_json(data, name)

In [106]:
from datasets import Dataset
import pandas as pd
from utils import *

# Resolve the processed-data directory from the project config.
# `load_config` is not defined in this notebook; presumably it comes from
# `from utils import *` — TODO confirm.
config = load_config()
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")
path = DATA_PATH.joinpath("cv_processed_ranking_datasets_new.pkl")
# Unpack the (cv5, top10_dataset, filtered_notes) tuple. read_pickle unpickles
# the file, so it must come from a trusted source.
cv5, top10_dataset, filtered_notes = pd.read_pickle(path)

In [111]:
# Wrap the notes in a `datasets.Dataset` (despite its name, `df` is not a DataFrame here).
df = Dataset.from_pandas(filtered_notes)
# Keep the notes whose id is not in the last fold (cv5[4]); the result is displayed, not stored.
df.filter(lambda x : x['noteid'] not in cv5[4])

Filter:   0%|          | 0/102 [00:00<?, ? examples/s]

Dataset({
    features: ['category', 'noteid', 'text'],
    num_rows: 82
})

In [112]:
# Display the ranking table.
top10_dataset

Unnamed: 0,fileid,phrase,ranking
0,cancer.report11.txt,asa,4.2
1,cancer.report11.txt,chemo,1.1
2,cancer.report11.txt,colonoscopy,7.3
3,cancer.report11.txt,dexa,7.4
4,cancer.report11.txt,diet-controlled,2.1
...,...,...,...
1303,liver_failure.report60517.txt,omeprazole,3.6
1304,liver_failure.report60517.txt,ranitidine,4.2
1305,liver_failure.report60517.txt,remission,1.1
1306,liver_failure.report60517.txt,sucralfate,4.3
