In [None]:
import numpy as np
import pandas as pd

import os
import re

from scipy.stats import fisher_exact

from stigmatizing_word_list import STIGMATIZING_WORDS_COMPLETE

In [None]:
def load_data(data_root):
    """Load the notes and admissions tables and join them per subject.

    Reads ``NOTEEVENTS.csv.gz`` and ``ADMISSIONS.csv.gz`` from ``data_root``,
    discards admissions whose ``DIAGNOSIS`` is missing, and inner-joins the
    remaining admissions with the notes on ``SUBJECT_ID``.

    Parameters
    ----------
    data_root : str
        Directory that contains the two gzipped CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per (admission, note) pair that shares a ``SUBJECT_ID``.
        Columns present in both tables receive pandas' default ``_x``/``_y``
        suffixes (admissions first, notes second).
    """
    def _read_table(file_name):
        # Both tables are read with identical options.
        return pd.read_csv(os.path.join(data_root, file_name), low_memory=False)

    notes = _read_table("NOTEEVENTS.csv.gz")
    admissions_with_dx = _read_table("ADMISSIONS.csv.gz").dropna(subset=["DIAGNOSIS"])

    # Sanity check: no admission without a diagnosis survives the filter.
    assert admissions_with_dx["DIAGNOSIS"].notna().all()

    # NOTE(review): the join key is SUBJECT_ID only, so every note is paired
    # with every remaining admission of the same subject. Confirm this is
    # intended rather than a per-admission join.
    return pd.merge(admissions_with_dx, notes, on="SUBJECT_ID", how="inner")
    

In [None]:
# Build the joined admissions/notes table from the local data directory.
clinical_notes = load_data(data_root="data/")

In [None]:
# Table of stigmatizing terms; its `stigmatizing_words` column is what the
# search pattern is built from.
STIG_WORD_LIST = pd.read_csv(filepath_or_buffer="data/stigmatizing_w_pval.csv")

In [None]:
# One alternation regex: every term wrapped in word boundaries, joined by "|".
# NOTE(review): terms are inserted unescaped, so any regex metacharacters in
# the word list are interpreted as pattern syntax — confirm that is intended.
stig_list = "|".join(rf"\b{word}\b" for word in STIG_WORD_LIST.stigmatizing_words)

In [None]:
# Boolean mask: True for every row whose note TEXT contains at least one
# stigmatizing term.
# - Only TEXT is scanned, because that is the column the matched terms are
#   extracted from; identifier/demographic columns are not note content.
# - The search is a single vectorized column operation instead of a row-wise
#   `apply`, which called the regex engine once per row from Python.
# - `na=False` makes rows with missing text count as "no match", so the mask
#   is strictly boolean and safe to use with `.loc`.
clinical_notes_stig = clinical_notes["TEXT"].str.contains(stig_list, regex=True, na=False)

In [None]:
# Number of rows flagged by the mask (True counts as 1).
clinical_notes_stig.sum()

In [None]:
# Keep only the flagged rows. The explicit copy makes `filtered` an
# independent frame, so adding columns to it does not raise
# SettingWithCopyWarning or depend on pandas' view/copy heuristics.
filtered = clinical_notes.loc[clinical_notes_stig].copy()

In [None]:
# Display the flagged rows for a quick visual check.
filtered

In [None]:
# For each flagged note, collect the distinct stigmatizing terms it contains
# and join them with "|" (one string per note, in row order).
# The terms are sorted so the result is identical on every run; iterating a
# bare set of strings can yield a different order in each interpreter session.
stig_pattern = re.compile(stig_list)  # compile once, reuse for every note
stig_words_per_note = [
    "|".join(sorted(set(stig_pattern.findall(note))))
    for note in filtered.TEXT
]
    

In [None]:
# Attach the per-note term strings as a new STIG_WORD column. `assign` returns
# a new frame instead of writing into a frame that was derived by slicing,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
filtered = filtered.assign(STIG_WORD=stig_words_per_note)

In [None]:
# Show the first rows, including the new STIG_WORD column.
filtered.head()

In [None]:
# Replace the index inherited from `clinical_notes` with a fresh 0..n-1 range.
# Rebinding the name (rather than `inplace=True`) avoids mutating a frame
# that was derived by slicing.
filtered = filtered.reset_index(drop=True)

In [None]:
# Persist the annotated notes without the index column; pandas infers gzip
# compression from the ".gz" suffix.
filtered.to_csv(path_or_buf="data/clinical_filtered.csv.gz", index=False)