In [1]:
import numpy as np
import pandas as pd

import os
import re

from scipy.stats import fisher_exact

from stigmatizing_word_list import STIGMATIZING_WORDS_COMPLETE

In [2]:
def load_data(data_root):
    """Read the notes and admissions tables and join them per subject.

    Parameters
    ----------
    data_root : str
        Directory containing ``NOTEEVENTS.csv.gz`` and ``ADMISSIONS.csv.gz``.

    Returns
    -------
    pandas.DataFrame
        Inner join, on ``SUBJECT_ID``, of the admissions rows that have a
        ``DIAGNOSIS`` value with all note rows.
    """
    notes_path = os.path.join(data_root, "NOTEEVENTS.csv.gz")
    admissions_path = os.path.join(data_root, "ADMISSIONS.csv.gz")

    note_rows = pd.read_csv(notes_path, low_memory=False)
    admission_rows = pd.read_csv(admissions_path, low_memory=False)

    # Admissions without a diagnosis are excluded before joining.
    diagnosed = admission_rows[admission_rows.DIAGNOSIS.notna()]
    assert diagnosed.DIAGNOSIS.isna().sum() == 0

    # NOTE(review): the join key is SUBJECT_ID alone, so each note is paired
    # with every diagnosed admission of the same subject -- confirm that this
    # row multiplication is intended.
    return diagnosed.merge(note_rows, on="SUBJECT_ID", how="inner")
    

In [3]:
def find_stigmatizing_clinical_notes_v2(stigmatizing_list, notes):
    """Count how many notes contain each listed word.

    Each note is split on whitespace and a word matches only when it equals
    one of the resulting tokens exactly (case-sensitive, punctuation is not
    stripped, so entries containing spaces never match).

    Parameters
    ----------
    stigmatizing_list : iterable of str
        Words to look for. Duplicates are collapsed; first-seen order is kept.
    notes : iterable of str
        Note texts. Entries that are not strings (e.g. NaN) are skipped.

    Returns
    -------
    counts : dict
        Word -> number of notes containing that word.
    total_notes_found : int
        Number of notes containing at least one listed word (each note is
        counted once, however many words it matches).
    total_words_from_notes : int
        Token count summed over the notes counted in ``total_notes_found``.
    total_words : int
        Token count summed over every note.
    """
    counts = {word: 0 for word in stigmatizing_list}
    total_notes_found = 0
    total_words_from_notes = 0
    total_words = 0

    # Single pass over the notes: each note is tokenised once and contributes
    # to the totals once, instead of once per word in the list.
    for note in notes:
        if not isinstance(note, str):
            continue
        tokens = note.split()
        total_words += len(tokens)

        vocabulary = set(tokens)  # O(1) membership per listed word
        hits = [word for word in counts if word in vocabulary]
        if not hits:
            continue

        for word in hits:
            counts[word] += 1
        total_notes_found += 1
        total_words_from_notes += len(tokens)

    return counts, total_notes_found, total_words_from_notes, total_words

In [4]:
# Merged admissions + notes table read from the local "data/" directory.
clinical_notes = load_data(data_root="data/")

In [None]:
# Word counts and token totals over every note in the merged table.
(
    counts,
    total_notes_found,
    total_words_from_notes,
    total_words,
) = find_stigmatizing_clinical_notes_v2(
    STIGMATIZING_WORDS_COMPLETE,
    notes=clinical_notes.TEXT,
)

In [None]:
# One row per word, with a single column holding the full-sample count.
full_set = pd.DataFrame.from_dict(counts, orient="index")
full_set.columns = ["full_sample"]

# Name the index so reset_index() later yields a "stigmatizing_words" column.
full_set = full_set.rename_axis("stigmatizing_words")

In [None]:
# Display the per-word full-sample counts.
full_set

In [None]:
# Report the number of matching notes and their share of all merged rows.
# The share is scaled to a percentage so that it matches the "(%)" label.
print(
    "Notes with any stigmatizing language No. (%)",
    total_notes_found,
    f"({100 * total_notes_found / len(clinical_notes):.2f}%)",
)

In [None]:
# full_set.loc[full_set.index[-1], 'term'] = ''
# full_set.fillna("-", inplace=True)

In [None]:
# Bar chart of full-sample counts, most frequent word first.
(
    full_set
    .sort_values(by="full_sample", ascending=False)
    .reset_index()
    .plot.bar(x="stigmatizing_words", y="full_sample", figsize=(10, 5))
)

In [None]:
# Rows whose ETHNICITY label contains "WHITE". na=False treats a missing
# label as "no match" instead of producing NaN in the mask, which would make
# the boolean indexing raise.
white = clinical_notes[clinical_notes.ETHNICITY.str.contains("WHITE", na=False)]

In [None]:
# Rows whose ETHNICITY label contains "BLACK". na=False treats a missing
# label as "no match" instead of producing NaN in the mask, which would make
# the boolean indexing raise.
black = clinical_notes[clinical_notes.ETHNICITY.str.contains("BLACK", na=False)]

In [None]:
# Same counting pass, restricted to the notes in the `white` subset.
(
    w_counts,
    w_total_notes_found,
    w_total_words_from_notes,
    w_total_words,
) = find_stigmatizing_clinical_notes_v2(
    STIGMATIZING_WORDS_COMPLETE,
    notes=white.TEXT,
)

In [None]:
# Same counting pass, restricted to the notes in the `black` subset.
(
    b_counts,
    b_total_notes_found,
    b_total_words_from_notes,
    b_total_words,
) = find_stigmatizing_clinical_notes_v2(
    STIGMATIZING_WORDS_COMPLETE,
    notes=black.TEXT,
)

In [None]:
# Attach the per-group counts. Wrapping each dict in a Series aligns the
# values with full_set's word index by label, so the result does not depend
# on dict ordering or on full_set still having one row per dict key.
full_set["ethnicity_white"] = pd.Series(w_counts)
full_set["ethnicity_black"] = pd.Series(b_counts)

In [None]:
# Bar chart of counts in the `white` subset, most frequent word first.
(
    full_set
    .sort_values(by="ethnicity_white", ascending=False)
    .reset_index()
    .plot.bar(x="stigmatizing_words", y="ethnicity_white", figsize=(10, 5))
)

In [None]:
# Bar chart of counts in the `black` subset, most frequent word first.
(
    full_set
    .sort_values(by="ethnicity_black", ascending=False)
    .reset_index()
    .plot.bar(x="stigmatizing_words", y="ethnicity_black", figsize=(10, 5))
)

In [None]:
# Keep only words whose count is above 1 in both groups. The explicit copy
# makes the result an independent frame, so adding columns to it afterwards
# does not operate on a slice of the unfiltered frame.
full_set = full_set[
    (full_set["ethnicity_white"] > 1) & (full_set["ethnicity_black"] > 1)
].copy()

In [None]:
# Display the table after the count filter.
full_set

In [None]:
# Per-word rate columns: each count divided by the matching token total.
# assign() returns a new frame, so no column is written onto a filtered slice
# of another frame.
full_set = full_set.assign(
    odds_full_sample=full_set["full_sample"] / total_words_from_notes,
    odds_white=full_set["ethnicity_white"] / w_total_words_from_notes,
    odds_black=full_set["ethnicity_black"] / b_total_words_from_notes,
)

In [None]:
def calculate_odds_ratio_columns(word_count_data, white_count_total, black_count_total):
    """Compute a white-vs-black odds ratio, its log, CI half-width and p-value per word.

    Parameters
    ----------
    word_count_data : pandas.DataFrame
        Indexed by word, with ``ethnicity_white`` and ``ethnicity_black``
        count columns.
    white_count_total, black_count_total : int
        Totals placed in the second column of each 2x2 table.

    Returns
    -------
    list of list
        One ``[word, odds_ratio, log_odds, lower_ci, upper_ci, p_val]`` record
        per row. ``lower_ci`` and ``upper_ci`` are both the 95% half-width
        ``1.96 * SE`` of the log odds ratio, i.e. offsets around ``log_odds``
        rather than interval end points.
    """
    z_95 = 1.96  # two-sided 95% normal quantile
    records = []

    for word, row_info in word_count_data.iterrows():
        white_count = row_info.ethnicity_white
        black_count = row_info.ethnicity_black

        # NOTE(review): the second column holds the totals as passed in, not
        # "total minus count" -- confirm this is the intended table layout.
        table = [[white_count, white_count_total],
                 [black_count, black_count_total]]

        odds_ratio, p_val = fisher_exact(table)

        cells = np.array(table, dtype=float).ravel()
        # A zero cell gives an infinite log odds / SE; keep that result
        # without emitting divide-by-zero warnings.
        with np.errstate(divide="ignore"):
            log_odds = np.log(odds_ratio)
            # Standard error of the log odds ratio: sqrt(1/a + 1/b + 1/c + 1/d).
            se = np.sqrt(np.sum(1.0 / cells))

        # se is already a square root; the half-width is z * se.
        half_width = z_95 * se

        records.append([word, odds_ratio, log_odds, half_width, half_width, p_val])

    return records
        
    

In [None]:
# Rebuild the per-word table from the raw count dicts in one pass:
#   1. full-sample counts, indexed by word;
#   2. per-group counts, aligned to that index by word label;
#   3. keep words whose count is above 1 in both groups;
#   4. rate columns (count / matching token total).
# Every step returns a new frame, so no column is written onto a filtered
# slice of another frame.
full_set = (
    pd.DataFrame.from_dict(counts, orient="index", columns=["full_sample"])
    .rename_axis("stigmatizing_words")
    .assign(
        ethnicity_white=pd.Series(w_counts),
        ethnicity_black=pd.Series(b_counts),
    )
    .loc[lambda df: (df["ethnicity_white"] > 1) & (df["ethnicity_black"] > 1)]
    .assign(
        odds_full_sample=lambda df: df["full_sample"] / total_words_from_notes,
        odds_white=lambda df: df["ethnicity_white"] / w_total_words_from_notes,
        odds_black=lambda df: df["ethnicity_black"] / b_total_words_from_notes,
    )
)

In [None]:
# Per-word odds-ratio records, computed from the two group count columns.
odd_calculations = calculate_odds_ratio_columns(
    full_set[["ethnicity_white", "ethnicity_black"]],
    w_total_words_from_notes,
    b_total_words_from_notes,
)

# Tabulate the records; column order follows the order of each record's fields.
odds_data = pd.DataFrame(
    odd_calculations,
    columns=[
        "stigmatizing_words",
        "odds_white_black",
        "log_odds_white_black",
        "lower_ci",
        "upper_ci",
        "p-val",
    ],
)

In [None]:
# Join the odds-ratio columns onto the count table by word, restore the word
# index, and order rows by log odds ratio (ascending).
full_set = (
    full_set
    .reset_index()
    .merge(odds_data, on="stigmatizing_words")
    .set_index("stigmatizing_words")
    .sort_values(by="log_odds_white_black")
)

In [None]:
import matplotlib.pyplot as plt

# Forest plot: one point per word at its log odds ratio, with horizontal
# error bars.
plt.figure(figsize=(6, 4), dpi=150)

# xerr takes [left offsets, right offsets] around each point.
ci = [full_set.lower_ci,
      full_set.upper_ci]
plt.errorbar(x=full_set.log_odds_white_black, y=full_set.index.values, xerr=ci,
             color='black', capsize=3, linestyle='None', linewidth=1,
             marker="o", markersize=5, mfc="black", mec="black")

# The x axis is the LOG odds ratio, so "no difference" (odds ratio 1) sits at
# log(1) = 0.
plt.axvline(x=0, linewidth=0.8, linestyle='--', color='black')
plt.tick_params(axis='both', which='major', labelsize=8)
plt.xlabel('Log Odds Ratio and 95% Confidence Interval', fontsize=8)
plt.tight_layout()
# plt.savefig('raw_forest_plot.png')
plt.show()

In [None]:
# Write the odds-ratio table, rounded to 4 decimals, without the row index.
odds_data.round(decimals=4).to_csv(
    path_or_buf="stigmatizing_w_pval_6-26-2023.csv",
    index=False,
)