# Inverse Propensity Weighting

Check if GPU is available.

In [None]:
!nvidia-smi

Sun Dec  4 00:48:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Import necessary libraries.

In [None]:
!pip install datasets
!pip install transformers
import pandas as pd
import numpy as np
import random
from scipy.stats import bernoulli
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from scipy.stats import percentileofscore

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW

from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.7.1-py3-none-any.whl (451 kB)
Collecting xxhash
  Using cached xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py38-none-any.whl (132 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 4.9 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed datasets-2.7.1 multiprocess-0.70.14 responses-0.18.0 urllib3-1.25.11 xxhash-3.1.0
Looking in indexes: https:

Preprocess the data.

In [None]:
# Race-related terms that are deleted from the (upper-cased) note text.
# The multi-word variants are listed before 'AFRICAN' because remove_keys
# applies the replacements in list order: if 'AFRICAN' were removed first,
# 'AFRICAN-AMERICAN' would no longer match and a fragment would be left behind.
race_keywords = ['AFRICAN-AMERICAN',
                'AFRICAN AMERICAN',
                'AFRICAN',
                'BLACK',
                'CREOLE',
                'CAUCASIAN',
                'WHITE']

def remove_keys(s, keywords):
    """Return ``s`` with every occurrence of each keyword deleted.

    Keywords are removed one after another, in the order given, using plain
    case-sensitive substring replacement.
    """
    cleaned = s
    for keyword in keywords:
        # str.replace is a no-op when the keyword is absent.
        cleaned = cleaned.replace(keyword, '')
    return cleaned

def clean_string(s):
    """Replace every comma, hyphen and newline in ``s`` with a single space."""
    for separator in (',', '-', '\n'):
        s = s.replace(separator, ' ')
    return s

# Build the preprocessed table: admissions joined with nursing notes and
# APS III severity scores, restricted to white/black patients, with
# race-related words stripped from the note text.

# Read data and drop nulls
df_adm = pd.read_csv('/content/drive/MyDrive/464ProjectData/ADMISSIONS.csv')
df_nte = pd.read_csv('/content/drive/MyDrive/464ProjectData/NOTEEVENTS.csv')
df_sev = pd.read_csv('/content/drive/MyDrive/464ProjectData/apsiii-score.csv')

# Notes without an admission id cannot be joined to an admission; dropping
# them also lets HADM_ID be cast to int so it matches the merge key below.
df_nte = df_nte.loc[~df_nte.HADM_ID.isnull()].reset_index(drop=True)
df_nte['HADM_ID'] = df_nte['HADM_ID'].apply(int)

df_adm = df_adm[['SUBJECT_ID','HADM_ID','ETHNICITY','DIAGNOSIS']]

# Filter only to nursing notes
df_nte = df_nte[['SUBJECT_ID','HADM_ID','CHARTDATE','CATEGORY','TEXT']]
df_nte = df_nte.loc[df_nte.CATEGORY.str.contains('Nursing')]
df_nte = df_nte.drop_duplicates().reset_index(drop=True)
        
df_sev = df_sev[['subject_id','hadm_id','apsiii']].drop_duplicates()

# Join notes and severity scores onto admissions. The severity file uses
# lower-case key names, hence the explicit left_on / right_on.
df = df_adm.merge(df_nte, on=['SUBJECT_ID','HADM_ID'])
df = df.merge(df_sev, left_on=['SUBJECT_ID','HADM_ID'], right_on=['subject_id','hadm_id'])

# Filter to only white/black patients
df = df.loc[df.ETHNICITY.str.contains('WHITE')|df.ETHNICITY.str.contains('BLACK')]
# Collapse the ethnicity variants to two labels. A value that contains 'WHITE'
# is labelled 'WHITE'; every other remaining value is labelled 'BLACK'.
df['ETHNICITY'] = df['ETHNICITY'].apply(lambda x: 'WHITE' if 'WHITE' in x else 'BLACK')

# Removing race-related keywords from text
# The text is upper-cased first because race_keywords are all upper-case and
# the removal is a case-sensitive substring replacement.
df['TEXT'] = df['TEXT'].apply(lambda x: x.upper())
df['TEXT'] = df['TEXT'].apply(lambda x: remove_keys(x, race_keywords))

# Clean punctuations
df['TEXT'] = df['TEXT'].apply(clean_string)

# The lower-case key columns duplicate SUBJECT_ID / HADM_ID after the merge.
df = df.drop(['subject_id','hadm_id'], axis=1)
# Drop every row that still has a missing value in any column.
df = df.dropna(how='any', axis=0)

# Persist the result; a later cell reads this file back in.
df.to_csv('/content/drive/MyDrive/464ProjectData/preprocessed_IPW.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


Set global variables.

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator the notebook draws from. Seeding only
# `random` is not enough: scipy's bernoulli.rvs draws from NumPy's global
# state, and PyTorch has its own generator (weight init, dropout, shuffling).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Global configurations.
BATCH_SIZE = 16    # samples per DataLoader batch
EPOCHS     = 10    # passes over the training set
TAU        = 0.1   # passed as `tau` to sample_disparity (treatment-probability penalty)
DIAG_PROP  = 0.8   # passed as `diag_prop` to sample_disparity (base treatment probability)

In [None]:
# Samples the treatment disparity between black and white patients.
def sample_disparity(actual_treatment, race, tau, diag_prop, random_state=None):
    """Simulate which patients are given the treatment.

    A patient who needs treatment (``actual_treatment == 1``) is given it with
    probability ``diag_prop``; that probability is reduced by ``tau`` when the
    patient's race label is anything other than 'WHITE'. A patient who does
    not need treatment gets probability 0.

    Args:
        actual_treatment: pandas Series of 0/1 flags (1 = needs treatment).
        race: pandas Series of race labels aligned with ``actual_treatment``.
        tau: probability penalty applied to non-'WHITE' patients.
        diag_prop: base probability of treating a patient who needs it.
        random_state: optional seed or generator forwarded to
            ``bernoulli.rvs``. The default ``None`` keeps the original
            behaviour of drawing from NumPy's global random state.

    Returns:
        Array of 0/1 draws, one per patient.

    Note:
        The probabilities are passed to scipy unchanged, so a combination such
        as ``tau > diag_prop`` yields a negative probability, which scipy is
        expected to reject.
    """
    # 0 for 'WHITE', 1 for every other label.
    penalised = (race != 'WHITE').astype(int)
    bias = penalised * actual_treatment * tau
    prob = actual_treatment * diag_prop - bias
    return bernoulli.rvs(prob, random_state=random_state)

Define the classifier.

In [None]:
# Feed-forward classification head.
class Classifier(nn.Module):
    """Three-stage fully connected network ending in a sigmoid.

    The output corresponds to treatment prescribed (1) or treatment not
    prescribed (0). Stages: ``input_dim -> 128 -> 32 -> output_dim`` with
    LeakyReLU(0.2) activations, dropout (p=0.3) after the second stage and a
    sigmoid on the final stage.
    """

    def __init__(self, input_dim, output_dim=1):
        super().__init__()

        # Stages are built in input-to-output order.
        self.hidden_layer1 = nn.Sequential(nn.Linear(input_dim, 128),
                                           nn.LeakyReLU(0.2))
        self.hidden_layer2 = nn.Sequential(nn.Linear(128, 32),
                                           nn.LeakyReLU(0.2),
                                           nn.Dropout(0.3))
        self.hidden_layer3 = nn.Sequential(nn.Linear(32, output_dim),
                                           nn.Sigmoid())

    def forward(self, x, labels=None):
        """Run ``x`` through the three stages; ``labels`` is accepted but unused."""
        for stage in (self.hidden_layer1, self.hidden_layer2, self.hidden_layer3):
            x = stage(x)
        return x

Read in the data.

In [None]:
# Load the preprocessed table and keep one note per admission: rows are sorted
# by CHARTDATE within each (SUBJECT_ID, HADM_ID) group and only the first row
# of each group is kept.
df = pd.read_csv('/content/drive/MyDrive/464ProjectData/preprocessed_IPW.csv', lineterminator='\n')
df = df.sort_values(by=['SUBJECT_ID','HADM_ID','CHARTDATE'])\
                .groupby(['SUBJECT_ID','HADM_ID'])\
                .head(1)
                
# Balance the two races by randomly keeping as many WHITE rows as there are
# BLACK rows. random.sample draws without replacement, so this assumes there
# are at least as many WHITE rows as BLACK rows.
idx_white = random.sample(list(df.loc[df.ETHNICITY=='WHITE'].index), 
              df.loc[df.ETHNICITY=='BLACK'].shape[0])
df = pd.concat([df.loc[idx_white], 
                df.loc[df.ETHNICITY=='BLACK']]).reset_index(drop=True)

df = df[['SUBJECT_ID','ETHNICITY','DIAGNOSIS','TEXT','apsiii']].reset_index(drop=True)
# Percentile rank of each APS III score within this cohort, rescaled from
# 0-100 to 0-1 so it can be used as a probability below.
df['apsiii_norm'] = list(map(lambda x: percentileofscore(df['apsiii'], x, 'mean')/100, 
                            df['apsiii']))
# Simulated ground truth: each patient needs treatment with probability
# apsiii_norm.
df['actual_treatment'] = bernoulli.rvs(df['apsiii_norm'])
# Simulated observed label: whether treatment was given, drawn by
# sample_disparity using TAU and DIAG_PROP.
df['given_treatment'] = sample_disparity(df['actual_treatment'], df['ETHNICITY'], TAU, DIAG_PROP)

In [None]:
# Hold out 20% of the cohort for testing; the fixed random_state keeps the
# split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
# Oversample each split separately with given_treatment as the class label.
# NOTE(review): the test split is resampled as well, so test losses are
# computed on a resampled set rather than the raw hold-out rows — confirm
# this is intended.
df_train, _ = RandomOverSampler(random_state=42).fit_resample(df_train, df_train['given_treatment'])
df_test, _ =  RandomOverSampler(random_state=42).fit_resample(df_test,  df_test['given_treatment'])

# Wrap the frames as Hugging Face datasets so a torch DataLoader can batch them.
dataset_train = Dataset.from_pandas(df_train, split='train')
dataset_test  = Dataset.from_pandas(df_test,  split='test')

# Both loaders shuffle their rows, including the test loader.
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dataloader_test  = DataLoader(dataset_test,  batch_size=BATCH_SIZE, shuffle=True)

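Compute the treatment propensity: for each race and actual-treatment group, the proportion of patients who were given the treatment.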

In [None]:
# For every (race, actual treatment) group: the number of patients and how
# many of them were given the treatment.
df_stats = (
    df.groupby(['ETHNICITY', 'actual_treatment'])
      .agg({'SUBJECT_ID': 'count', 'given_treatment': 'sum'})
)
# Share of each group that was given the treatment.
df_stats['prop'] = df_stats['given_treatment'].div(df_stats['SUBJECT_ID'])
print(f"Actual Treatment: {np.mean(df['actual_treatment'])}, Given Treatment: {np.mean(df['given_treatment'])}")
df_stats['prop']

Actual Treatment: 0.5055261893320518, Given Treatment: 0.38322921672272947


ETHNICITY  actual_treatment
BLACK      0                   0.000000
           1                   0.719169
WHITE      0                   0.000000
           1                   0.799315
Name: prop, dtype: float64

In [None]:
# Propensity of being given the treatment among patients who need it
# (actual_treatment == 1), per race. The values are looked up by label rather
# than by row position, so they stay correct if the group order changes or a
# group is missing. Despite the `_ipw` suffix these hold the propensity itself;
# the training loop takes the reciprocal to obtain the weight.
black_patients_ipw = df_stats['prop'].loc[('BLACK', 1)]
white_patients_ipw = df_stats['prop'].loc[('WHITE', 1)]

In [None]:
def encode(examples, tokenizer):
    """Tokenize a batch of notes and build one-hot treatment labels.

    Args:
        examples: batch mapping with a 'TEXT' entry (the notes) and a
            'given_treatment' entry (integer class tensor).
        tokenizer: callable invoked on the notes with ``return_tensors='pt'``,
            ``max_length=512``, ``truncation=True`` and ``padding=True``.

    Returns:
        ``(model_inputs, labels)``: a dict holding only 'input_ids' and
        'attention_mask' from the tokenizer output, and the two-class one-hot
        labels converted to float.
    """
    one_hot_labels = nn.functional.one_hot(examples['given_treatment'],
                                           num_classes=2)
    encoding = tokenizer(examples['TEXT'],
                         return_tensors='pt',
                         max_length=512,
                         truncation=True,
                         padding=True)
    # Keep only the two entries the encoder is called with.
    model_inputs = {field: encoding[field]
                    for field in ('input_ids', 'attention_mask')}
    return model_inputs, one_hot_labels.float()

In [None]:
# Use HuggingFace's BERT as the model's encoder.
# https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel
# The encoder only produces embeddings: its parameters are not handed to the
# optimizer below, so only the classifier head is trained.
model_enc = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

# NOTE(review): output_scores / output_hidden_states look like model options
# rather than tokenizer options — confirm they have any effect here.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                            output_scores=True,
                                            output_hidden_states=True,
                                            model_max_length=512)
# Define the models and optimizer
# input_dim=512 matches the x[:, :512] slice applied to the pooled encoder
# output before it reaches the classifier; output_dim=2 matches the two-class
# one-hot labels produced by encode().
classifier = Classifier(input_dim=512, output_dim=2).to(device)
classifier_optimizer = AdamW(classifier.parameters(), 
                          lr=5e-3)

# Define loss
# Binary cross-entropy, applied to the classifier's sigmoid outputs.
criterion = nn.BCELoss()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def train_classifier(batch, labels, weights):
    """Run one inverse-propensity-weighted optimisation step on the classifier.

    Each sample's binary cross-entropy is multiplied by that sample's own
    weight before averaging. Scaling the batch loss by the mean weight instead
    would give every sample in the batch the same effective weight. When all
    weights in a batch are equal, both formulations give the same loss.

    Args:
        batch: keyword arguments for the encoder (input ids and attention
            mask), already on the target device.
        labels: float one-hot targets with the same shape as the classifier
            output.
        weights: one inverse-propensity weight per sample in the batch.

    Returns:
        The weighted mean loss as a tensor.
    """
    classifier.train()
    classifier_optimizer.zero_grad()

    # The encoder is not trained, so run it without building an autograd graph.
    with torch.no_grad():
        x = model_enc(**batch)                                   # Get embeddings from encoder
        # Pool by summing over axis 1 of last_hidden_state
        # (presumably the token axis — TODO confirm).
        x = x.last_hidden_state.sum(axis=1)
        x = x[:, :512]                                           # Keep the 512 features the classifier expects

    pred = classifier(x)                                         # Make prediction

    # Per-sample loss: element-wise BCE averaged over the output units. This is
    # the same loss nn.BCELoss computes, with the batch reduction deferred.
    per_sample = nn.functional.binary_cross_entropy(pred, labels, reduction='none').mean(dim=1)
    sample_weights = torch.as_tensor(weights, dtype=per_sample.dtype, device=per_sample.device)
    loss = (per_sample * sample_weights).mean()

    loss.backward()
    classifier_optimizer.step()
    return loss

def eval_classifier(batch, labels):
    """Return the unweighted loss of the classifier on one batch.

    The classifier is switched to eval mode and nothing is tracked by
    autograd. ``batch`` holds the encoder's keyword arguments and ``labels``
    the targets passed to ``criterion``.
    """
    classifier.eval()
    with torch.no_grad():
        encoder_out = model_enc(**batch)
        # Sum last_hidden_state over axis 1, then keep the first 512 features.
        pooled = encoder_out.last_hidden_state.sum(axis=1).detach()
        features = pooled[:, :512]
        return criterion(classifier(features), labels)

In [None]:
def train(ep):
    """Run one training epoch followed by one pass over the test loader.

    Args:
        ep: epoch number, used only in the printed progress line.

    Returns:
        ``(train_loss, test_loss)``: the per-batch average losses.
    """
    def _prepare(raw_batch):
        # Drop the race column, tokenize, and move tensors to the device.
        raw_batch.pop('ETHNICITY', None)
        inputs, targets = encode(raw_batch, tokenizer)
        inputs['input_ids']      = inputs['input_ids'].to(device)
        inputs['attention_mask'] = inputs['attention_mask'].to(device)
        return inputs, targets.to(device)

    train_total = 0
    for raw_batch in dataloader_train:
        # Inverse-propensity weight per sample; must be read before the race
        # column is removed by _prepare.
        weights = [float(1/white_patients_ipw) if race == "WHITE" else 1/black_patients_ipw
                   for race in raw_batch["ETHNICITY"]]
        inputs, targets = _prepare(raw_batch)
        step_loss = train_classifier(inputs, targets, weights)
        train_total += float(step_loss.detach().cpu())

    test_total = 0
    for raw_batch in dataloader_test:
        inputs, targets = _prepare(raw_batch)
        batch_loss = eval_classifier(inputs, targets)
        test_total += float(batch_loss.detach().cpu())

    train_loss = train_total / len(dataloader_train)
    test_loss  = test_total / len(dataloader_test)
    print(f"Epoch {ep}: {train_loss}, {test_loss}")
    return train_loss, test_loss

Train the classifier.

In [None]:
# Train for EPOCHS epochs and save the classifier weights every time the test
# loss improves on the best value seen so far.
best_test = np.inf
for ep in range(EPOCHS):
    train_loss, test_loss = train(ep)
    if not (test_loss < best_test):
        continue
    best_test = test_loss
    torch.save(classifier.state_dict(), f"/content/drive/MyDrive/464ProjectData/IPW-classifier-{TAU}-{DIAG_PROP}-final")

Epoch 0: 65.96137668385225, 50.0
Epoch 1: 66.0528232350069, 50.0
Epoch 2: 66.04684315101773, 50.0
Epoch 3: 66.04684315849752, 50.0
Epoch 4: 66.0528232050877, 50.0
Epoch 5: 66.05282298817355, 50.0
Epoch 6: 66.04684299394197, 50.0
Epoch 7: 66.04684315101773, 50.0
Epoch 8: 66.06080955804563, 50.0
Epoch 9: 66.04086259580126, 50.0


Evaluate the classifier.

In [None]:
# Spellings of "pneumonia" (including misspellings) searched for in DIAGNOSIS.
PNEUMONIA_KEYS = [
    'PNEUMONIA', 'PMEUMONIA', 'PNEUMOMIA',
    'PNEUMONI', 'PNAUMONIA', 'PNEMONIA',
    'PNEUMNOIA', 'PNEUMONIN', 'PNEUMONNIA',
]
# Alternative keyword set; swap it in below to evaluate on fever cases instead.
FEVER_KEYS = ['FEVER', 'FEER']

print(df.shape)
# Keep the rows whose diagnosis contains at least one pneumonia spelling.
df_filter = df.loc[
    df['DIAGNOSIS'].apply(lambda diagnosis: any(key in diagnosis for key in PNEUMONIA_KEYS))
]
print(df_filter.shape)

dataset_filter = Dataset.from_pandas(df_filter, split='filter')
dataloader_filter = DataLoader(dataset_filter, batch_size=BATCH_SIZE, shuffle=True)

(8324, 8)
(326, 8)


In [None]:
# Score every patient in the filtered cohort and compare the predictions with
# the simulated ground truth (actual_treatment), separately per race.

# Switch off dropout explicitly rather than relying on whichever mode the
# previous cells happened to leave the classifier in.
classifier.eval()

race_lst, actual_lst, pred_lst, apsiii_lst = [], [], [], []
for idx, batch in enumerate(dataloader_filter):
    # Obtain actual treatment and race features
    actual_treatment        = nn.functional.one_hot(batch['actual_treatment'], 
                                                    num_classes=2).float()
    race                    = torch.tensor(list(map(lambda x: 1*(x=='BLACK'), 
                                                    batch['ETHNICITY'])))    # 1 = BLACK, 0 = otherwise
    apsiii = batch['apsiii']
    
    # Tokenize. The given_treatment labels returned by encode() are not used
    # here because evaluation is against actual_treatment.
    batch, labels           = encode(batch, tokenizer)
    batch['input_ids']      = batch['input_ids'].to(device)      # Send to GPU
    batch['attention_mask'] = batch['attention_mask'].to(device)
    
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        x = model_enc(**batch)                                   # Get embeddings from encoder
        x = x.last_hidden_state.sum(axis=1)                      # Sum last_hidden_state over axis 1
        x = x[:, :512]                                           # Keep the 512 features the classifier expects
        pred = classifier(x).cpu()                               # Make prediction
    
    race_lst.append(race)
    actual_lst.append(actual_treatment)
    pred_lst.append(pred)
    apsiii_lst.append(apsiii)

race_lst   = torch.concat(race_lst)
actual_lst = torch.concat(actual_lst)
pred_lst   = torch.concat(pred_lst) 
apsiii_lst = torch.concat(apsiii_lst)

print(race_lst.shape, actual_lst.shape, pred_lst.shape, apsiii_lst.shape)

# Row positions of each race group (0 = not BLACK, 1 = BLACK).
idx_0 = torch.where(race_lst==0)[0]
idx_1 = torch.where(race_lst==1)[0]

print(idx_0, idx_1)

# Accuracy compares the arg-max class of the prediction with the true class;
# ROC-AUC is computed from the one-hot truth and the two output scores.
acc_1 = float(torch.mean(1.0*(actual_lst[idx_1].argmax(dim=1)==pred_lst[idx_1].argmax(dim=1))))
acc_0 = float(torch.mean(1.0*(actual_lst[idx_0].argmax(dim=1)==pred_lst[idx_0].argmax(dim=1))))
roc_0 = roc_auc_score(y_true  = actual_lst[idx_0].numpy(),
              y_score = pred_lst[idx_0].numpy())
roc_1 = roc_auc_score(y_true  = actual_lst[idx_1].numpy(),
              y_score = pred_lst[idx_1].numpy())
print(f"White: {acc_0} (Acc) {roc_0} ROC-AUC, Black: {acc_1} (Acc) {roc_1} ROC-AUC")

torch.Size([326]) torch.Size([326, 2]) torch.Size([326, 2]) torch.Size([326])
tensor([  4,   8,   9,  13,  16,  17,  18,  20,  21,  22,  26,  27,  29,  30,
         34,  37,  40,  43,  48,  50,  51,  54,  55,  57,  59,  63,  68,  69,
         70,  74,  76,  77,  78,  81,  82,  83,  89,  91,  93,  95,  97,  98,
        101, 102, 103, 107, 110, 112, 113, 115, 116, 117, 118, 119, 121, 123,
        125, 126, 127, 130, 132, 133, 135, 136, 139, 142, 145, 147, 151, 154,
        155, 159, 160, 164, 165, 167, 168, 169, 174, 176, 178, 180, 181, 183,
        187, 188, 193, 195, 196, 197, 199, 200, 201, 205, 206, 207, 208, 212,
        218, 220, 223, 224, 226, 227, 228, 229, 230, 231, 233, 236, 237, 238,
        239, 240, 241, 242, 244, 245, 246, 248, 250, 251, 252, 255, 256, 257,
        258, 259, 260, 263, 264, 268, 270, 271, 273, 275, 276, 284, 286, 288,
        291, 293, 294, 298, 299, 300, 301, 303, 304, 306, 308, 309, 310, 311,
        317, 318, 319, 320, 322]) tensor([  0,   1,   2,   3,   

In [None]:
# Per-patient table: race (1 = BLACK, 0 = otherwise), true class, the two
# predicted scores and the APS III severity score.
df_analysis = pd.DataFrame({'race':race_lst.numpy(),
              'actual':np.argmax(actual_lst.numpy(), axis=1),
              'pred_not_prescribe':pred_lst[:,0].numpy(),
              'pred_prescribe':pred_lst[:,1].numpy(),
              'apsiii':apsiii_lst.numpy()})
# The model "prescribes" when the prescribe score exceeds the other score.
df_analysis['prescribe'] = 1*(df_analysis['pred_prescribe'] > df_analysis['pred_not_prescribe'])
df_analysis['correct'] = 1*(df_analysis['prescribe']==df_analysis['actual'])

# Report accuracy per severity bin: six 20-point bins covering [0, 120) and a
# final open-ended bin for scores >= 120. Iterating one step further would
# create a second open-ended bin (>= 140) that overlaps the previous one.
APSIII_BIN_WIDTH = 20
APSIII_NUM_BINS  = 7
for i in range(APSIII_NUM_BINS):
    lower = APSIII_BIN_WIDTH*i
    in_bin = df_analysis.apsiii >= lower
    if i < APSIII_NUM_BINS - 1:
        in_bin = in_bin & (df_analysis.apsiii < lower + APSIII_BIN_WIDTH)

    # reindex guarantees a row for both races even when a bin contains no
    # patients of one race; that race then shows nan rates and 0 patients
    # instead of raising a KeyError.
    temp = df_analysis.loc[in_bin]\
                .groupby('race')\
                .aggregate({'correct':'mean', 'prescribe':'mean', 'apsiii':'count'})\
                .reindex([0, 1])
    n_patients = temp['apsiii'].fillna(0).astype(int)
    print(f"Accuracy (Black): {round(100*temp['correct'].loc[1],3)} ({round(100*temp['prescribe'].loc[1],3)}% prescribed, {n_patients.loc[1]} patients), \t Accuracy (White): {round(100*temp['correct'].loc[0],3)} ({round(100*temp['prescribe'].loc[0],3)}% prescribed, {n_patients.loc[0]} patients)")

Accuracy (Black): 66.667 (0.0% prescribed, 3 patients), 	 Accuracy (White): 100.0 (0.0% prescribed, 2 patients)
Accuracy (Black): 62.963 (0.0% prescribed, 54 patients), 	 Accuracy (White): 47.368 (0.0% prescribed, 38 patients)
Accuracy (Black): 30.0 (0.0% prescribed, 70 patients), 	 Accuracy (White): 28.571 (0.0% prescribed, 70 patients)
Accuracy (Black): 4.348 (0.0% prescribed, 23 patients), 	 Accuracy (White): 9.677 (0.0% prescribed, 31 patients)
Accuracy (Black): 0.0 (0.0% prescribed, 13 patients), 	 Accuracy (White): 0.0 (0.0% prescribed, 12 patients)
Accuracy (Black): 0.0 (0.0% prescribed, 1 patients), 	 Accuracy (White): 0.0 (0.0% prescribed, 5 patients)
Accuracy (Black): 0.0 (0.0% prescribed, 3 patients), 	 Accuracy (White): 0.0 (0.0% prescribed, 1 patients)


KeyError: ignored

In [None]:
# Display the per-patient analysis table (race, true class, predicted scores,
# severity, and the derived prescribe / correct flags).
df_analysis
