In [26]:
import os
# Make the project directory the working directory so that relative paths used
# later in the notebook (e.g. the "models-trained/..." checkpoints) resolve
# against it; the print is a sanity check of where we ended up.
os.chdir('/work/fairness-privacy')
print('working dir:', os.getcwd())

import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from torch.utils.data import DataLoader
import torch
from tqdm.notebook import tqdm

working dir: /work/fairness-privacy


### Load tokenizer and tokenize test data

In [2]:
# Identifier of the base checkpoint; only its tokenizer is loaded in this cell.
# NOTE(review): presumably the fine-tuned models loaded later were trained from
# this same checkpoint, so the vocabularies match -- confirm.
base_model = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Fixed sequence length (in tokens) every example is truncated/padded to.
MAXLEN = 128


def tokenize(batch, tokenizer, maxlen=MAXLEN):
    """Tokenize the 'text' field of a batch to a fixed length.

    Args:
        batch: Mapping with a 'text' entry holding the raw inputs.
        tokenizer: Callable applied to ``batch['text']``; it is invoked with
            ``truncation=True``, ``padding="max_length"`` and ``max_length``.
        maxlen: Length passed to the tokenizer as ``max_length``.

    Returns:
        A plain ``dict`` copy of whatever mapping the tokenizer returned.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        padding="max_length",
        max_length=maxlen,
    )
    # Hand back an ordinary dict rather than the tokenizer's own container type.
    return dict(encoded)



In [10]:
# Load the saved dataset splits and tokenize the 'test' split, exposing it as torch tensors.
sentiment_data = datasets.load_from_disk("/work/fairness-privacy/twitteraae-sentiment-data-split/")
test_sentiment_data_tok = (
    sentiment_data['test']
    .map(tokenize, num_proc=3, batched=True, fn_kwargs={"tokenizer": tokenizer})
    .with_format("torch")
)

# One subset per dialect group (AAE vs. SAE), so each can be scored on its own.
test_aae = test_sentiment_data_tok.filter(lambda pt: pt["dialect"] == "AAE")
test_sae = test_sentiment_data_tok.filter(lambda pt: pt["dialect"] == "SAE")

# Batched iteration over each subset, in dataset order (no shuffling requested).
aae_dataloader = DataLoader(dataset=test_aae, batch_size=64)
sae_dataloader = DataLoader(dataset=test_sae, batch_size=64)

Loading cached processed dataset at /work/fairness-privacy/twitteraae-sentiment-data-split/test/cache-6ab371770c5f6a4f_*_of_00003.arrow


Filter:   0%|          | 0/203475 [00:00<?, ? examples/s]

Filter:   0%|          | 0/203475 [00:00<?, ? examples/s]

### Load models

In [5]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tuned checkpoints, loaded from paths relative to the working directory
# and moved onto the selected device.
nonpriv_model = AutoModelForSequenceClassification.from_pretrained(
    "models-trained/roberta-no-priv-epochs_3-bs_128"
).to(device)
priv_model = AutoModelForSequenceClassification.from_pretrained(
    "models-trained/roberta-priv-eps_8_epochs_3-bs_128"
).to(device)

### Compute performance

In [25]:
from tabulate import tabulate
import evaluate
import matplotlib.pyplot as plt

In [18]:
def evaluate_model(model, dataloader):
    """Compute accuracy and F1 of a classification model over a dataloader.

    The model is switched to eval mode (and left in it). Inputs are moved to
    the device the model's parameters live on, so the function does not depend
    on a notebook-level ``device`` variable having been defined.

    Args:
        model: A ``torch.nn.Module`` with at least one parameter, callable
            with ``input_ids`` and ``attention_mask`` keyword arguments and
            returning an object with a ``.logits`` attribute whose dim 1
            indexes the classes.
        dataloader: Iterable of batches; each batch is a mapping providing
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``, as reported by the
        ``evaluate`` "accuracy" and "f1" metrics with their default settings.
    """
    model.eval()  # switch to eval mode
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")

    # Use the model's own device instead of a global, so a model placed on a
    # different device than the notebook default is still evaluated correctly.
    model_device = next(model.parameters()).device

    for batch in tqdm(dataloader):
        batch_topass = {
            'input_ids': batch['input_ids'].to(model_device),
            'attention_mask': batch['attention_mask'].to(model_device),
        }
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**batch_topass)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each example.
        predictions = torch.argmax(logits, dim=1)

        accuracy.add_batch(predictions=predictions, references=batch['label'])
        f1.add_batch(predictions=predictions, references=batch['label'])

    return {'accuracy': accuracy.compute()['accuracy'], 'f1': f1.compute()['f1']}

In [21]:
# Each model is scored on the AAE subset first, then on the SAE subset.
print('--- Non-private model ---')
print('Computing model performance...')
nonpriv_aae_perf, nonpriv_sae_perf = [
    evaluate_model(nonpriv_model, loader)
    for loader in (aae_dataloader, sae_dataloader)
]

print('--- Private model ---')
print('Computing model performance...')
priv_aae_perf, priv_sae_perf = [
    evaluate_model(priv_model, loader)
    for loader in (aae_dataloader, sae_dataloader)
]

--- Non-private model ---
Computing model performance...


  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/3015 [00:00<?, ?it/s]

--- Private model ---
Computing model performance...


  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/3015 [00:00<?, ?it/s]

In [40]:
# Summary table: header row first, then one row per model.
# Metric values are kept as raw floats and formatted by tabulate itself:
# pre-formatting them with "{:.3}" (3 significant digits) and letting tabulate
# re-parse the strings with its default "g" float format dropped trailing
# zeros, so columns mixed precisions (e.g. 0.61 next to 0.858).
table = [
    ["Model", "Acc, SAE", "Acc, AAE", "F1, SAE", "F1, AAE"],
    [
        "Non-priv.",
        nonpriv_sae_perf['accuracy'],
        nonpriv_aae_perf['accuracy'],
        nonpriv_sae_perf['f1'],
        nonpriv_aae_perf['f1'],
    ],
    [
        "Priv.",
        priv_sae_perf['accuracy'],
        priv_aae_perf['accuracy'],
        priv_sae_perf['f1'],
        priv_aae_perf['f1'],
    ],
]

# floatfmt=".3f" gives every metric exactly three decimal places.
print(tabulate(table, headers="firstrow", floatfmt=".3f"))

Model        Acc, SAE    Acc, AAE    F1, SAE    F1, AAE
---------  ----------  ----------  ---------  ---------
Non-priv.       0.858       0.735      0.898      0.802
Priv.           0.691       0.61       0.817      0.758
