In [71]:
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, sampler

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_scheduler
from datasets import Dataset

from collections import Counter

### Build HuggingFace Dataset from folder

I should build an HF `Dataset` object and then use it with a PyTorch `DataLoader`, as described in: https://huggingface.co/docs/datasets/en/use_with_pytorch

In [48]:
import os

# Integer class id for each sentiment name used in the data file names.
label_map = {'happy': 1, 'sad': 0}

# Download dataset from: https://github.com/mohummedalee/twitteraae-sentiment-data/
def load_twitter_aae(dir):
    """Load the Twitter-AAE sentiment files found in ``dir``.

    Reads the four files ``aae_happy``, ``aae_sad``, ``sae_happy`` and
    ``sae_sad`` (in that order) and treats every line as one example.

    Args:
        dir: Path of the directory containing the four
            ``{dialect}_{sentiment}`` files. (The name shadows the ``dir``
            builtin; it is kept so existing callers keep working.)

    Returns:
        A ``(sentences, labels, dialects)`` tuple of three parallel lists:
        the whitespace-stripped line, its integer label from ``label_map``,
        and the upper-cased dialect tag (``'AAE'`` or ``'SAE'``).

    Raises:
        OSError: if one of the four files cannot be opened.
    """
    sentences = []
    labels = []
    dialects = []
    for dial in ['aae', 'sae']:
        for lab in ['happy', 'sad']:
            # load dialect x sentiment combination
            fpath = os.path.join(dir, f'{dial}_{lab}')
            # errors='replace' turns undecodable bytes into U+FFFD instead of
            # raising. Catching UnicodeDecodeError around the read loop would
            # silently drop every remaining line of the file and could skew
            # the dialect/label balance without any visible sign.
            with open(fpath, 'r', encoding='utf-8', errors='replace') as fh:
                for line in fh:
                    sentences.append(line.strip())
                    labels.append(label_map[lab])
                    dialects.append(dial.upper())

    return sentences, labels, dialects

In [83]:
# Directory holding the raw `{dialect}_{sentiment}` text files read by load_twitter_aae.
DATA_DIR = 'data/raw/sentiment_race'
sentences, labels, dialects = load_twitter_aae(DATA_DIR)

# Build one row per loaded line from the three parallel lists.
# with_format("torch") is the setup recommended by the HF "use with PyTorch"
# guide for feeding a Dataset to a PyTorch DataLoader.
dataset = Dataset.from_dict({
    'text': sentences,
    'label': labels,
    'dialect': dialects
}).with_format("torch")

### Set up model and tokenizer

In [33]:
# Identifier of the pretrained checkpoint that gets fine-tuned below.
MODEL_PATH = "FacebookAI/roberta-base"

# Tokenizer and model are loaded from the same checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# The classifier.* weights are not in this checkpoint and are newly initialized
# (see the warning printed by this cell), so the model must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
# run tokenizer on dataset using datasets .map function
MAXLEN = 128  # every example is padded/truncated to exactly this many tokens

def tokenize(instance):
    """Tokenize the ``text`` field to a fixed length of ``MAXLEN`` tokens.

    Args:
        instance: A mapping with a ``text`` entry. It may hold a single
            string (one example) or a list of strings (a batch); the
            tokenizer call is the same in both cases.

    Returns:
        A plain dict with the tokenizer's output fields. ``Dataset.map`` adds
        these as new columns next to the existing ones.
    """
    tokenized = tokenizer(instance['text'], truncation=True, padding="max_length", max_length=MAXLEN)
    return {**tokenized}

# batched=True hands the tokenizer many texts per call instead of one example
# at a time. Because padding is fixed to max_length, the resulting columns are
# the same either way.
dataset = dataset.map(tokenize, batched=True, num_proc=3)

Map (num_proc=3):   0%|          | 0/12473 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


**TODOs**
- split into train and validation set, build pytorch dataloaders for both
- run raw training loop on the train_dataloader (as in https://huggingface.co/docs/transformers/en/training#train-in-native-pytorch) --- note that we don't want to use the `Trainer` API and should learn to write our own training loops since we will need to do that anyway when using `private-transformers` later (https://github.com/lxuechen/private-transformers)

### Split into train, validation, and test sets

In [85]:
# Sanity check: look for a gross imbalance between dialects -- there isn't one.
print('Dialect counts:', Counter(dataset['dialect']))

# The index arrays built here are meant for SubsetRandomSampler, following:
# https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets/50544887#50544887

shuffle_dataset = True
random_seed = 419  # april 19
train_size, val_size = 0.8, 0.1  # whatever is left over becomes the test set

N = len(dataset)
inds = np.arange(N)
if shuffle_dataset:
    # seed numpy's global RNG so the permutation is the same on every run
    np.random.seed(random_seed)
    np.random.shuffle(inds)

# Cut the (shuffled) index array into three consecutive pieces:
# [0, train_split) -> train, the next val_split entries -> val, the rest -> test.
train_split = int(N*train_size)
val_split = int(N*val_size)
train_inds, val_inds, test_inds = np.split(inds, [train_split, train_split + val_split])

print('train_inds:', train_inds, 'val_inds:', val_inds, 'test_inds:', test_inds)

Dialect counts: Counter({'SAE': 6723, 'AAE': 5750})
train_inds: [6611 3681 8943 ... 6466 7430 3730] val_inds: [4651  901  304 ... 9637 2079 5768] test_inds: [4482 8169 4782 ... 7879  568 1106]


In [86]:
# The training and evaluation loops read the targets from a column called
# `labels`, so rename the `label` column accordingly.
# The rename is guarded so this cell can be re-run: after the first run the
# `label` column no longer exists and an unguarded rename_column would raise.
if 'label' in dataset.column_names:
    dataset = dataset.rename_column('label', 'labels')

batch_size = 8

# Each split is a set of row indices into the one shared dataset; a
# SubsetRandomSampler restricts a DataLoader to its split's indices.
train_sampler = sampler.SubsetRandomSampler(train_inds)
val_sampler = sampler.SubsetRandomSampler(val_inds)
test_sampler = sampler.SubsetRandomSampler(test_inds)

# shuffle must stay False because a sampler is supplied; the sampler decides the order.
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, sampler=train_sampler)
val_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, sampler=val_sampler)
test_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, sampler=test_sampler)

### Fine-tune model

In [79]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# move the model's parameters onto the chosen device
model = model.to(device)

In [88]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)  # one step per batch

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# NOTE: not using lr_scheduler for now

model.train()  # training mode
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Forward only the tensors the model consumes, moved to the model's
        # device; the other batch fields (dialect, text) are left out.
        batch_topass = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }
        outputs = model(**batch_topass)

        # Because `labels` is part of the input, the HF model output already
        # carries the loss -- no separate loss function is needed.
        loss = outputs.loss
        loss.backward()

        # apply the update, then reset gradients so the next batch starts clean
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3744 [00:00<?, ?it/s]

In [91]:
# Write the fine-tuned model to disk; only the model is saved here, not the tokenizer.
model.save_pretrained('models/roberta-no-privacy')

### Evaluate model on validation set

In [93]:
import evaluate
# if not installed, run:
# !conda install -y evaluate

In [97]:
metric = evaluate.load('accuracy')

model.eval()  # evaluation mode
for batch in tqdm(val_dataloader):
    # same three model inputs as in training, moved to the model's device
    batch_topass = {
        key: batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'labels')
    }
    # no gradients are needed for scoring
    with torch.no_grad():
        outputs = model(**batch_topass)
    logits = outputs.logits
    # predicted class = index of the largest logit in each row
    predictions = logits.argmax(dim=1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

# accuracy accumulated over all validation batches (shown as the cell output)
metric.compute()

  0%|          | 0/156 [00:00<?, ?it/s]

{'accuracy': 0.7417802726543705}