In [1]:
import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import Dataset

from collections import Counter

### Build HuggingFace Dataset from folder

I should build an HF `Dataset` object and then use it with a PyTorch `DataLoader`, as in: https://huggingface.co/docs/datasets/en/use_with_pytorch

In [2]:
import os
import warnings

# Maps the sentiment suffix of each file name to its integer class label.
label_map = {'happy': 1, 'sad': 0}

# Download dataset from: https://github.com/mohummedalee/twitteraae-sentiment-data/
def load_twitter_aae(dir):
    """Load the TwitterAAE sentiment files stored in ``dir``.

    The four files ``aae_happy``, ``aae_sad``, ``sae_happy`` and ``sae_sad``
    are read in that order, producing one example per line.

    Lines that are not valid UTF-8 are skipped individually and a warning
    reports how many were dropped per file, so a single malformed tweet can
    no longer silently discard the remainder of a file.

    Args:
        dir: Directory containing the four ``{dialect}_{sentiment}`` files.

    Returns:
        A tuple ``(sentences, labels, dialects)`` of three parallel lists:
        the whitespace-stripped text of each line, its integer label taken
        from ``label_map``, and the upper-cased dialect tag ('AAE' or 'SAE').

    Raises:
        OSError: If one of the expected files cannot be opened
            (for example ``FileNotFoundError``).
    """
    sentences = []
    labels = []
    dialects = []
    for dial in ['aae', 'sae']:
        for lab in ['happy', 'sad']:
            # load dialect x sentiment combination
            fpath = os.path.join(dir, f'{dial}_{lab}')
            # Read raw bytes and decode line by line: decoding the whole
            # stream at once aborts at the first bad byte and loses every
            # line after it. bytes.splitlines() breaks on \n, \r and \r\n,
            # matching text-mode universal newlines.
            with open(fpath, 'rb') as fh:
                raw_lines = fh.read().splitlines()

            skipped = 0
            for raw in raw_lines:
                try:
                    text = raw.decode('utf-8')
                except UnicodeDecodeError:
                    skipped += 1
                    continue
                sentences.append(text.strip())
                labels.append(label_map[lab])
                dialects.append(dial.upper())

            if skipped:
                warnings.warn(
                    f'{fpath}: skipped {skipped} line(s) that are not valid UTF-8',
                    stacklevel=2,
                )

    return sentences, labels, dialects

In [22]:
# Directory holding the raw `{dialect}_{sentiment}` files read by `load_twitter_aae`.
DATA_DIR = 'data/raw/sentiment_race'
sentences, labels, dialects = load_twitter_aae(DATA_DIR)

# One row per loaded line, with three parallel columns. The "torch" format
# follows the Hugging Face "use with PyTorch" guide so the dataset can be
# handed to a PyTorch DataLoader.
dataset = Dataset.from_dict(
    dict(text=sentences, label=labels, dialect=dialects)
).with_format("torch")

In [5]:
# Batches of 4 examples drawn from `dataset`, with shuffling enabled.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)

# Quick sanity check -- uncomment to pull a single batch:
# next(iter(dataloader))

### Set up model and tokenizer

In [6]:
# Checkpoint identifier shared by the tokenizer and the classification model.
MODEL_PATH = "FacebookAI/roberta-base"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH
)
# The classification head is newly initialised (see the warning printed below),
# so the model must be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Tokenize every example with `Dataset.map`, adding the tokenizer outputs as new columns.
MAXLEN = 128

def tokenize(instance):
    """Tokenize one example's `text`, truncated/padded to exactly MAXLEN tokens.

    Only the tokenizer's own fields are returned; the resulting dataset still
    keeps the existing `text`, `label` and `dialect` columns alongside them.
    """
    encoding = tokenizer(
        instance['text'],
        max_length=MAXLEN,
        padding="max_length",
        truncation=True,
    )
    return dict(encoding)

dataset = dataset.map(tokenize, num_proc=3)

Map (num_proc=3):   0%|          | 0/12473 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [30]:
dataset

# After the tokenization `map`, the dataset also has `input_ids` and `attention_mask`
# columns (see the feature list printed below) -- these are the fields needed for training.

Dataset({
    features: ['text', 'label', 'dialect', 'input_ids', 'attention_mask'],
    num_rows: 12473
})

**TODOs**
- Split into train and validation sets, and build PyTorch dataloaders for both.
- Run a raw training loop on the `train_dataloader` (as in https://huggingface.co/docs/transformers/en/training#train-in-native-pytorch). Note that we don't want to use the `Trainer` API: we should learn to write our own training loops, since we will need to do that anyway when using `private-transformers` later (https://github.com/lxuechen/private-transformers).