In [5]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install seqeval


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=eac609758521d46ffb09fcb6b998949da4974992f3a3d9328657f59f4ece0fde
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290

# Task 3

For this task we will give you three separate datasets to investigate. First, look at each of these datasets and understand what they consist of and which NLP tasks you could fine-tune a model on using them.

Therefore, we ask you to do the following:


**1. Look at the three datasets below on Hugging Face and investigate them thoroughly. Make sure you understand the following aspects before you move on:**
* What features do the datasets contain?
* Are the datasets already tokenised or do they contain text?
* What are suitable tasks to train these datasets on? (e.g. Token classification, sentiment analysis, sequence classification, masked language modelling)

**2. Choose one of these datasets to finetune a model.**
> Understand which task you are going to fine-tune the model on, given the dataset.

**3. Choose a model to finetune on this dataset.**

> Use the hugging face documentation to choose a correct model ([HuggingFace models](https://huggingface.co/models))

**4.   Pre-process the dataset to train the model.**
> Understanding exactly what task you are going to be finetuning the model for, will help a lot here. Think about what is it the model needs as an input and see how you need to change the given features to these inputs. Use Hugging Face tokenisers, data collators and general documentation to figure this out.

**5. Train the model on this dataset.**
> Use a manual training loop here, understand the mechanics behind training and implement it yourself (you can find this in the Hugging Face documentation).

**6. Evaluate the new model's performance - compare with the performance of the model before finetuning.**
> Look at what metric you would use to measure the performance of the model, this can be tricky for some language modelling tasks with non-deterministic labels.


The datasets:

1. [tweet_eval](https://huggingface.co/datasets/tweet_eval)
2. [wikitext](https://huggingface.co/datasets/wikitext)
3. [wikiann](https://huggingface.co/datasets/wikiann)

Models available at: https://huggingface.co/models



## Solution to dataset 1 - tweet_eval - Sentiment Analysis


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, AdamW,  get_scheduler
from torch.utils.data import DataLoader
import torch
import evaluate
from tqdm.auto import tqdm

In [None]:


def preprocess_tweet_eval(raw_datasets, checkpoint, num_labels, batch_size=8, shuffle=True):
  """
  Method to pre-process the tweet_eval dataset for sentiment analysis.

  The "text" column is tokenised, dropped, and the "label" column is renamed to
  "labels" (the keyword the model's forward pass expects) before the splits are
  wrapped in dataloaders that pad every batch dynamically.

  :param raw_datasets: Raw dataset downloaded from hugging face. Must provide a "text"
    and a "label" column and a "train" and a "validation" split.
  :param checkpoint: Checkpoint to be used for the tokenizer.
  :param num_labels: Number of labels of the classification task. Accepted for backward
    compatibility only: it configures the model's classification head, not the
    tokenizer, so it is not forwarded to the tokenizer.
  :param batch_size: Batch size to use for the train and eval dataloaders.
  :param shuffle: Whether to shuffle the data for training (validation is never shuffled).
  :return: pytorch dataloader for the training and validation dataset.
  """
  # load the tokenizer and datacollator from hugging face using the checkpoint specified
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def tokenize_function(samples):
    # truncation=True caps over-long inputs at the tokenizer's maximum length (when the
    # tokenizer defines one); padding is left to the collator so it happens per batch
    return tokenizer(samples["text"], truncation=True)

  # map the tokenizer over batches of samples to convert text to tokens
  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  # remove text, convert column label to labels and convert to pytorch tensors
  tokenized_datasets = tokenized_datasets.remove_columns(["text"])
  tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
  tokenized_datasets.set_format("torch")

  # define dataloaders
  train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator
  )
  eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
  )
  return train_dataloader, eval_dataloader

In [None]:
# read in the "sentiment" configuration of the tweet_eval dataset
raw_datasets = load_dataset("tweet_eval", "sentiment")

# define the model checkpoint; the same name is used for the tokenizer and for the model
# NOTE(review): the checkpoint name suggests it is already fine-tuned for sentiment -
# confirm this is the intended starting point for a before/after fine-tuning comparison
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"

# pre-process the raw_datasets using a tokenizer and datacollator to pad, giving us the pytorch train and eval dataloaders
train_dataloader, eval_dataloader = preprocess_tweet_eval(raw_datasets, checkpoint, num_labels=3)

# download the sequence-classification model for the checkpoint, configured for 3 labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)






  0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# one scheduler step is taken per batch, so the schedule spans epochs * batches-per-epoch
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with zero warm-up steps over the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print("number of training steps: {}".format(num_training_steps))

17106


In [None]:
# manual training loop for the sequence-classification model
# NOTE(review): neither the model nor the batches are moved to a device in this cell,
# so training runs wherever the model was created - confirm that is intended
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
  for batch in train_dataloader:
    # the batch contains "labels", so the model output carries the loss
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    # update the weights, advance the learning-rate schedule, then clear the
    # gradients so they do not accumulate into the next batch
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/17106 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: ignored

In [None]:
# evaluate the classifier on the validation dataloader
# NOTE(review): this loads the GLUE "sst2" metric configuration although the model is
# configured for 3 labels - confirm this metric is the one intended for tweet_eval
metric = evaluate.load("glue", "sst2")
model.eval()
for batch in eval_dataloader:
    # no gradients are needed for inference
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # predicted class = index of the largest logit along the last dimension
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
# last expression of the cell: displays the aggregated metric
metric.compute()

## Solution to dataset 2 - wikitext - Text Masking

In [40]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorWithPadding, TrainingArguments, Trainer, AdamW,  get_scheduler, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import torch
import evaluate
from tqdm.auto import tqdm
import math

With the wikitext dataset, there is significantly more pre-processing that one needs to apply. 

In [34]:
def concatenate_texts(samples, chunk_size=256):
  """
  Method to concatenate all samples and return equal length chunks of text for each sample.

  Every column of the batch is flattened into one long sequence, which is then cut into
  consecutive pieces of exactly chunk_size items. The trailing remainder that does not
  fill a whole chunk is dropped. A "labels" column holding a copy of the chunked
  "input_ids" is added.

  :param samples: The samples to concatenate: a mapping from column name to a list of
    per-sample sequences. Must contain an "input_ids" column; all columns are expected
    to have the same flattened length (the first column's length is used for all).
  :param chunk_size: The chunk size to use for each new sample.
  :return results: The new samples, one list of chunks per column plus "labels".
  """
  # flatten every column in a single pass; summing lists with sum(..., []) copies the
  # growing result once per sample, which is quadratic in the number of tokens
  concatenated_samples = {
      k: [item for sample in samples[k] for item in sample] for k in samples.keys()
  }

  total_length = len(concatenated_samples[list(samples.keys())[0]])

  # round down to a multiple of chunk_size so that every chunk is full
  total_length = (total_length // chunk_size) * chunk_size

  results = {
      k: [t[i:i+chunk_size] for i in range(0, total_length, chunk_size)]
      for k, t in concatenated_samples.items()
  }
  # labels start out as a copy of the inputs
  results["labels"] = results["input_ids"].copy()
  return results

def whole_word_masking_data_collator():
  """
  Placeholder for a whole-word-masking data collator.

  Not implemented: the body is empty, so calling it does nothing and returns None.
  """
  pass

def preprocess_wikitext(raw_datasets, checkpoint, batch_size=8, shuffle=True, mask_prob=0.15, whole_word_masking=True, 
                        chunk_size=128):
  """
  Method to pre-process the wikitext dataset for masked language modelling.

  The "text" column is tokenised and dropped, the tokens are concatenated and cut into
  chunks of chunk_size tokens with concatenate_texts, and the "train" and "validation"
  splits are wrapped in dataloaders whose collator applies the masking to every batch.

  :param raw_datasets: Raw dataset downloaded from hugging face.
  :param checkpoint: Checkpoint to be used for the tokenizer.
  :param batch_size: Batch size to use for the train and eval dataloaders.
  :param shuffle: Whether to shuffle the data for training.
  :param mask_prob: Probability of masking a word within the dataset.
  :param whole_word_masking: Whether to use whole word masking or just token masking.
  :param chunk_size: What size chunks to use for all the samples (different to batch size, this is per sample)
  :return: pytorch dataloader for the training and validation dataset.
  """
  print("Using a batch size of: {}".format(batch_size))
  # load the tokenizer and datacollator from hugging face using the checkpoint specified
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  if whole_word_masking:
    # This branch used to leave data_collator unassigned, so the default arguments
    # crashed when the dataloaders were built. Imported locally because the notebook's
    # import cell does not bring this collator into scope.
    # NOTE(review): presumably only suitable for BERT-style WordPiece tokenizers - confirm
    # before using a checkpoint with a different tokenizer family.
    from transformers import DataCollatorForWholeWordMask
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=mask_prob)
  else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mask_prob)

  def tokenize_function(samples):
    # no truncation here: the texts are concatenated and re-chunked afterwards
    return tokenizer(samples["text"])

  # convert text to tokens, then regroup the tokens into equally sized chunks
  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=["text"])
  tokenized_datasets = tokenized_datasets.map(concatenate_texts, fn_kwargs={"chunk_size": chunk_size}, batched=True)
  tokenized_datasets.set_format("torch")

  # define dataloaders
  train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator
  )
  eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
  )
  return train_dataloader, eval_dataloader

In [35]:
# read in the "wikitext-2-raw-v1" configuration of the wikitext dataset and show its splits
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
print(raw_datasets)
# define the model checkpoint; the same name is used for the tokenizer and for the model
checkpoint = "distilbert-base-uncased"

# pre-process the raw_datasets: tokenise, regroup into 64-token chunks and build the
# pytorch train and eval dataloaders (token-level masking, batches of 16 chunks)
train_dataloader, eval_dataloader = preprocess_wikitext(raw_datasets, checkpoint, batch_size=16, whole_word_masking=False, 
                                                        chunk_size=64)






Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
Using a batch size of: 16


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [36]:
# download the masked-language-modelling model for the checkpoint defined in the setup cell
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [37]:
# define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# one scheduler step is taken per batch, so the schedule spans epochs * batches-per-epoch
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with zero warm-up steps over the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print("number of training steps: {}".format(num_training_steps))

number of training steps: 6963


In [41]:
# manual training loop for the masked-language model
# NOTE(review): neither the model nor the batches are moved to a device in this cell,
# so training runs wherever the model was created - confirm that is intended
progress_bar = tqdm(range(num_training_steps), desc="training loop")
model.train()

for epoch in range(num_epochs):
  for batch in train_dataloader:
    # the batch contains "labels", so the model output carries the loss
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    # update the weights, advance the learning-rate schedule, then clear the
    # gradients so they do not accumulate into the next batch
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/6963 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [54]:
# evaluate the masked-language model: collect the loss of every validation batch
# NOTE(review): eval_dataloader uses the same masking collator as training, so the
# masked positions (and presumably the resulting perplexity) differ between runs - confirm
model.eval()
progress_bar = tqdm(range(len(eval_dataloader)), desc="evaluation loop")
losses = []
for step, batch in enumerate(eval_dataloader):
    # no gradients are needed for inference
    with torch.no_grad():
        outputs = model(**batch)

    loss = outputs.loss
    # repeat(1) yields a 1-element tensor (loss is presumably a scalar) so torch.cat can join them
    losses.append(loss.repeat(1))
    progress_bar.update(1)



# perplexity is exp(mean of the per-batch losses); every batch carries the same weight
losses = torch.cat(losses)
try:
    perplexity = math.exp(torch.mean(losses))
except OverflowError:
    # math.exp overflows for very large mean losses; report infinity instead
    perplexity = float("inf")

# NOTE(review): `epoch` is not defined in this cell - it is whatever value the training
# loop cell left behind, so the printed epoch depends on how far training got
print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

evaluation loop:   0%|          | 0/241 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 16.140094295989165


## Solution to dataset 3 - wikiann - Named Entity Recognition (NER)

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorWithPadding, TrainingArguments, Trainer, AdamW,  get_scheduler, DataCollatorForTokenClassification
from torch.utils.data import DataLoader
import torch
import evaluate
from tqdm.auto import tqdm
import numpy as np

In [29]:
def align_labels_with_tokens(labels, word_ids):
  """
  Expand word-level labels to one label per token.

  Tokens without a word (word id None) get -100. The first token of a word gets the
  word's label. Every further token of the same word gets the word's label as well,
  bumped by one when it is odd (assumes odd ids are "begin" tags whose "inside" tag is
  the next id - TODO confirm against the dataset's label list).

  :param labels: word-level label ids, indexed by word id.
  :param word_ids: word id of every token, None for tokens that belong to no word.
  :return new_labels: token-level label ids, same length as word_ids.
  """
  aligned = []
  previous_word = None

  for word_id in word_ids:
    # tokens that do not belong to any word are masked out and reset the word tracker
    if word_id is None:
      previous_word = None
      aligned.append(-100)
      continue

    continues_word = word_id == previous_word
    previous_word = word_id

    tag = labels[word_id]
    # only follow-up tokens of a word have an odd tag shifted to the next id
    if continues_word and tag % 2 == 1:
      tag += 1
    aligned.append(tag)

  return aligned

def tokenize_and_align_labels(samples, tokenizer=None):
  """
  Tokenise a batch of pre-split sentences and attach token-level labels.

  :param samples: batch providing a "tokens" column (words per sentence) and a
    "ner_tags" column (label ids per sentence).
  :param tokenizer: the tokeniser to call on the words; the default of None is not
    usable, a tokeniser must be passed in.
  :return: the tokeniser output with an added "labels" entry, one aligned label list
    per sentence.
  """
  encoded = tokenizer(
      samples["tokens"], truncation=True, is_split_into_words=True
  )

  # word_ids(i) gives the word id of every token of sentence i, used to spread its labels
  encoded["labels"] = [
      align_labels_with_tokens(sentence_tags, encoded.word_ids(index))
      for index, sentence_tags in enumerate(samples["ner_tags"])
  ]
  return encoded


def preprocess_wikiann(raw_datasets, checkpoint, batch_size=8, shuffle=True):
  """
  Turn the raw wikiann dataset into dataloaders for token classification.

  :param raw_datasets: the raw dataset downloaded from hugging face; its "train" and
    "validation" splits are used.
  :param checkpoint: the checkpoint to use for the tokeniser.
  :param batch_size: batch size to use for the dataloaders.
  :param shuffle: whether to shuffle the training dataset (validation is never shuffled).
  :return train_dataloader, eval_dataloader: dataloaders for training and evaluation loops.
  """
  # tokeniser plus the collator that batches the token-classification samples
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

  # every original column is dropped; only the output of the mapped function remains
  original_columns = raw_datasets["train"].column_names
  tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=original_columns,
    fn_kwargs={"tokenizer": tokenizer},
  )

  # hand out pytorch tensors
  tokenized_datasets.set_format("torch")

  train_split = tokenized_datasets["train"]
  validation_split = tokenized_datasets["validation"]

  train_dataloader = DataLoader(
    train_split,
    shuffle=shuffle,
    batch_size=batch_size,
    collate_fn=data_collator,
  )
  eval_dataloader = DataLoader(
    validation_split,
    batch_size=batch_size,
    collate_fn=data_collator,
  )
  return train_dataloader, eval_dataloader

def post_process(predictions, labels):
  """
  Convert label ids to label names, skipping every position whose label is -100.

  Reads the notebook-level list label_names to translate ids into names.

  :param predictions: predicted label ids, one sequence per sample.
  :param labels: reference label ids, one sequence per sample; -100 marks positions to skip.
  :return true_labels, true_predictions: label names of the references and of the
    predictions at the positions that are kept.
  """
  # references: keep every label that is not the -100 marker
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[l] for l in label_row if l != -100])

  # predictions: keep a prediction only where the reference at that position is kept
  true_predictions = []
  for prediction_row, label_row in zip(predictions, labels):
    kept = []
    for p, l in zip(prediction_row, label_row):
      if l != -100:
        kept.append(label_names[p])
    true_predictions.append(kept)

  return true_labels, true_predictions

                               


In [30]:
# read in the "en" configuration of the wikiann dataset
raw_datasets = load_dataset("wikiann", "en")

# define the model checkpoint; the same name is used for the tokenizer and for the model
checkpoint = "bert-base-cased"

# define features and ids to label conversions and label to id conversions for model
# (label_names is also read by post_process when evaluating)
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# define the token-classification model with the label mappings of the dataset
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# pre-process data into train and validation dataloaders
train_dataloader, val_dataloader = preprocess_wikiann(raw_datasets, checkpoint)



  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [31]:
# define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# one scheduler step is taken per batch, so the schedule spans epochs * batches-per-epoch
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with zero warm-up steps over the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
# sequence-labelling metric that the evaluation cell fills batch by batch
metric = evaluate.load("seqeval")
print("number of training steps: {}".format(num_training_steps))

number of training steps: 7500


In [32]:
# manual training loop for the token-classification model
# NOTE(review): neither the model nor the batches are moved to a device in this cell,
# so training runs wherever the model was created - confirm that is intended
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
  for batch in train_dataloader:
    # the batch contains "labels", so the model output carries the loss
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    # update the weights, advance the learning-rate schedule, then clear the
    # gradients so they do not accumulate into the next batch
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/7500 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: ignored

In [27]:
# evaluate the token-classification model on the validation dataloader, feeding the
# seqeval metric batch by batch
model.eval()
for batch in val_dataloader:
  # no gradients are needed for inference
  with torch.no_grad():
    outputs = model(**batch)

  # predicted label id per token = index of the largest logit
  predictions = outputs.logits.argmax(dim=-1)
  labels = batch["labels"]
  # the helper in this notebook is named post_process; calling `postprocess` raises a
  # NameError on a fresh kernel because no function of that name is defined
  labels, predictions = post_process(predictions, labels)
  metric.add_batch(predictions=predictions, references=labels)
results = metric.compute()
# NOTE(review): `epoch` is not defined in this cell - it is whatever value the training
# loop cell left behind, so the printed epoch depends on how far training got
print(
    f"epoch {epoch}:",
    {
        key: results[f"overall_{key}"]
        for key in ["precision", "recall", "f1", "accuracy"]
    },
)


epoch 0: {'precision': 0.633793146495225, 'recall': 0.7178000848296339, 'f1': 0.6731859316471641, 'accuracy': 0.8745605806909732}
