In [None]:
!pip install datasets # HuggingFace Datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import torch

# Check up front whether this Colab environment is already using a GPU.
# If it is not, we might have to restart the Colab environment later.
torch.cuda.is_available() # The printed result must be `True`.

True

In [None]:
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

In [None]:
# Identifiers used throughout the notebook: the dataset passed to `load_dataset`
# and the pretrained checkpoint that both the tokenizer and the model load.
DATA_NAME = "dair-ai/emotion"
MODEL_NAME = "bert-base-uncased" # You can try other models.

In [None]:
# Load the three splits of DATA_NAME, in the order train / validation / test.
train_data, valid_data, test_data = (
    load_dataset(DATA_NAME, split=split_name)
    for split_name in ("train", "validation", "test")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Check the classes in the dataset.
# The distinct label ids are computed once and reused, instead of scanning the
# label column a second time just to count them.
unique_labels = np.unique(train_data["label"])
print(unique_labels)

# Number of classes; used later to size the classification head of the model.
num_labels = len(unique_labels)

[0 1 2 3 4 5]


In [None]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Check the ids of the special tokens declared by the tokenizer.
for special_token in tokenizer.special_tokens_map.values():
    token_id = tokenizer.convert_tokens_to_ids(special_token)
    print(f"token {special_token} 的 ID 為：{token_id}")

token [UNK] 的 ID 為：100
token [SEP] 的 ID 為：102
token [PAD] 的 ID 為：0
token [CLS] 的 ID 為：101
token [MASK] 的 ID 為：103


In [None]:
# Pre-process sentences with `tokenizer`.
# `truncation=True` cuts off sentences that are longer than the max length of BERT.

# Please note that we don't perform padding at this step,
# because we will perform dynamic padding later with DataCollator.

def preprocess_function(examples):
    """Tokenize one batch of examples for `Dataset.map(..., batched=True)`.

    Args:
        examples: A batch whose "text" entry holds the raw sentences.

    Returns:
        The tokenizer's encoding of those sentences, truncated but not padded.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize every split with `preprocess_function` and drop the raw "text" column.
#
# Please note that `batched` != batch_size:
# `batched=True` means the map function processes the data in batches,
# which usually computes faster.
encoded_train, encoded_valid, encoded_test = (
    split.map(preprocess_function, batched=True, remove_columns=["text"])
    for split in (train_data, valid_data, test_data)
)

In [None]:
# Observation with a tokenized example.
# The row is indexed first (`dataset[0][column]`): `dataset[column][0]` can read
# the entire column only to look at its first entry.

first_token_ids = encoded_test[0]['input_ids']
print(f"第一筆資料被轉換成 IDs 的結果: {first_token_ids}")
print(f"把 IDs 換回原本文字: {tokenizer.decode(first_token_ids, skip_special_tokens=False)}")
print(f"原始文字: {test_data[0]['text']}")

# You can also set `padding=True` in `preprocess_function` to observe the difference.

In [None]:
# Set up the DataCollator for dynamic padding.
# It is built from the same `tokenizer` that encoded the splits.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Observation: dynamic padding with the DataCollator.
# The padded lengths are different in different batches.

tmp_batch1, tmp_batch2 = (
    data_collator(encoded_test[start:start + 5]) for start in (0, 5)
)
# One line per batch: the length of the first sequence in that batch.
print(
    *(len(batch["input_ids"][0]) for batch in (tmp_batch1, tmp_batch2)),
    sep="\n",
)

In [None]:
# Set up the BERT model for sequence classification.
# It loads the MODEL_NAME checkpoint, and `num_labels` is the number of
# distinct labels found in the training split.

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

In [None]:
# Training configuration.
# Every hyper-parameter is written out so that a rerun does not silently
# depend on library defaults; tune them here.

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch", # Checkpoint on the same schedule as evaluation.
    logging_dir="./logs",
    report_to='tensorboard', # You can use wandb
    seed=42, # Fixed seed for reproducible runs.
)

In [None]:
def compute_metrics(model_eval_pred):
    """Score the predictions of one evaluation pass.

    Args:
        model_eval_pred: Object exposing `predictions` (per-class scores, or a
            tuple whose first item holds them) and `label_ids` (gold class ids).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 are macro-averaged over the classes.
    """
    logits = model_eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the scores; keep the scores.
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels = model_eval_pred.label_ids

    acc = accuracy_score(labels, preds)
    # Macro averaging weights every class equally, however many examples it has.
    # `zero_division=0` scores a class that is never predicted as 0 instead of
    # emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Wire the model, the training configuration, the tokenized train/validation
# splits, the dynamic-padding collator and the metric function into a `Trainer`.
# The test split is kept out of training and scored separately afterwards.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Use 1 GPU for training
# NOTE(review): this overwrites the underscore-prefixed `_n_gpu` attribute of the
# training arguments; confirm the installed transformers version still honors it.
trainer.args._n_gpu=1

In [None]:
# Start training with the `trainer` configured above.
trainer.train()

In [None]:
# Evaluate on the tokenized test split and keep the returned metrics.
metrics = trainer.evaluate(encoded_test)

In [None]:
# Show the raw test-set metrics.
print(metrics)

In [None]:
# Turn the metric name/value pairs into a two-column table and print it as
# Markdown text, without the index column.
df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
print(df.to_markdown(index=False))