<a href="https://colab.research.google.com/github/nyp-sit/it3103/blob/main/week12/bert-multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning BERT for Multi-Class Classification (Solution)

### Install Hugging Face Transformers library

In [None]:
!pip install transformers

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

## Data Preparation

In [None]:
# Location of the news dataset; pandas reads it directly from the URL.
data_url = 'https://nyp-aicourse.s3.ap-southeast-1.amazonaws.com/it3103/news.csv'
# Despite the .csv extension the file is parsed as tab-separated.
df = pd.read_csv(data_url, delimiter='\t')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Number of rows in the full dataset.
len(df)

In [None]:
# Work with a random subset of the rows rather than the full dataset
# (presumably to keep fine-tuning time short -- confirm).
SUBSET_SIZE = 2500

# A fixed random_state makes the sampled subset identical on every run.
subset_df = df.sample(n=SUBSET_SIZE, random_state=128)

In [None]:
# Class distribution of the sampled subset (row count per CATEGORY value).
subset_df['CATEGORY'].value_counts()

In [None]:
def map_label(x):
    """Return the integer class index for a single-letter category code.

    The codes 'e', 't', 'b' and 'm' map to 0, 1, 2 and 3 respectively.
    Any other value yields None.
    """
    # The position of a code in this tuple is its class index.
    for index, code in enumerate(('e', 't', 'b', 'm')):
        if x == code:
            return index
    return None

# Human-readable class names; the list index is the integer code returned by
# map_label, so a predicted class index can be turned back into a name.
labels_map = ['entertainment','tech','business','medical/health']

We now convert the text labels into numeric class indices from 0 to 3 (0 = entertainment, 1 = tech, 2 = business, 3 = medical/health).

In [None]:
# Replace the single-letter category codes with their integer class indices.
# NOTE: map_label has no branch for integers, so re-running this cell on the
# already-converted column would turn every label into None.
subset_df['CATEGORY'] =  subset_df['CATEGORY'].apply(map_label)

In [None]:
# Model input: the TITLE column. Target: the CATEGORY column.
texts = subset_df['TITLE']
labels = subset_df['CATEGORY']

In [None]:
# Two-stage split: first hold out 20% as the test set, then hold out 20% of
# the remainder as the validation set (64% train / 16% validation / 20% test).
# random_state is fixed, like the subset sampling, so the partition -- and
# therefore the reported metrics -- are reproducible from run to run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=.2, random_state=128)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=128)

In [None]:
# Number of examples left for training after both splits.
len(train_texts)

## Tokenization


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the 'distilbert-base-uncased' checkpoint -- the same checkpoint
# name the classification model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Convert every pandas Series into a plain Python list; the tokenizer and
# tf.data.Dataset.from_tensor_slices calls below are fed these lists.
train_texts, val_texts, test_texts = (
    series.to_list() for series in (train_texts, val_texts, test_texts)
)
train_labels, val_labels, test_labels = (
    series.to_list() for series in (train_labels, val_labels, test_labels)
)

In [None]:
# Tokenize every split with the same settings. Per the Transformers tokenizer
# documentation, truncation=True cuts inputs down to the model's maximum
# length and padding=True pads to the longest sequence in the call, so each
# split is padded to the length of its own longest title.
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)

In [None]:
batch_size = 16

# Each dataset yields (features, label) pairs, where features is the dict of
# tokenizer outputs for one example; the pairs are then grouped into batches.
#
# Only the training data is shuffled. Keras does not shuffle a tf.data.Dataset
# passed to fit(), so without this every epoch would see the same batches in
# the same order. A buffer as large as the training set gives a full shuffle,
# and shuffling before batching changes the batch composition between epochs.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_labels)).batch(batch_size)

# Validation and test data keep their order: shuffling does not affect the
# metrics and a stable order keeps evaluation repeatable.
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(batch_size)

## Fine-tuning the model

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained checkpoint with a classification head sized for our
# classes. The class count is taken from labels_map so the head cannot drift
# out of sync with the label names if a category is added or removed.
model = TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=len(labels_map))

In [None]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

num_epochs = 2

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Since our dataset is already batched, len(train_dataset) is the number of batches
# per epoch, so we can simply take the len.
num_train_steps = len(train_dataset) * num_epochs

# Decay the learning rate from 5e-5 at the first step to 0.0 at the last training step.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Adam driven by the decaying learning-rate schedule.
opt = Adam(learning_rate=lr_scheduler)

# The labels are integer class indices (not one-hot vectors), hence the sparse
# loss. The model returns raw logits, so the loss applies the softmax itself
# (from_logits=True).
loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

# Fine-tune, reporting loss and accuracy on the validation split after each epoch.
model.fit(train_dataset, validation_data=val_dataset, epochs=num_epochs)

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(test_dataset)

In [None]:
# Save the fine-tuned model under 'multiclass_model'. Only the model is saved
# here; the tokenizer object is not.
model.save_pretrained('multiclass_model')

## Try out the model

In [None]:
# Reload the fine-tuned model from the path it was saved to.
my_model = TFAutoModelForSequenceClassification.from_pretrained(
        "multiclass_model")

In [None]:
# Text to classify. The model was fine-tuned on the TITLE column, so a
# headline-style input is closest to what it saw during training.
text = input('Write your news article here:')

In [None]:
# truncation=True mirrors the tokenizer settings used for the training data
# and keeps an overly long input within the model's maximum sequence length
# instead of failing inside the model.
inputs = tokenizer(text, truncation=True, return_tensors="tf")
output = my_model(inputs)
# Turn the logits into class probabilities (one row per input text).
pred_prob = tf.nn.softmax(output.logits, axis=-1)
# Take the argmax along the class axis and read the result for the single
# input row, rather than relying on argmax over the flattened tensor.
pred = int(np.argmax(pred_prob, axis=-1)[0])
print(labels_map[pred])