In [1]:
# Import the libraries
import tensorflow as tf
from transformers import __version__ as transformers_version

# Report the installed TensorFlow version
print(f"TensorFlow Version: {tf.__version__}")

# Report the installed Hugging Face Transformers version
print(f"Transformers Version: {transformers_version}")


TensorFlow Version: 2.16.1
Transformers Version: 4.39.3


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset and drop the 'id' column
data = pd.read_csv('.data/emotions.csv')
data = data.drop('id', axis='columns')

# Use only a random sample of 10% of the dataset (fixed seed so the same
# rows are drawn on every run)
data = data.sample(frac=0.1, random_state=42)

# Preview the first few rows of the sampled dataframe. This has to be the
# last expression of the cell: a bare `data.head()` in the middle of a cell
# is evaluated and discarded, so nothing would be rendered.
data.head()


In [3]:
# Count the distinct values found in the 'label' column
n_categories = data['label'].nunique()

# Show the dataframe shape first, then the number of unique categories
print(data.shape, n_categories, sep="\n")

(1000, 2)
6


Labels have the following meanings:

- 0: Sadness
- 1: Joy
- 2: Love
- 3: Anger
- 4: Fear
- 5: Surprise


In [4]:
# Map every emotion name to its integer label; the position of a name in the
# tuple below is its label id (sadness -> 0 ... surprise -> 5)
categories = {
    name: label_id
    for label_id, name in enumerate(
        ('sadness', 'joy', 'love', 'anger', 'fear', 'surprise')
    )
}

In [5]:
# Separate the model inputs (the 'text' column) from the targets (the 'label' column)
X, y = data['text'], data['label']

In [6]:
# Prepare the tokenizer that encodes text for the BERT model
from transformers import BertTokenizer

# Instantiate the tokenizer from the 'bert-base-uncased' pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [7]:
import numpy as np

# Encode each text as a fixed-length row of 512 token ids: longer inputs are
# truncated, shorter ones are padded up to max_length.
# NOTE(review): only 'input_ids' is kept and the tokenizer's attention mask is
# discarded; presumably the model then treats padding ids like real tokens --
# confirm, and consider passing the mask to the model as well.
X_tokenized = np.array([
    tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=512,
    )['input_ids']
    for text in X
])

# Show the shape of the encoded matrix
print(X_tokenized.shape)

# Display the token ids of the first example
X_tokenized[0]

(1000, 512)


array([  101,  1045,  2074,  2514,  2428, 13346,  1998,  3082, 18627,
         102,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples as the test split; the fixed seed keeps the
# partition identical between runs
(
    X_tokenized_train,
    X_tokenized_test,
    y_train,
    y_test,
) = train_test_split(
    X_tokenized,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of the training inputs, then of the test inputs
print(X_tokenized_train.shape, X_tokenized_test.shape, sep="\n")

(800, 512)
(200, 512)


In [9]:
from transformers import TFBertForSequenceClassification
from transformers import BertConfig

# Name of the pretrained checkpoint the classifier is built from
foundation_model_name = 'bert-base-uncased'

# Start from the checkpoint's own configuration and override only the number
# of output labels; every other setting is left untouched.
model_config = BertConfig.from_pretrained(
    foundation_model_name,
    num_labels=n_categories,
)

# Instantiate the sequence-classification model from the same checkpoint,
# using the configuration prepared above
model = TFBertForSequenceClassification.from_pretrained(
    foundation_model_name,
    config=model_config,
)


2024-04-14 21:05:49.836828: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-04-14 21:05:49.836854: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2024-04-14 21:05:49.836861: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2024-04-14 21:05:49.836883: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-14 21:05:49.836899: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized f

In [10]:
# The model has not been fine-tuned at this point, so the outputs below are
# meaningless; this is only a smoke test of the forward pass.

# Feed the first five training examples through the model
sample = X_tokenized_train[:5]
result = model(sample)

# Shape of the logits: one row per example, one column per category
print(result.logits.shape)

# Logits of the first example: 6 raw, unnormalised scores (one per category).
# These are not probabilities -- values can be negative.
print(result.logits[0])


(5, 6)
tf.Tensor([ 0.51544654  0.36981708  0.01936332  0.30641085 -0.01010974 -0.44475287], shape=(6,), dtype=float32)


In [11]:
import tensorflow as tf

# Now let us train the model
# NOTE(review): `tf.keras.optimizers.legacy` is a Keras 2 namespace -- confirm
# the environment this notebook targets still provides it.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)
# The model returns raw logits, hence from_logits=True
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Stop the training when the validation loss has not improved for 3 epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers; without it the checkpoint saved below
# would hold the weights from `patience` epochs *after* the best one.
stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# epochs=100 is only an upper bound; the early-stopping callback is expected
# to end training much sooner.
# NOTE(review): the test split is also used as validation data here, so it
# influences when training stops -- consider a separate validation split.
model.fit(
    X_tokenized_train,
    y_train,
    validation_data=(X_tokenized_test, y_test),
    epochs=100,
    batch_size=16,
    callbacks=[stop_callback],
)

# Save the model
model.save_pretrained('.data/bert-emotions.model')

Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2024-04-14 21:06:13.197306: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 