In [2]:
import transformers

#Set to avoid warning messages.
transformers.logging.set_verbosity_error()


  from .autonotebook import tqdm as notebook_tqdm


## 04.02. Loading a Hugging Face Dataset

In [3]:
from datasets import load_dataset

#Use pretrained model checkpoint from Huggingface
model_name = "distilbert-base-uncased"
#Use pre-labeled dataset from huggingface
dataset_name= "poem_sentiment"

poem_sentiments = load_dataset(dataset_name)

#Apache Arrow format
print(poem_sentiments)
print(poem_sentiments["test"][20:25])

print("\nSentiment Labels used", 
      poem_sentiments["train"].features.get("label").names)

Generating train split: 100%|██████████| 892/892 [00:00<00:00, 247229.18 examples/s]
Generating validation split: 100%|██████████| 105/105 [00:00<00:00, 61620.53 examples/s]
Generating test split: 100%|██████████| 104/104 [00:00<00:00, 70905.01 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 104
    })
})
{'id': [20, 21, 22, 23, 24], 'verse_text': ["as o'er the earth it wanders wide,", 'how hearts were answering to his own,', 'glad on its stone-built hearth; and thorough the wide-mouthed smoke-flue', 'sees the clouds reel and roll above our head,', '’tis to behold his vengeance for my son.'], 'label': [2, 1, 2, 2, 0]}

Sentiment Labels used ['negative', 'positive', 'no_impact', 'mixed']





## 04.03. Encoding and pre-processing the dataset

In [4]:
#Encoding text

from transformers import DistilBertTokenizer

db_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return db_tokenizer(batch["verse_text"], 
                        padding=True, 
                        truncation=True)

enc_poem_sentiment = poem_sentiments.map(
                        tokenize, 
                        batched=True, 
                        batch_size=None)

print(enc_poem_sentiment["train"][0:5])


Map: 100%|██████████| 892/892 [00:00<00:00, 5169.15 examples/s]
Map: 100%|██████████| 105/105 [00:00<00:00, 4360.11 examples/s]
Map: 100%|██████████| 104/104 [00:00<00:00, 4546.39 examples/s]

{'id': [0, 1, 2, 3, 4], 'verse_text': ['with pale blue berries. in these peaceful shades--', 'it flows so long as falls the rain,', 'and that is why, the lonesome day,', 'when i peruse the conquered fame of heroes, and the victories of mighty generals, i do not envy the generals,', 'of inward strife for truth and liberty.'], 'label': [1, 2, 0, 3, 3], 'input_ids': [[101, 2007, 5122, 2630, 22681, 1012, 1999, 2122, 9379, 13178, 1011, 1011, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1998, 2008, 2003, 2339, 1010, 1996, 10459, 14045, 2154, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2043, 1045, 7304, 3366, 1996, 11438, 4476, 1997, 7348, 1010, 1998, 1996, 9248, 1997, 10478, 11593, 1010, 1045, 2079, 2025, 21103, 1996, 11593, 1010, 102, 0, 0], [101, 1997, 20546, 27865, 2005, 3606, 1998, 7044, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,




In [5]:
#Explore input IDs and Attention Mask

print("Text :", enc_poem_sentiment["train"][1].get("verse_text"))
print("\nInput Map :", enc_poem_sentiment["train"][1].get("input_ids"))
print("\nAttention Mask :", enc_poem_sentiment["train"][1].get("attention_mask"))

print("\nTotal tokens: ", len(enc_poem_sentiment["train"][1].get("input_ids")))
print("Non Zero tokens: ", len(list(filter( 
    lambda x :x > 0, enc_poem_sentiment["train"][1].get("input_ids")))))
print("Attention = 1: ", len(list(filter( 
    lambda x :x > 0, enc_poem_sentiment["train"][1].get("attention_mask")))))

Text : it flows so long as falls the rain,

Input Map : [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Attention Mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Total tokens:  28
Non Zero tokens:  11
Attention = 1:  11


In [6]:
#Separate training and validation sets
training_dataset = enc_poem_sentiment["train"]
validation_dataset=enc_poem_sentiment["validation"]

print("\nColumn Names : ",training_dataset.column_names)
print("\nFeatures : ",training_dataset.features)

labels = training_dataset.features.get("label")
num_labels=len(labels.names)



Column Names :  ['id', 'verse_text', 'label', 'input_ids', 'attention_mask']

Features :  {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## 04.04. Creating the Model Architecture

In [7]:
from transformers import TFAutoModelForSequenceClassification

#Load transformer checkpoint from huggingface
sentiment_model = (TFAutoModelForSequenceClassification
            .from_pretrained(model_name, num_labels=num_labels))

sentiment_model.get_config()


2025-06-26 21:06:17.624294: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-26 21:06:17.627600: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-26 21:06:17.635091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750971977.649268    6133 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750971977.653549    6133 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750971977.665619    6133 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

{'vocab_size': 30522,
 'max_position_embeddings': 512,
 'sinusoidal_pos_embds': False,
 'n_layers': 6,
 'n_heads': 12,
 'dim': 768,
 'hidden_dim': 3072,
 'dropout': 0.1,
 'attention_dropout': 0.1,
 'activation': 'gelu',
 'initializer_range': 0.02,
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'return_dict': True,
 'output_hidden_states': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'chunk_size_feed_forward': 0,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_

In [8]:
#Freeze the first layer if needed
sentiment_model.layers[0].trainable = True

#Add/remove layers if needed.
#sentiment_model.layers [append()/insert()/remove()]

print(sentiment_model.summary())



Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66956548 (255.42 MB)
Trainable params: 66956548 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


# 04.05. Training the Sentiment Model

In [9]:
#Using features from a pretrained model

batch_size=64
tokenizer_columns = db_tokenizer.model_input_names

# The column names to convert to TF tensors
train_dataset = training_dataset.to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=True,
    batch_size=batch_size)
val_dataset = validation_dataset.to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=False,
    batch_size=batch_size)

import tensorflow as tf

sentiment_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy())

sentiment_model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 1/5


2025-06-26 21:06:27.133806: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7267181a56d0>

## 04.06. Predicting Sentiment with the Custom Model

In [10]:
from datasets import Dataset,DatasetDict

#Input data for interference to predict sentiment
# the "label" array is not needed for inference, but added to provide true labels for comparison
infer_data = {'id':[0,1], 
             'verse_text':['and be glad in the summer morning when the kindred ride on their way', 
                           'how hearts were answering to his own'],
             'label':[1,0]}

infer_dataset = Dataset.from_dict(infer_data)

ds_dict=DatasetDict()
ds_dict["infer"] = infer_dataset

print(ds_dict)

#Encode the dataset, similar to training
enc_dataset=ds_dict.map(tokenize, batched=True, batch_size=None)

#Convert to Tensors
infer_final_dataset = enc_dataset["infer"].to_tf_dataset(
    columns=tokenizer_columns,  shuffle=True,
    batch_size=batch_size)

print(infer_final_dataset)

#Predict with the model
predictions=sentiment_model.predict(infer_final_dataset)

DatasetDict({
    infer: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 2
    })
})


Map: 100%|██████████| 2/2 [00:00<00:00, 455.75 examples/s]


<_PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None)}>


In [11]:
predictions.logits

array([[-1.6812866 ,  2.474289  , -0.84737706, -0.25531378],
       [-1.5751283 ,  2.5304136 , -1.2026411 , -0.17950436]],
      dtype=float32)

In [12]:
import numpy as np
pred_label_ids=np.argmax(predictions.logits, axis=1)

for i in range(len(pred_label_ids)):
    print("Poem =", infer_data["verse_text"][i], 
          " Predicted=",labels.names[pred_label_ids[i]], 
          " True-Label=",labels.names[infer_data["label"][i]])

Poem = and be glad in the summer morning when the kindred ride on their way  Predicted= positive  True-Label= positive
Poem = how hearts were answering to his own  Predicted= positive  True-Label= negative
