## Importing important libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from transformers import GPT2Tokenizer, TFGPT2Model, GPT2Config
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import os

# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
EPOCHS = 30
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_LEN = 128

FILE_PATH = 'emotion_data_merged_4.csv'
# Define the base path for saving model information
base_save_path = "GPT-2_checkpoints"
checkpoint_path = "GPT-2_checkpoints/cp-{epoch:04d}.ckpt"

2024-04-03 18:47:45.273351: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 18:47:45.273404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 18:47:45.274811: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-03 18:47:45.281760: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define a dictionary to map encoded emotions to their corresponding labels
emotion_labels = {0: 'anger', 1: 'fear', 2: 'joy', 3: 'sadness', 4: 'surprise', 5: 'neutral'}



In [3]:
# Load dataset
dataset = pd.read_csv(FILE_PATH)
dataset = dataset.dropna(subset=['emotion']).query("emotion != 'neutral'")
dataset = dataset.drop_duplicates()
# Assuming you start with 'sentence' and 'emotion' columns

# Encode the 'emotion' column to numeric labels
label_encoder = LabelEncoder()
dataset['encoded_emotion'] = label_encoder.fit_transform(dataset['emotion'])

# Create prompts without merging the labels into the prompts
dataset['prompt'] = "This text expresses: " + dataset['sentence']

In [4]:
# Splitting the dataset while keeping prompts and labels separate
X_train, X_val, y_train, y_val = train_test_split(
    dataset['prompt'], dataset['encoded_emotion'], 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE, 
    stratify=dataset['encoded_emotion']
)


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def encode_prompts(prompts):
    input_ids, attention_masks = [], []
    for prompt in prompts:
        encoded = tokenizer(prompt, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="tf")
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return tf.concat(input_ids, axis=0), tf.concat(attention_masks, axis=0)

# Encode prompts using tokenizer (shown previously)
input_ids_train, attention_masks_train = encode_prompts(X_train.tolist())
input_ids_val, attention_masks_val = encode_prompts(X_val.tolist())

2024-04-03 18:47:50.789323: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2024-04-03 18:47:50.789595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11131 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6


In [6]:
# Fit label encoder and return encoded labels
labels_encoded = dataset['encoded_emotion']

In [7]:
# Count the number of occurrences of each class
class_counts = np.unique(labels_encoded, return_counts=True)[1]

# Calculate total number of samples
total_samples = len(labels_encoded)

# Calculate class weights inversely proportional to the class frequencies
class_weights = {i: total_samples/(count * len(class_counts)) for i, count in enumerate(class_counts)}

In [8]:
# Specify the directory where you want to save the checkpoints
checkpoint_dir = os.path.dirname(checkpoint_path)

# Ensure the directory exists. If it doesn't, create it.
os.makedirs(base_save_path, exist_ok=True)

# Specify the checkpoint file path pattern
checkpoint_path = os.path.join(base_save_path, "cp-{epoch:04d}.ckpt")
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a ModelCheckpoint callback
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True,  # Saves only the best model
    monitor='val_loss',  # Monitoring validation loss to determine the best model
    mode='min',  # Since we're monitoring 'val_loss', 'min' mode saves the model when the metric has decreased
    save_freq='epoch')  # Saving the model after each epoch

In [9]:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,  # Reduction factor for the learning rate
    patience=2,  # Number of epochs with no improvement after which learning rate will be reduced
    verbose=1
)

In [10]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',  # Monitor validation accuracy
    patience=3,             # Number of epochs with no improvement after which training will be stopped
    verbose=1,              # Log when training is stopped
    restore_best_weights=True, # Restore model weights from the epoch with the best value of the monitored quantity
    mode='max'
)

In [11]:
def build_model(num_labels):
    # Load the pre-trained GPT-2 model configuration
    config = GPT2Config.from_pretrained('gpt2', num_labels=num_labels)
    # Load the GPT-2 model from its configuration
    gpt2_model = TFGPT2Model.from_pretrained('gpt2', config=config)
    
    # Define the inputs
    input_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
    
    # Get the outputs from the GPT-2 model
    outputs = gpt2_model(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = outputs.last_hidden_state[:, -1, :]  # Use the last hidden state
    
    # Add a dense layer for classification
    classifier_layer = tf.keras.layers.Dense(num_labels, activation='softmax', name='classifier')(sequence_output)
    
    # Construct the final model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=classifier_layer)
    
    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    
    return model

model = build_model(num_labels=6)  # Set num_labels to 6 for multi-class classification


All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [12]:
print("Shape of y_train:", y_train.shape)  # Expected to be something like (n_samples,)
print("Shape of y_val:", y_val.shape)     # Expected to be something like (n_samples,)

Shape of y_train: (383312,)
Shape of y_val: (95828,)


In [13]:
y_train = y_train.astype('int32')
y_val = y_val.astype('int32')

In [14]:
print("Shape of y_train:", y_train.shape)  # Expected to be something like (n_samples,)
print("Shape of y_val:", y_val.shape)     # Expected to be something like (n_samples,)

Shape of y_train: (383312,)
Shape of y_val: (95828,)


In [15]:
# Final layer check
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tfgpt2_model (TFGPT2Model)  TFBaseModelOutputWithPastA   1244398   ['input_ids[0][0]',           
                             ndCrossAttentions(last_hid   08         'attention_mask[0][0]']      
                             den_state=(None, 128, 768)                                       

In [16]:
print("Input IDs shape:", input_ids_train.shape)
print("Attention Masks shape:", attention_masks_train.shape)
print("Labels shape:", y_train.shape)
print("Labels type:", y_train.dtype)


Input IDs shape: (383312, 128)
Attention Masks shape: (383312, 128)
Labels shape: (383312,)
Labels type: int32


In [17]:
print("Unique labels in y_train:", np.unique(y_train))
print("Unique labels in y_val:", np.unique(y_val))
print("Number of unique labels:", len(np.unique(np.concatenate([y_train, y_val]))))
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)

Unique labels in y_train: [0 1 2 3 4 5]
Unique labels in y_val: [0 1 2 3 4 5]
Number of unique labels: 6
Shape of y_train: (383312,)
Shape of y_val: (95828,)


In [18]:
print("Input IDs shape (train):", input_ids_train.shape)
print("Attention Masks shape (train):", attention_masks_train.shape)
print("Input IDs shape (val):", input_ids_val.shape)
print("Attention Masks shape (val):", attention_masks_val.shape)

Input IDs shape (train): (383312, 128)
Attention Masks shape (train): (383312, 128)
Input IDs shape (val): (95828, 128)
Attention Masks shape (val): (95828, 128)


In [19]:
print("First few labels in y_train:", y_train[:10])
print("First few labels in y_val:", y_val[:10])

First few labels in y_train: 72541     4
108553    3
659867    1
288572    0
525872    1
233682    5
236228    3
247269    0
47555     3
370301    0
Name: encoded_emotion, dtype: int32
First few labels in y_val: 399915    3
423120    3
49249     3
187865    3
15332     2
91596     3
73119     3
350455    3
524762    2
411584    0
Name: encoded_emotion, dtype: int32


In [20]:
# Print the first few labels and their corresponding emotions for y_train
print("First few labels in y_train:")
for index, label in y_train[:10].items():
    print("Label:", label, "| Emotion:", emotion_labels[label])

# Print the first few labels and their corresponding emotions for y_val
print("\nFirst few labels in y_val:")
for index, label in y_val[:10].items():
    print("Label:", label, "| Emotion:", emotion_labels[label])


First few labels in y_train:
Label: 4 | Emotion: surprise
Label: 3 | Emotion: sadness
Label: 1 | Emotion: fear
Label: 0 | Emotion: anger
Label: 1 | Emotion: fear
Label: 5 | Emotion: neutral
Label: 3 | Emotion: sadness
Label: 0 | Emotion: anger
Label: 3 | Emotion: sadness
Label: 0 | Emotion: anger

First few labels in y_val:
Label: 3 | Emotion: sadness
Label: 3 | Emotion: sadness
Label: 3 | Emotion: sadness
Label: 3 | Emotion: sadness
Label: 2 | Emotion: joy
Label: 3 | Emotion: sadness
Label: 3 | Emotion: sadness
Label: 3 | Emotion: sadness
Label: 2 | Emotion: joy
Label: 0 | Emotion: anger


In [23]:
# Proceed with model training as previously outlined, ensuring class weights and callbacks are applied
history = model.fit(
    [input_ids_train, attention_masks_train], y_train,
    validation_data=([input_ids_val, attention_masks_val], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weights,  # Ensure class weights are correctly applied
    callbacks=[early_stopping, lr_scheduler, cp_callback],
)

Epoch 1/30

ValueError: Unexpected result of `train_function` (Empty logs). This could be due to issues in input pipeline that resulted in an empty dataset. Otherwise, please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

In [None]:
# Prepare data
labels = dataset['encoded_emotion'].values
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=labels)

# Convert to TensorFlow datasets for efficiency
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": X_train, "attention_mask": attention_masks[X_train.index]}), y_train).shuffle(10000).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": X_val, "attention_mask": attention_masks[X_val.index]}), y_val).batch(BATCH_SIZE)

In [None]:
# Calculate class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights:", class_weights_dict)

In [None]:
# Callbacks
os.makedirs(BASE_SAVE_PATH, exist_ok=True)
checkpoint_path = os.path.join(BASE_SAVE_PATH, "cp-{epoch:04d}.ckpt")

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True),
    ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1)
]

# Training
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=callbacks,
    class_weight=class_weights_dict
)

In [None]:
# Evaluate the model
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f"Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")

# Inference example
def predict_emotion(text):
    prompt = "This text expresses: " + text + " The emotion is: [LABEL]"
    encoded_prompt = tokenizer.encode_plus(
        prompt, add_special_tokens=True, max_length=MAX_LEN, padding='max_length',
        truncation=True, return_attention_mask=True, return_tensors='tf'
    )
    input_ids = encoded_prompt['input_ids']
    attention_mask = encoded_prompt['attention_mask']
    
    predictions = model.predict({'input_ids': input_ids, 'attention_mask': attention_mask})
    predicted_label_idx = tf.argmax(predictions, axis=1).numpy()[0]
    predicted_label = label_encoder.inverse_transform([predicted_label_idx])[0]
    return predicted_label

# Example inference
sample_text = "I feel so happy and joyful today!"
predicted_emotion = predict_emotion(sample_text)
print(f"The predicted emotion for '{sample_text}' is: {predicted_emotion}")


## Hyperparameters 

Easy and straightforward hyperparameter tuning

In [2]:
# Hyperparameters
BATCH_SIZE = 64
LEARNING_RATE = 1e-5
EPOCHS = 30
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_LEN = 128  # Maximum sequence length
NUM_LABELS = None  # This will be set after loading the data

# Paths
FILE_PATH = 'Tasks/emotion_data_merged_4.csv'
BASE_SAVE_PATH = "GPT-2_v1_checkpoints"
CHECKPOINT_PATH = "GPT-2_v1_checkpoints/cp-{epoch:04d}.ckpt"

## F1 Metric and tokenizer function

In [3]:
def f1_metric(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
def extract_label_from_generated_text(generated_text):
    # Assuming labels is a list of label names
    predicted_label_idx = tf.argmax(generated_text, axis=-1)
    with tf.compat.v1.Session() as sess:  # Using a TensorFlow session to evaluate the tensor
        predicted_label_idx_value = sess.run(predicted_label_idx)
    predicted_label = labels[predicted_label_idx_value]
    return predicted_label



In [5]:
class ExactMatchAccuracy(tf.keras.metrics.Metric):
    def __init__(self, name='exact_match_accuracy', **kwargs):
        super(ExactMatchAccuracy, self).__init__(name=name, **kwargs)
        self.correct_predictions = self.add_weight(name='cp', initializer='zeros')
        self.total_predictions = self.add_weight(name='tp', initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        with ag__.FunctionScope('update_state', 'fscope', ag__.STD) as fscope:
            y_pred_texts = [extract_label_from_generated_text(text) for text in tf.unstack(y_pred, axis=-1)]
            y_true_texts = [tf.strings.decode(label) for label in tf.unstack(y_true, axis=-1)]
        
        for true, pred in zip(y_true_texts, y_pred_texts):
            if true == pred:
                self.correct_predictions.assign_add(1)
            self.total_predictions.assign_add(1)

    def result(self):
        return self.correct_predictions / self.total_predictions

    def reset_states(self):
        self.correct_predictions.assign(0)
        self.total_predictions.assign(0)

## Setup and Data Preparation

Loading the dataset, preprocessing the text, and preparing the data for the model.

In [6]:
# Specify the path to your CSV file
file_path = 'Tasks/emotion_data_merged_4.csv'

# Load the dataset
dataset = pd.read_csv(file_path)

dataset = dataset.dropna(subset=['emotion']).query("emotion != 'neutral'")
data = dataset.drop_duplicates()

# Shuffle the combined dataset
data = data.sample(frac=1).reset_index(drop=True)
# Assuming `data` is your DataFrame with 'sentence' and 'emotion' columns
data['prompt'] = "This text expresses: " + data['sentence'] + " The emotion is:"

# Display basic information about the combined dataset
print("Preview of the dataset:")
print(data.head())

print("\nSummary statistics of the  dataset:")
print(data.describe())

print("\nInformation about columns in the dataset:")
print(data.info())

sentences = data['sentence'].values
labels = data['emotion'].values

Preview of the dataset:
                                            sentence    emotion  \
0  im feeling its time to go their presence is no...    sadness   
1  i handle it this time around should i be very ...  happiness   
2  i feel more and more like a murderer of innoce...  happiness   
3  i really shouldnt rant when im feeling like th...  happiness   
4  i do not ever recall feeling the searing inten...  happiness   

                                              prompt  
0  This text expresses: im feeling its time to go...  
1  This text expresses: i handle it this time aro...  
2  This text expresses: i feel more and more like...  
3  This text expresses: i really shouldnt rant wh...  
4  This text expresses: i do not ever recall feel...  

Summary statistics of the  dataset:
                                                 sentence    emotion  \
count                                              479140     479140   
unique                                             479140     

In [7]:
# Encode the labels
label_encoder = LabelEncoder()

In [8]:
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,                      
        add_special_tokens=True,   
        max_length=MAX_LEN,       
        padding='max_length',     # Use 'max_length' for explicit padding
        truncation=True,           # Explicitly activate truncation
        return_attention_mask=True,   
        return_tensors='tf',      
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
labels = label_encoder.fit_transform(labels)
labels = tf.convert_to_tensor(labels)



2024-04-03 13:47:01.607477: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2024-04-03 13:47:01.607834: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14708 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6


In [9]:
# Ensure all inputs to train_test_split are numpy arrays
input_ids_np = input_ids.numpy() if isinstance(input_ids, tf.Tensor) else input_ids
attention_masks_np = attention_masks.numpy() if isinstance(attention_masks, tf.Tensor) else attention_masks
labels_np = labels.numpy() if isinstance(labels, tf.Tensor) else labels

# Now perform the train-test split
X_train, X_val, y_train, y_val = train_test_split(input_ids_np, labels_np, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=labels_np)
train_mask, val_mask = train_test_split(attention_masks_np, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=labels_np)

X_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
X_val = tf.convert_to_tensor(X_val, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_val = tf.convert_to_tensor(y_val, dtype=tf.int32)


In [10]:
# Create TensorFlow datasets for the training and validation sets
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": X_train, "attention_mask": train_mask}, y_train)).shuffle(10000).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": X_val, "attention_mask": val_mask}, y_val)).batch(BATCH_SIZE)


## Model training

Setting up the RoBERTa model, defining the training loop, and initiating the training process.

In [11]:
# Load RoBERTa model
model_config = GPT2Config.from_pretrained('gpt2', num_labels=NUM_LABELS)
class GPT2ForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(GPT2ForSequenceClassification, self).__init__()
        self.gpt2 = TFGPT2Model.from_pretrained(model_name)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(num_labels, activation='softmax')
        
    def call(self, inputs):
        outputs = self.gpt2(inputs)
        sequence_output = outputs.last_hidden_state[:, -1, :]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        return logits


model = GPT2ForSequenceClassification('gpt2', NUM_LABELS)

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": input_ids, "attention_mask": attention_masks}, labels)).shuffle(10000).batch(BATCH_SIZE)




All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [12]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',  # Monitor validation accuracy
    patience=3,             # Number of epochs with no improvement after which training will be stopped
    verbose=1,              # Log when training is stopped
    restore_best_weights=True, # Restore model weights from the epoch with the best value of the monitored quantity
    mode='max'
)

In [13]:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,  # Reduction factor for the learning rate
    patience=2,  # Number of epochs with no improvement after which learning rate will be reduced
    verbose=1
)

In [14]:
# Fit label encoder and return encoded labels
labels_encoded = label_encoder.fit_transform(labels)

In [15]:
# Count the number of occurrences of each class
class_counts = np.unique(labels_encoded, return_counts=True)[1]

# Calculate total number of samples
total_samples = len(labels_encoded)

# Calculate class weights inversely proportional to the class frequencies
class_weights = {i: total_samples/(count * len(class_counts)) for i, count in enumerate(class_counts)}

In [16]:
# Specify the directory where you want to save the checkpoints
checkpoint_dir = os.path.dirname(checkpoint_path)

# Ensure the directory exists. If it doesn't, create it.
os.makedirs(base_save_path, exist_ok=True)

# Specify the checkpoint file path pattern
checkpoint_path = os.path.join(base_save_path, "cp-{epoch:04d}.ckpt")
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a ModelCheckpoint callback
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True,  # Saves only the best model
    monitor='val_loss',  # Monitoring validation loss to determine the best model
    mode='min',  # Since we're monitoring 'val_loss', 'min' mode saves the model when the metric has decreased
    save_freq='epoch')  # Saving the model after each epoch

In [17]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy', f1_metric, ExactMatchAccuracy()])

In [18]:
# Fit the model with class weights, early stopping, and learning rate scheduler
history = model.fit(
    train_dataset,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=val_dataset,
    callbacks=[early_stopping, lr_scheduler, cp_callback],
    class_weight=class_weights  # Use the calculated class weights
)


Epoch 1/30


2024-04-03 13:54:12.226826: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14708 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6
2024-04-03 13:54:12.402946: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled


InvalidArgumentError: in user code:

    File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/tmp/ipykernel_426852/3574840235.py", line 9, in update_state  *
        y_pred_texts = [extract_label_from_generated_text(text) for text in tf.unstack(y_pred, axis=-1)]
    File "/tmp/ipykernel_426852/3505807301.py", line 5, in extract_label_from_generated_text  *
        predicted_label_idx_value = sess.run(predicted_label_idx)

    InvalidArgumentError: Graph execution error:
    
    Detected at node 'iterator' defined at (most recent call last):
        File "<frozen runpy>", line 198, in _run_module_as_main
        File "<frozen runpy>", line 88, in _run_code
        File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>
        File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
        File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
        File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever
        File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once
        File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 534, in process_one
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 359, in execute_request
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 778, in execute_request
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 446, in do_execute
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 549, in run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3577, in run_code
        File "/tmp/ipykernel_426852/3662132575.py", line 2, in <module>
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1807, in fit
        File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/polymorphism/function_type.py", line 356, in placeholder_arguments
    Node: 'iterator'
    Detected at node 'gpt2_for_sequence_classification/tfgpt2_model/transformer/h_._8/mlp/c_fc/add/ReadVariableOp/resource' defined at (most recent call last):
        File "<frozen runpy>", line 198, in _run_module_as_main
        File "<frozen runpy>", line 88, in _run_code
        File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>
        File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
        File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
        File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever
        File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once
        File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 534, in process_one
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 359, in execute_request
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 778, in execute_request
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 446, in do_execute
        File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 549, in run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
        File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3577, in run_code
        File "/tmp/ipykernel_426852/3662132575.py", line 2, in <module>
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1807, in fit
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1401, in train_function
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1384, in step_function
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1373, in run_step
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1150, in train_step
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 590, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/tmp/ipykernel_426852/3513494651.py", line 11, in call
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 590, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_tf_utils.py", line 783, in run_call_with_unpacked_inputs
        File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 811, in call
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_tf_utils.py", line 783, in run_call_with_unpacked_inputs
        File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 518, in call
        File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 522, in call
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 327, in call
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 238, in call
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__
        File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
        File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_tf_utils.py", line 3214, in call
        File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/capture/capture_container.py", line 141, in capture_by_value
        File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/capture/capture_container.py", line 285, in _create_placeholder_helper
    Node: 'gpt2_for_sequence_classification/tfgpt2_model/transformer/h_._8/mlp/c_fc/add/ReadVariableOp/resource'
    2 root error(s) found.
      (0) INVALID_ARGUMENT: You must feed a value for placeholder tensor 'iterator' with dtype resource
    	 [[{{node iterator}}]]
      (1) INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gpt2_for_sequence_classification/tfgpt2_model/transformer/h_._8/mlp/c_fc/add/ReadVariableOp/resource' with dtype resource
    	 [[{{node gpt2_for_sequence_classification/tfgpt2_model/transformer/h_._8/mlp/c_fc/add/ReadVariableOp/resource}}]]
    0 successful operations.
    0 derived errors ignored.
    
    Original stack trace for 'iterator':
      File "<frozen runpy>", line 198, in _run_module_as_main
      File "<frozen runpy>", line 88, in _run_code
      File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>
      File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
      File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
      File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever
      File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once
      File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 534, in process_one
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 359, in execute_request
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 778, in execute_request
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 446, in do_execute
      File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 549, in run_cell
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
      File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3577, in run_code
      File "/tmp/ipykernel_426852/3662132575.py", line 2, in <module>
      File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1807, in fit
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 832, in __call__
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 888, in _call
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 695, in _initialize
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 178, in trace_function
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 283, in _maybe_define_function
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 303, in _create_concrete_function
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/polymorphism/function_type.py", line 356, in placeholder_arguments
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/type_spec.py", line 242, in placeholder_value
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest.py", line 631, in map_structure
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1066, in map_structure
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1106, in _tf_core_map_structure
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1106, in <listcomp>
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/type_spec.py", line 243, in <lambda>
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/tensor.py", line 1022, in placeholder_value
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/tensor.py", line 1060, in _graph_placeholder
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/func_graph.py", line 670, in _create_op_internal
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py", line 2652, in _create_op_internal
      File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py", line 1160, in from_node_def
    


Original stack trace for 'iterator':
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
  File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever
  File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once
  File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 534, in process_one
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 359, in execute_request
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 778, in execute_request
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 446, in do_execute
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 549, in run_cell
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3577, in run_code
  File "/tmp/ipykernel_426852/3662132575.py", line 2, in <module>
  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
  File "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py", line 1807, in fit
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 832, in __call__
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 888, in _call
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py", line 695, in _initialize
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 178, in trace_function
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 283, in _maybe_define_function
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py", line 303, in _create_concrete_function
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/polymorphism/function_type.py", line 356, in placeholder_arguments
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/type_spec.py", line 242, in placeholder_value
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest.py", line 631, in map_structure
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1066, in map_structure
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1106, in _tf_core_map_structure
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/util/nest_util.py", line 1106, in <listcomp>
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/type_spec.py", line 243, in <lambda>
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/tensor.py", line 1022, in placeholder_value
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/tensor.py", line 1060, in _graph_placeholder
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/func_graph.py", line 670, in _create_op_internal
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py", line 2652, in _create_op_internal
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py", line 1160, in from_node_def


In [None]:
results = model.evaluate(val_dataset)
print(f"Validation loss: {results[0]}, Validation accuracy: {results[1]}, Validation F1 Score: {results[2]}")

## Visualization

Generate usefull insights on the training

In [None]:
def plot_training_history(history):
    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    
    # Plot training & validation accuracy values
    ax[0].plot(history.history['accuracy'])
    ax[0].plot(history.history['val_accuracy'])
    ax[0].set_title('Model accuracy')
    ax[0].set_ylabel('Accuracy')
    ax[0].set_xlabel('Epoch')
    ax[0].legend(['Train', 'Val'], loc='upper left')

    # Plot training & validation loss values
    ax[1].plot(history.history['loss'])
    ax[1].plot(history.history['val_loss'])
    ax[1].set_title('Model loss')
    ax[1].set_ylabel('Loss')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Val'], loc='upper left')
    
    # Plot training & validation F1 score values
    ax[2].plot(history.history['f1_metric'])
    ax[2].plot(history.history['val_f1_metric'])
    ax[2].set_title('Model F1 Score')
    ax[2].set_ylabel('F1 Score')
    ax[2].set_xlabel('Epoch')
    ax[2].legend(['Train', 'Val'], loc='upper left')

    plt.show()

plot_training_history(history)


##  Generate the Confusion Matrix and Metrics

With the true labels and predictions, we can now generate a confusion matrix and calculate other evaluation metrics like precision, recall, and F1-score.

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Assume `predicted_labels` are extracted and transformed to match `y_val`'s encoding
# Generate the confusion matrix
cm = confusion_matrix(y_val, predicted_labels)

# Plotting
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


## Prepare the Submission DataFrame and Save

In [None]:
# Assuming `test_df` is your test DataFrame and it contains a 'sentence' column
test_prompts = "This text expresses: " + test_df['sentence'] + " The emotion is:"

# Generate predictions
predicted_texts = []  # Placeholder for generated texts

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors='tf', padding=True, truncation=True, max_length=MAX_LEN)
    outputs = model.generate(**inputs, max_length=MAX_LEN + 10)  # Adjust max_length if necessary
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_label = extract_label_from_generated_text(generated_text)  # Use the previously discussed extraction function
    predicted_texts.append(predicted_label)

# Assuming you can map predicted_texts to the original labels (e.g., through a dictionary or direct matching)
predicted_labels = [text_to_label_mapping[text] for text in predicted_texts]  # Implement this mapping

# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'id': test_df['id'],  # Assuming there's an 'id' column
    'emotion': predicted_labels
})

# Save the submission file
submission_df.to_csv('GPT-2_v1_submission.csv', index=False)


In [None]:
model_save_path = "GPT-2_v1"
tokenizer_save_path = "GPT-2_v1"

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)
import joblib

# Save the label encoder or any mapping used for label-to-text conversions
joblib.dump(text_to_label_mapping, 'GPT-2_v1.joblib')  # Adjust as necessary


In [None]:
data_sub = 'Roberta_V3_1_task12_2.csv'

# Load the dataset
data_submission = pd.read_csv(data_sub)

# Define the mapping from integer labels to emotion names
emotion_mapping = {
    0: 'anger',
    1: 'disgust',
    2: 'fear',
    3: 'happiness',
    4: 'sadness',
    5: 'surprise'
}

# Apply the mapping to the 'emotion' column
data_submission['emotion'] = data_submission['emotion'].map(emotion_mapping)
# Save the submission file
data_submission.to_csv('GPT-2_v1.csv', index=False, sep=',')