In [5]:
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio
%pip install pydantic
%pip install transformers[torch]
%pip install accelerate -U
%pip install scikit-learn
%pip install evaluate
%pip install matplotlib
%pip install datasets
%pip install ray[tune]



In [6]:
### Parser for movie scripts in free text format

from typing import Optional
import re
from io import StringIO
from dataclasses import dataclass
from collections import Counter

@dataclass
class Dialogue:
    character: str
    text: str

@dataclass
class Entry:
    description: str
    dialogue: Optional[Dialogue]

@dataclass
class Scene:
    name: str
    entries: list[Entry]

class MovieScriptParser:

    @dataclass
    class ParsedObject:
        scenes: list[Scene]
        stats: dict
        character_vocabulary: list[str]

        def get_vocabulary_to_label_mapping(self):
            return {v: i for i, v in enumerate(self.character_vocabulary)}

        def get_label_to_vocabulary_mapping(self):
            return {i: v for i, v in enumerate(self.character_vocabulary)}

        def save_character_dialogue_dataset_in_json_format(self, output_filename: str):
            """
            Saves a dataset of character dialogues in JSON format.
            This method iterates through the scenes and their entries, extracting dialogues
            and mapping each character to a label based on the character vocabulary. The result
            is a list of dictionaries, each containing the dialogue text and the corresponding
            character label.

            Args:
                output_filename (str): The name of the output file where the JSON data will be saved.
                None
            """
            import json

            vocabulary_to_label_mapping = self.get_vocabulary_to_label_mapping()

            with open(output_filename, "w") as file:
                for entry in self.scenes:
                    for e in entry.entries:
                        if e.dialogue:
                            file.write(
                                json.dumps({
                                    "text": e.dialogue.text,
                                    "label": vocabulary_to_label_mapping[
                                        e.dialogue.character
                                    ],
                                })
                            )
                            file.write("\n")

    @dataclass
    class ParserState:
        scene_text: StringIO
        dialogue_text: StringIO
        scene_name: str
        character_name: Optional[str]
        stats: Counter
        character_vocabulary: set
        scenes: list[Scene]
        entries: list[Entry]

    @staticmethod
    def from_text_file(filename: str) -> ParsedObject:
        """
        Parse file containing input script in free text format into a structured format.
        Returns a list of Scene objects parsed from the input script text.

        Args:
            filename (str): The path to the file containing the script text.
        """
        with open(filename, "r") as file:
            return MovieScriptParser._parse_text_script(file)

    @staticmethod
    def _parse_text_script(file_like) -> ParsedObject:
        """
        Parse input script in free text format into a structured format.
        Returns a list of Scene objects parsed from the input script text.
        Note: the first scene refers to the title of the script.

        Args:
            file_like (file-like object): An object with a `.read()` or `.readline()` method.
        """
        parser_state = MovieScriptParser.ParserState(
            scene_text=StringIO(),
            dialogue_text=StringIO(),
            scene_name="SCRIPT TITLE",
            character_name=None,
            stats=Counter(),
            character_vocabulary=set(),
            scenes=[],
            entries=[],
        )

        for line in file_like:
            line = line.strip("\n")

            # Skip empty lines
            if not line:
                continue

            # Is it a new scene starting ?
            new_scene_name = MovieScriptParser._get_scene_name(line)
            if new_scene_name:
                # This is a new scene coming up
                MovieScriptParser._add_scene(parser_state)
                parser_state.scene_name = new_scene_name

                # Reset the parser state
                MovieScriptParser._reset_buffer(parser_state.scene_text)
                MovieScriptParser._reset_buffer(parser_state.dialogue_text)
                parser_state.entries = []
                parser_state.character_name = None
            else:
                # Existing scene continues
                num_tabs = MovieScriptParser._count_leading_tabs(line)

                if num_tabs == 6:
                    # A new character (dialogue) is starting

                    if parser_state.character_name:
                        # Add previous entry to the scene, a new entry is starting
                        MovieScriptParser._add_entry(parser_state)

                    # Record the new character name
                    parser_state.character_name = line.strip()

                elif num_tabs in [4, 5] and parser_state.character_name is not None:
                    # Character is still speaking, keep adding the dialogue
                    MovieScriptParser._concatenate_text(
                        parser_state.dialogue_text, line
                    )
                else:
                    # This is a scene description
                    if parser_state.character_name:
                        # Previous entry with a movie character has ended, add it
                        MovieScriptParser._add_entry(parser_state)
                        parser_state.character_name = None

                    # Add the new scene description
                    MovieScriptParser._concatenate_text(parser_state.scene_text, line)

        if parser_state.scene_text.tell() > 0 or parser_state.character_name:
            # Add the last scene
            MovieScriptParser._add_scene(parser_state)

        parser_state.stats["total_characters"] = len(parser_state.character_vocabulary)
        parser_state.stats["total_scenes"] = len(parser_state.scenes)

        return MovieScriptParser.ParsedObject(
            scenes=parser_state.scenes,
            character_vocabulary=sorted(parser_state.character_vocabulary),
            stats=parser_state.stats,
        )

    @staticmethod
    def _add_entry(parser_state: ParserState):
        """
        Adds an entry to the parser state with the current scene and dialogue text.

        This method processes the current scene and dialogue text stored in the parser state,
        creates an Entry object, and appends it to the entries list in the parser state.
        It also updates various statistics related to the script parsing process.

        Args:
            parser_state (ParserState): The current state of the parser, containing buffers
                                        for scene and dialogue text, character name, and statistics.

        Updates:
            - Appends a new Entry object to parser_state.entries.
            - Updates parser_state.stats with word counts and dialogue counts.
            - Adds the character name to parser_state.character_vocabulary if it exists.
            - Resets the dialogue buffer and character name in the parser state.
        """
        scene_text_stirng = parser_state.scene_text.getvalue()
        dialogue_text_string = parser_state.dialogue_text.getvalue()

        parser_state.entries.append(
            Entry(
                description=scene_text_stirng,
                dialogue=(
                    Dialogue(
                        character=parser_state.character_name, text=dialogue_text_string
                    )
                    if parser_state.character_name
                    else None
                ),
            )
        )

        # Do some stats counting
        scene_word_count = len(scene_text_stirng.split())
        dialogue_word_count = len(dialogue_text_string.split())
        if parser_state.character_name:
            parser_state.stats["total_dialogues"] += 1
            parser_state.stats["total_words_in_dialogues"] += dialogue_word_count
            parser_state.character_vocabulary.add(parser_state.character_name)
        parser_state.stats["total_words"] += scene_word_count + dialogue_word_count

        # Reset the dialogue buffer and character name
        parser_state.character_name = None
        MovieScriptParser._reset_buffer(parser_state.dialogue_text)
        MovieScriptParser._reset_buffer(parser_state.scene_text)

    @staticmethod
    def _add_scene(parser_state: ParserState):
        """
        Adds a scene to the list of scenes.

        This function creates a new Scene object with the provided scene_name and entries,
        and appends it to the scenes list.

        Args:
            entries (list): A list of Entry objects representing the entries in the scene.
            scene_name (str): The name of the scene.
            scene_text (io.StringIO): A StringIO buffer containing the text of the scene.
            scenes (list): A list of Scene objects to which the new scene will be added.
        """
        # Check for the last description
        if parser_state.scene_text.tell() > 0 or parser_state.character_name:
            # There is another entry to add
            MovieScriptParser._add_entry(parser_state)
        parser_state.scenes.append(
            Scene(name=parser_state.scene_name, entries=parser_state.entries)
        )

    @staticmethod
    def _reset_buffer(scene_text):
        """
        Resets the buffer of the given scene_text by seeking to the beginning and truncating its content.

        Args:
            scene_text (io.StringIO): The buffer to reset.
        """
        scene_text.seek(0)
        scene_text.truncate(0)

    @staticmethod
    def _get_scene_name(line: str) -> str:
        """
        Try to match a new scene format <number><tab><scene name> and if successful, return scene name. Else return None.

        Args:
            line (str): The line of text to match.
        """
        match = re.match(r"^\d+\t.+$", line)
        return line.split("\t")[1] if match else None

    @staticmethod
    def _count_leading_tabs(input_string: str) -> int:
        """
        Count the nymber of leading tabs in the input string

        Args:
            input_string (str): the input string
        """
        count = 0
        for char in input_string:
            if char == "\t":
                count += 1
            else:
                break
        return count

    @staticmethod
    def _concatenate_text(buffer: StringIO, new_line: str):
        """
        Adds a new string to an existing StringIO buffer by connecting whith a whitespace.
        Note: It will remove a trailing '-' if it exists.

        Args:
            buffer (StringIO): The existing StringIO buffer.
            new_line (str): The new string to append.
        """
        # Move the cursor to the end of the buffer
        buffer.seek(0, 2)

        # Check if the buffer ends with a "-"
        if buffer.tell() > 0:  # Ensure the buffer is not empty
            buffer.seek(buffer.tell() - 1)
            if buffer.read(1) == "-":
                # Remove the trailing "-" by truncating
                buffer.seek(buffer.tell() - 1)
                buffer.truncate()

        # Append the new string
        buffer.write(new_line.strip().lower() + " ")

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Parse the movie script, and save it in json format

In [8]:
import json
import sys
from  collections import Counter

sys.path.append('..')

# from src.data.parse_movie_script import MovieScriptParser

# Parse the dataset in structured format and output a json compatible to parse with Hugging face datasets library.
# The output will be a json file with the following structure:
# { "text": ..., "label": }
parsed_object = MovieScriptParser.from_text_file("/content/drive/MyDrive/movie_script/data/raw/5thelement.txt")

# Print the stats for the parsed script
print("Script basic stats:")
print(json.dumps(parsed_object.stats, indent=2))

# Print counts per class
class_map = parsed_object.get_vocabulary_to_label_mapping()
class_counter = Counter()
for scene in parsed_object.scenes:
    for entry in scene.entries:
        if entry.dialogue is not None:
            class_counter[entry.dialogue.character] += 1

print("Class counts (character speeches):")
cumulative_dialogues = 0
for i, (key, value) in enumerate(class_counter.most_common(), start=1):
    cumulative_dialogues += class_counter[key]
    print(f"{i}. {key} with class id {class_map[key]}: {class_counter[key]} ({class_counter[key] / parsed_object.stats['total_dialogues']}, {cumulative_dialogues / parsed_object.stats['total_dialogues']})")

output_filename = "/content/drive/MyDrive/movie_script/data/parsed/5thelement.json"

# Save the parsed script in json format
parsed_object.save_character_dialogue_dataset_in_json_format(output_filename)

# Print a few lines of the saved file to see the format
with open(output_filename, "r") as f:
    for i in range(5):
        print(f.readline().strip())

Script basic stats:
{
  "total_words": 21849,
  "total_dialogues": 943,
  "total_words_in_dialogues": 9457,
  "total_characters": 83,
  "total_scenes": 281
}
Class counts (character speeches):
1. KORBEN with class id 41: 244 (0.25874867444326616, 0.25874867444326616)
2. CORNELIUS with class id 21: 99 (0.10498409331919406, 0.3637327677624602)
3. ZORG with class id 82: 68 (0.07211028632025451, 0.43584305408271473)
4. LOC RHOD with class id 44: 52 (0.05514316012725345, 0.4909862142099682)
5. PRESIDENT with class id 59: 48 (0.05090137857900318, 0.5418875927889714)
6. LEELOO with class id 43: 48 (0.05090137857900318, 0.5927889713679746)
7. MUNRO with class id 55: 36 (0.03817603393425239, 0.630965005302227)
8. PROFESSOR with class id 65: 24 (0.02545068928950159, 0.6564156945917285)
9. MACTILBURGH with class id 45: 16 (0.016967126193001062, 0.6733828207847296)
10. STAEDERT with class id 72: 15 (0.015906680805938492, 0.689289501590668)
11. CAPTAIN with class id 8: 15 (0.015906680805938492, 0.7

### Create a basic traind and test split of the dataset

In [9]:
from datasets import load_dataset

def split_dataset_from_json(json_file: str, test_size: float = 0.4):
    """
    Load a dataset from a json file and split it into train and test sets by using datasets library from Hugging Face.

    Args:
        json_file (str): The path to the json file containing the dataset.
        test_size (float): The proportion of the dataset to include in the test split.
    """
    # Load dataset from local json file
    dataset = load_dataset("json", data_files=json_file)

    # Split the dataset into a simple train, validation and test sets
    train_test_split = dataset['train'].train_test_split(test_size=test_size)
    train_dataset = train_test_split['train']
    test_dataset = train_test_split['test']

    print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}")

    return train_dataset, test_dataset

train_dataset, test_dataset = split_dataset_from_json(output_filename, test_size=0.3)

Generating train split: 0 examples [00:00, ? examples/s]

Train Size: 660, Test Size: 283


### Train a first exploratory Hugging face Transformer model with typical params and do basic evaluation

In [10]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

### Train model
model_name = "microsoft/deberta-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use <EOS> as <PAD>

# Tokenize the data
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["text"],  # raw text to be tokenized
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    tokenized["labels"] = examples["label"]  # add the labels to the tokenized input
    return tokenized

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Sanity check: print a few examples of the encoded dataset
print("Encoded Train Dataset:")
print(encoded_train_dataset["input_ids"][:2])
print(encoded_train_dataset["attention_mask"][:2])
print(encoded_train_dataset["labels"][:2])

# Load the model
num_labels = len(parsed_object.character_vocabulary)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

# Add a padding token if not present
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    num_train_epochs=60,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    logging_steps=30,
    warmup_steps=10,  # learning rate will be gradually increased during the first 10 steps
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")

    accuracy = metric_acc.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="macro")
    f1_weighted = metric_f1.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "f1_weighted": f1_weighted["f1"],
    }


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Encoded Train Dataset:
[[1, 1640, 1097, 1506, 43, 364, 560, 18735, 4349, 20577, 415, 328, 1437, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 90, 15975, 7, 1871, 110, 8446, 98, 47, 64, 1871, 5, 232, 4, 1437, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.171734,0.212014,0.00714,0.074174
2,4.235700,3.663028,0.212014,0.00714,0.074174
3,3.558400,3.446108,0.212014,0.00714,0.074174
4,3.558400,3.349194,0.212014,0.00714,0.074174
5,3.279700,3.273879,0.215548,0.009145,0.086519
6,3.181900,3.21834,0.240283,0.015591,0.119387
7,3.181900,3.143825,0.243816,0.016506,0.13111
8,2.989000,3.14081,0.250883,0.020315,0.147596
9,2.834700,3.118555,0.261484,0.025167,0.166714
10,2.681000,3.117661,0.250883,0.027998,0.179009


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

TrainOutput(global_step=1260, training_loss=1.5345349175589424, metrics={'train_runtime': 603.1177, 'train_samples_per_second': 65.659, 'train_steps_per_second': 2.089, 'total_flos': 3037210615296000.0, 'train_loss': 1.5345349175589424, 'epoch': 60.0})

### Compute confusion matrix and additional metrics per class

In [11]:
import pprint

# Compute predictions on test set to compute additional metrics
def get_predictions_and_labels(test_dataset, trainer):
    import numpy as np

    # Predict on the evaluation dataset
    predictions = trainer.predict(test_dataset)

    # Extract predictions and true labels
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids
    return y_pred, y_true


def compute_precision_recall_per_class(y_pred, y_true, num_labels):

    from sklearn.metrics import confusion_matrix
    import numpy as np

    def compute_precision_recall(confusion_matrix):
        """
        Compute precision and recall for each class from a confusion matrix.

        :param confusion_matrix: 2D array, where rows are actual classes
                                and columns are predicted classes.
        :return: Dictionary with precision and recall for each class.
        """
        num_classes = confusion_matrix.shape[0]
        metrics = []

        for i in range(num_classes):
            # True Positives (diagonal element)
            TP = confusion_matrix[i, i]

            # False Positives (sum of column i, excluding TP)
            FP = np.sum(confusion_matrix[:, i]) - TP

            # False Negatives (sum of row i, excluding TP)
            FN = np.sum(confusion_matrix[i, :]) - TP

            # Precision and Recall
            precision = float(TP / (TP + FP)) if (TP + FP) > 0 else 0
            recall = float(TP / (TP + FN)) if (TP + FN) > 0 else 0

            metrics.append({'Class': i, 'Precision': precision, 'Recall': recall})

        return metrics

    print("# predictions in test set: {}".format(len(y_pred)))
    print("Predictions on test set:")
    print(y_pred)
    print("True labels:")
    print(y_true)

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=range(num_labels))

    # Compute precision and recall per class
    metrics_per_class = compute_precision_recall(cm)
    metrics_per_class = sorted(metrics_per_class, key=lambda x: x['Recall'], reverse=True)

    print("Precision and recall per class:")
    pprint.pp(metrics_per_class)

y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

# predictions in test set: 283
Predictions on test set:
[41 21 41 41 21 82 43 21 41 82 41 41 21 41 82 41 41 41 82 21 59 82 55 21
 41 82 41 43 41 21 41 41 82 41 41 21 41 43 82 21 82 21 82 21 21 41 41 21
 41 21 21 55 21 21 21 43 21 59 21 41 21 21 41 41 21 41 41 82 41 41 21 21
 41 21 82 41 21 21 82 41 21 41 21 21 55 21 59 21 41 41 41 41 43 44 21 41
 41 82 43 41 21 21 21 21 21 21 41 41 41 21 41 21 21 82 21 41 41 21 21 21
 43 41 43 21 41 21 43 21 41 21 21 41 41 21 41 41 21 43 21 41 21 41 21 21
 41 41 43 44 20 82 21 82 41 21 21 21 41 21 41 21 21 21 41 41 41 82 41 21
 41 21 41 21 41 41 41 21 41 41 59 21 21 82 41 21 41 41 21 21 41 41 21 21
 82 41 43 21 41 21 41 41 41 21 41 41 41 21 21 59 41 21 21 41 21 41 43 21
 41 41 82 43 21 82 55 43 41 21 21 41 41 21 41 41 21 21 41 41 21 21 43 41
 21 41 21 82 59 21 41 41 41 41 21 41 41 21 55 21 21 21 21 21 43 82 41 82
 55 41 43 41 41 21 82 41 41 41 55 21 41 82 21 43 21 21 21]
True labels:
[41 18 41 41 66 82 21 62 82 82 44 41 59 21 21 45 41 82 21 45 41  9 68

### Train and evaluate on oversampled dataset (oversample each class proportionally to max class size)

In [12]:
import random
from datasets import Dataset

from datasets import load_dataset

def oversample_dataset(dataset: Dataset, class_count_threshold=0):
    """
    Oversamples the dataset to balance the class distribution. Ignore classes with count less or equal to class_count_threshold.
    Args:
        dataset (Dataset): A dataset object containing examples with a "label" field.
        class_count_threshold (int): The minimum count of a class to be considered for oversampling.
    Returns:
        Dataset: A new dataset object with balanced class distribution by oversampling the minority classes.
    """
    class_counts = Counter(dataset["label"])

    max_count = max(class_counts.values())
    examples_by_class = {label: [] for label in class_counts}

    for example in dataset:
        examples_by_class[example["label"]].append(example)

    balanced_examples = []
    for _, examples in examples_by_class.items():
        if len(examples) > class_count_threshold:
            balanced_examples.extend(random.choices(examples, k=max_count))
        else:
            balanced_examples.extend(examples)

    random.shuffle(balanced_examples)
    return Dataset.from_list(balanced_examples)

train_dataset_oversampled = oversample_dataset(train_dataset, class_count_threshold=1)


print("Training dataset size:", len(train_dataset))
print("Oversampled training dataset size:", len(train_dataset_oversampled))

encoded_train_dataset_oversampled = train_dataset_oversampled.map(preprocess_function, batched=True)

# Reset model weights to previous state
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

training_args.num_train_epochs = 20

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset_oversampled,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the model
y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

Training dataset size: 660
Oversampled training dataset size: 9773


Map:   0%|          | 0/9773 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,1.9789,3.821523,0.060071,0.031954,0.051404
2,0.4665,3.198951,0.19788,0.080921,0.17988
3,0.1154,3.347213,0.226148,0.111354,0.214991
4,0.0517,3.842962,0.254417,0.103498,0.221034
5,0.038,4.097544,0.250883,0.123635,0.233073
6,0.0195,4.395072,0.257951,0.114666,0.233157
7,0.0399,4.436036,0.268551,0.115038,0.243521
8,0.0196,4.576379,0.275618,0.114871,0.24437
9,0.0345,4.702868,0.275618,0.127801,0.246101
10,0.0118,4.778081,0.268551,0.116464,0.237645


# predictions in test set: 283
Predictions on test set:
[43 44 41 29 55 82 34 59 61 20 44 82 55 56 41 82 59 82  8 44 55 68  8 22
 41 68 22 43 22 44 44 77 45 44 44 82 73  4 82 55 82 44 44 21  8 55 44 55
 82 59 59 10 82 29 44 43 15 44 60 44 61 44 44 66 43 43 77 10 41 36 43 65
 56 43 82 41 82 44 44 41 17 24 33 82 45 61 55 44 41 21 44 44 61 44 59 41
 55 55 10 44 55  1 22 59  1 41 72 44 41 24 82 59 44 82 59 34 41 55 22 61
 44 82 72 44 44 59 65 82 44 14 55 55 82 55 82 44 71 43 59 22 82 82 45 44
 43 55 43 55 20 80 29 82 77 55 59 55 44 44 41 44 44 72 41 44 55 79 65 55
 22 44 59 22 44 44  4 55 82 59 60 55 43 31 41 61 22 43 21 59 77 65 59  1
 44 44 34 43 44 44 44 44 44 59 44 43 41 61 44 55 45 55 82 55 82 44 44 55
 44 41 82 66 79 17 80 82 82 17 44 82  8 55 44 41 65 44 43 41 45 80 43 55
 55 82  1 39 72 55 41 55 44 59 82 44 44 82  1 55 21 43 44 33 43 82 41 82
 45 24 43 59 82 55 65 82 43 44  8 55 44 43  8 34 40 76 77]
True labels:
[41 18 41 41 66 82 21 62 82 82 44 41 59 21 21 45 41 82 21 45 41  9 68

### Use weighted cross-entropy loss to simulate oversampling with less computational effort

In [13]:
import torch
import torch.nn as nn
import numpy as np

# Reset model weights to previous state
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

def compute_class_weights(dataset: Dataset, num_labels: int, min_count=0, top_k=None):
    """
    Compute class weights for
    Args:
        dataset (Dataset): A dataset object containing examples with a "label" field.
        num_labels (int): The number of classes in the dataset.
        min_count (int): The minimum count of a class to be considered for computing class weights.
    Returns:
        dict: A dictionary with class weights for each class compared to the class with maximum count.
    """
    class_counts = Counter(dataset["label"])

    if top_k is not None:
        class_counts = dict(class_counts.most_common(top_k))

    max_count = max(class_counts.values())
    class_weights = {label: max_count / count for label, count in class_counts.items()}
    class_weights_list = [1.0] * (num_labels)

    for label, weight in class_weights.items():
        if class_counts[label] > min_count:
            class_weights_list[label] = weight

    return class_weights_list

# Compute class weights
class_weights = compute_class_weights(train_dataset, num_labels=num_labels, top_k=10)
for i, weight in enumerate(class_weights):
    if weight > 1.0:
        print(f"Class {i} weight: {weight}")
class_weights = torch.tensor(class_weights, dtype=torch.float32)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
class_weights = class_weights.to(device)

# Define the loss function with class weights
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Define custom compute_loss_func
def compute_loss_func(outputs, labels, num_items_in_batch=None):
    logits = outputs.logits
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return loss

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
    compute_loss_func=compute_loss_func,
)

training_args.num_train_epochs = 10
trainer.train()

# Evaluate the model
y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class 21 weight: 2.9206349206349205
Class 43 weight: 5.935483870967742
Class 44 weight: 5.257142857142857
Class 45 weight: 15.333333333333334
Class 55 weight: 7.36
Class 59 weight: 6.133333333333334
Class 61 weight: 16.727272727272727
Class 65 weight: 11.5
Class 82 weight: 4.27906976744186


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.267042,0.063604,0.002401,0.007788
2,4.336600,3.765795,0.212014,0.00949,0.082597
3,3.798400,3.438709,0.141343,0.009644,0.075933
4,3.798400,3.292531,0.183746,0.017733,0.105643
5,3.397700,3.171652,0.14841,0.012946,0.08435
6,3.200500,3.100432,0.14841,0.020712,0.09311
7,3.200500,3.052423,0.144876,0.019226,0.088793
8,3.096700,3.021186,0.173145,0.023688,0.113786
9,3.033100,3.00332,0.166078,0.025059,0.113847
10,2.959900,2.996767,0.155477,0.024459,0.10661


# predictions in test set: 283
Predictions on test set:
[43 44 44 44 44 44 43 41 44 41 44 44 44 44 43 44 44 44 41 44 44 41 55 44
 41 45 41 43 43 44 44 44 82 41 44 41 44 41 41 44 41 44 44 44 44 44 44 44
 44 41 44 41 44 41 44 43 44 44 44 44 65 44 44 41 41 41 41 44 44 44 41 65
 41 65 44 41 44 44 44 44 44 44 43 44 45 41 44 44 44 41 44 44 65 44 44 41
 44 41 41 44 41 44 44 44 44 44 44 44 41 44 44 44 44 44 44 43 41 44 41 41
 44 44 43 44 44 44 41 59 44 45 44 44 44 44 44 44 44 43 44 44 44 44 44 44
 41 44 43 44 45 65 44 44 41 44 44 44 44 41 41 44 44 45 44 44 41 43 65 44
 41 44 44 44 44 44 41 41 41 44 44 44 44 41 41 44 43 41 44 44 41 65 41 44
 41 44 43 44 44 44 41 44 44 43 44 41 41 45 44 44 44 41 43 41 43 44 65 44
 44 41 44 43 43 44 41 43 44 44 41 44 44 44 44 41 44 44 41 44 44 44 43 44
 44 44 44 44 44 41 44 41 44 44 44 44 44 44 59 44 44 43 44 44 43 44 65 41
 41 44 43 41 41 44 65 44 43 44 45 44 44 43 44 43 41 41 44]
True labels:
[41 18 41 41 66 82 21 62 82 82 44 41 59 21 21 45 41 82 21 45 41  9 68

### Hyperparameter tunning

In [14]:
### Split the dataset into train, validation and test sets

from datasets import load_dataset

# Load dataset from local json file
dataset = load_dataset("json", data_files=output_filename)

# Split the dataset into a simple train, validation and test sets
train_test_split = dataset['train'].train_test_split(test_size=0.4)

train_dataset = train_test_split['train']

temp_split_dataset = train_test_split['test'].train_test_split(test_size=0.5)
val_dataset = temp_split_dataset['train']
test_dataset = temp_split_dataset['test']

print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}, Val Size: {len(val_dataset)}")

Train Size: 565, Test Size: 189, Val Size: 189


In [15]:
# Run hyperparameter search

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)
encoded_val_dataset = val_dataset.map(preprocess_function, batched=True)

trainer_hp = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    compute_metrics=compute_metrics,
)

best_run = trainer_hp.hyperparameter_search(n_trials=2, direction="maximize")

print(best_run)

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-01-29 16:52:51,561	INFO worker.py:1841 -- Started a local Ray instance.
2025-01-29 16:52:54,184	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.


+-------------------------------------------------------------------+
| Configuration for experiment     _objective_2025-01-29_16-52-54   |
+-------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator            |
| Scheduler                        FIFOScheduler                    |
| Number of trials                 2                                |
+-------------------------------------------------------------------+

View detailed results here: /root/ray_results/_objective_2025-01-29_16-52-54
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-01-29_16-52-49_623731_428/artifacts/2025-01-29_16-52-54/_objective_2025-01-29_16-52-54/driver_artifacts`

Trial status: 2 PENDING
Current time: 2025-01-29 16:52:54. Total running time: 0s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:A100)
+--------------------------------------------------------------------------

[36m(_objective pid=13206)[0m 2025-01-29 16:53:02.404776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=13206)[0m 2025-01-29 16:53:02.432409: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=13206)[0m 2025-01-29 16:53:02.440791: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=13206)[0m Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
[3

[36m(_objective pid=13206)[0m {'eval_loss': 4.466243743896484, 'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_f1_weighted': 0.0, 'eval_runtime': 1.3227, 'eval_samples_per_second': 142.89, 'eval_steps_per_second': 18.145, 'epoch': 1.0}

Trial _objective_7bcd7_00000 finished iteration 1 at 2025-01-29 16:53:22. Total running time: 27s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                 21.49783 |
| time_total_s                                     21.49783 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_accuracy                                          0. |
| eval_f1                                                0. |
| eval_f1_weighted                             

[36m(_objective pid=13206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00000_0_learning_rate=0.0000,num_train_epochs=5,per_device_train_batch_size=64,seed=8.1540_2025-01-29_16-52-54/checkpoint_000000)



Trial status: 1 RUNNING | 1 PENDING
Current time: 2025-01-29 16:53:24. Total running time: 30s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_7bcd7_00000   RUNNING        5.61152e-06                    5   8.15396                       64        1            21.4978       4.46624                 0           0                    0 |
| _o

 22%|██▏       | 10/45 [00:18<02:31,  4.32s/it]
 24%|██▍       | 11/45 [00:18<01:45,  3.11s/it]
 27%|██▋       | 12/45 [00:18<01:15,  2.29s/it]
 29%|██▉       | 13/45 [00:19<00:54,  1.71s/it]
 31%|███       | 14/45 [00:19<00:40,  1.31s/it]
 33%|███▎      | 15/45 [00:20<00:30,  1.03s/it]
 36%|███▌      | 16/45 [00:20<00:24,  1.19it/s]
 38%|███▊      | 17/45 [00:20<00:19,  1.43it/s]
 40%|████      | 18/45 [00:21<00:15,  1.72it/s]
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13206)[0m 
 21%|██        | 5/24 [00:00<00:00, 42.34it/s][A
[36m(_objective pid=13206)[0m 
 42%|████▏     | 10/24 [00:00<00:00, 36.11it/s][A
[36m(_objective pid=13206)[0m 
 58%|█████▊    | 14/24 [00:00<00:00, 35.14it/s][A
[36m(_objective pid=13206)[0m 
 75%|███████▌  | 18/24 [00:00<00:00, 34.53it/s][A
[36m(_objective pid=13206)[0m 
 92%|█████████▏| 22/24 [00:00<00:00, 34.27it/s][A
                                               
 40%|████      | 18/45 [00:22<00:15,  1.72it/s]
100%|██████

[36m(_objective pid=13206)[0m {'eval_loss': 4.303543567657471, 'eval_accuracy': 0.005291005291005291, 'eval_f1': 0.0016326530612244899, 'eval_f1_weighted': 0.010158730158730159, 'eval_runtime': 1.266, 'eval_samples_per_second': 149.29, 'eval_steps_per_second': 18.957, 'epoch': 2.0}

Trial _objective_7bcd7_00000 finished iteration 2 at 2025-01-29 16:53:38. Total running time: 43s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000001 |
| time_this_iter_s                                 16.10089 |
| time_total_s                                     37.59872 |
| training_iteration                                      2 |
| epoch                                                  2. |
| eval_accuracy                                     0.00529 |
| eval_f1                                           0.0016

[36m(_objective pid=13206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00000_0_learning_rate=0.0000,num_train_epochs=5,per_device_train_batch_size=64,seed=8.1540_2025-01-29_16-52-54/checkpoint_000001)
 42%|████▏     | 19/45 [00:33<01:50,  4.25s/it]
 44%|████▍     | 20/45 [00:34<01:17,  3.09s/it]
 47%|████▋     | 21/45 [00:34<00:54,  2.28s/it]
 49%|████▉     | 22/45 [00:35<00:39,  1.71s/it]
 51%|█████     | 23/45 [00:35<00:28,  1.31s/it]
 53%|█████▎    | 24/45 [00:35<00:21,  1.04s/it]
 56%|█████▌    | 25/45 [00:36<00:16,  1.19it/s]
 58%|█████▊    | 26/45 [00:36<00:13,  1.42it/s]
 60%|██████    | 27/45 [00:36<00:10,  1.71it/s]
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13206)[0m 
 21%|██        | 5/24 [00:00<00:00, 41.55it/s][A
[36m(_objective pid=13206)[0m 
 42%|████▏     | 10/24 [00:00<00:00, 36.02it/s][A
[36m(_objective pid=13206)[0m 
 58%|█████▊    | 14/24 [00:0

[36m(_objective pid=13206)[0m {'eval_loss': 4.165348052978516, 'eval_accuracy': 0.2962962962962963, 'eval_f1': 0.0103273397879207, 'eval_f1_weighted': 0.13769786383894267, 'eval_runtime': 1.4276, 'eval_samples_per_second': 132.388, 'eval_steps_per_second': 16.811, 'epoch': 3.0}

Trial _objective_7bcd7_00000 finished iteration 3 at 2025-01-29 16:53:54. Total running time: 59s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                 15.92914 |
| time_total_s                                     53.52786 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_accuracy                                      0.2963 |
| eval_f1                                           0.01033 |


[36m(_objective pid=13206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00000_0_learning_rate=0.0000,num_train_epochs=5,per_device_train_batch_size=64,seed=8.1540_2025-01-29_16-52-54/checkpoint_000002)



Trial status: 1 RUNNING | 1 PENDING
Current time: 2025-01-29 16:53:54. Total running time: 1min 0s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_7bcd7_00000   RUNNING        5.61152e-06                    5   8.15396                       64        3            53.5279       4.16535          0.296296   0.0103273             0.137698 |


 62%|██████▏   | 28/45 [00:50<01:13,  4.33s/it]
 64%|██████▍   | 29/45 [00:50<00:50,  3.15s/it]
 67%|██████▋   | 30/45 [00:50<00:34,  2.32s/it]


[36m(_objective pid=13206)[0m {'loss': 4.3616, 'grad_norm': 6.245805263519287, 'learning_rate': 2.4049356065719326e-06, 'epoch': 3.33}


 69%|██████▉   | 31/45 [00:51<00:24,  1.74s/it]
 71%|███████   | 32/45 [00:51<00:17,  1.33s/it]
 73%|███████▎  | 33/45 [00:51<00:12,  1.05s/it]
 76%|███████▌  | 34/45 [00:52<00:09,  1.17it/s]
 78%|███████▊  | 35/45 [00:52<00:07,  1.41it/s]
 80%|████████  | 36/45 [00:53<00:05,  1.70it/s]
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13206)[0m 
 21%|██        | 5/24 [00:00<00:00, 41.06it/s][A
[36m(_objective pid=13206)[0m 
 42%|████▏     | 10/24 [00:00<00:00, 35.60it/s][A
[36m(_objective pid=13206)[0m 
 58%|█████▊    | 14/24 [00:00<00:00, 34.08it/s][A
[36m(_objective pid=13206)[0m 
 75%|███████▌  | 18/24 [00:00<00:00, 33.32it/s][A
[36m(_objective pid=13206)[0m 
 92%|█████████▏| 22/24 [00:00<00:00, 32.28it/s][A
                                               
 80%|████████  | 36/45 [00:54<00:05,  1.70it/s]
100%|██████████| 24/24 [00:01<00:00, 32.28it/s][A
                                               [A


[36m(_objective pid=13206)[0m {'eval_loss': 4.065954208374023, 'eval_accuracy': 0.2962962962962963, 'eval_f1': 0.01038961038961039, 'eval_f1_weighted': 0.13544973544973543, 'eval_runtime': 1.4083, 'eval_samples_per_second': 134.204, 'eval_steps_per_second': 17.042, 'epoch': 4.0}

Trial _objective_7bcd7_00000 finished iteration 4 at 2025-01-29 16:54:09. Total running time: 1min 15s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000003 |
| time_this_iter_s                                 15.78284 |
| time_total_s                                      69.3107 |
| training_iteration                                      4 |
| epoch                                                  4. |
| eval_accuracy                                      0.2963 |
| eval_f1                                           0.01

[36m(_objective pid=13206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00000_0_learning_rate=0.0000,num_train_epochs=5,per_device_train_batch_size=64,seed=8.1540_2025-01-29_16-52-54/checkpoint_000003)
 82%|████████▏ | 37/45 [01:05<00:33,  4.20s/it]
 84%|████████▍ | 38/45 [01:06<00:21,  3.06s/it]
 87%|████████▋ | 39/45 [01:06<00:13,  2.26s/it]
 89%|████████▉ | 40/45 [01:06<00:08,  1.69s/it]
 91%|█████████ | 41/45 [01:07<00:05,  1.30s/it]
 93%|█████████▎| 42/45 [01:07<00:03,  1.03s/it]
 96%|█████████▌| 43/45 [01:08<00:01,  1.20it/s]
 98%|█████████▊| 44/45 [01:08<00:00,  1.43it/s]
100%|██████████| 45/45 [01:08<00:00,  1.72it/s]
[36m(_objective pid=13206)[0m 
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13206)[0m 
 17%|█▋        | 4/24 [00:00<00:00, 39.95it/s][A
[36m(_objective pid=13206)[0m 
 33%|███▎      | 8/24 [00:00<00:00, 35.46it/s][A
[36m(_objective pid=13206)[

[36m(_objective pid=13206)[0m {'eval_loss': 4.029177188873291, 'eval_accuracy': 0.2962962962962963, 'eval_f1': 0.01038961038961039, 'eval_f1_weighted': 0.13544973544973543, 'eval_runtime': 1.2682, 'eval_samples_per_second': 149.03, 'eval_steps_per_second': 18.924, 'epoch': 5.0}

Trial status: 1 RUNNING | 1 PENDING
Current time: 2025-01-29 16:54:24. Total running time: 1min 30s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+-------------------------------------------------------------------------------------------------------------------------------

[36m(_objective pid=13206)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00000_0_learning_rate=0.0000,num_train_epochs=5,per_device_train_batch_size=64,seed=8.1540_2025-01-29_16-52-54/checkpoint_000004)



Trial _objective_7bcd7_00000 completed after 5 iterations at 2025-01-29 16:54:35. Total running time: 1min 40s
[36m(_objective pid=13206)[0m {'train_runtime': 86.8355, 'train_samples_per_second': 32.533, 'train_steps_per_second': 0.518, 'train_loss': 4.284735446506076, 'epoch': 5.0}


[36m(_objective pid=13206)[0m                                                100%|██████████| 45/45 [01:26<00:00,  1.72it/s]100%|██████████| 45/45 [01:26<00:00,  1.91s/it]



Trial _objective_7bcd7_00001 started with configuration:
+-------------------------------------------------+
| Trial _objective_7bcd7_00001 config             |
+-------------------------------------------------+
| learning_rate                             2e-05 |
| num_train_epochs                              2 |
| per_device_train_batch_size                  16 |
| seed                                    7.08379 |
+-------------------------------------------------+


[36m(_objective pid=13782)[0m 2025-01-29 16:54:42.152224: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=13782)[0m 2025-01-29 16:54:42.179237: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=13782)[0m 2025-01-29 16:54:42.187478: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=13782)[0m Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
[3

[36m(_objective pid=13782)[0m {'loss': 4.4253, 'grad_norm': 4.226963043212891, 'learning_rate': 1.0581760231885063e-05, 'epoch': 0.83}


 44%|████▍     | 32/72 [00:05<00:04,  8.48it/s]
 46%|████▌     | 33/72 [00:05<00:04,  8.44it/s]
 47%|████▋     | 34/72 [00:05<00:04,  8.40it/s]
 49%|████▊     | 35/72 [00:05<00:04,  8.50it/s]
 50%|█████     | 36/72 [00:05<00:04,  8.40it/s]
[36m(_objective pid=13782)[0m 
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13782)[0m 
 21%|██        | 5/24 [00:00<00:00, 38.31it/s][A
[36m(_objective pid=13782)[0m 
 38%|███▊      | 9/24 [00:00<00:00, 34.46it/s][A
[36m(_objective pid=13782)[0m 
 54%|█████▍    | 13/24 [00:00<00:00, 33.61it/s][A
[36m(_objective pid=13782)[0m 
 71%|███████   | 17/24 [00:00<00:00, 33.48it/s][A
[36m(_objective pid=13782)[0m 
 88%|████████▊ | 21/24 [00:00<00:00, 33.24it/s][A



Trial status: 1 TERMINATED | 1 RUNNING
Current time: 2025-01-29 16:54:54. Total running time: 2min 0s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_7bcd7_00001   RUNNING          1.56207e-05                    2   7.08379                       16                                                                                     

[36m(_objective pid=13782)[0m                                                
[36m(_objective pid=13782)[0m                                                [A 50%|█████     | 36/72 [00:06<00:04,  8.40it/s]
[36m(_objective pid=13782)[0m 100%|██████████| 24/24 [00:01<00:00, 33.24it/s][A
[36m(_objective pid=13782)[0m                                                [A
[36m(_objective pid=13782)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00001_1_learning_rate=0.0000,num_train_epochs=2,per_device_train_batch_size=16,seed=7.0838_2025-01-29_16-52-54/checkpoint_000000)



Trial _objective_7bcd7_00001 finished iteration 1 at 2025-01-29 16:55:04. Total running time: 2min 10s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00001 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                  23.6726 |
| time_total_s                                      23.6726 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_accuracy                                      0.2963 |
| eval_f1                                           0.01039 |
| eval_f1_weighted                                  0.13545 |
| eval_loss                                         3.92494 |
| eval_runtime                                       1.3183 |
| eval_samples_per_second                           143.372 |
| eval_steps_per_second     

[36m(_objective pid=13782)[0m  51%|█████▏    | 37/72 [00:19<02:32,  4.37s/it]
 53%|█████▎    | 38/72 [00:20<01:45,  3.10s/it]
 54%|█████▍    | 39/72 [00:20<01:12,  2.20s/it]
 56%|█████▌    | 40/72 [00:20<00:50,  1.58s/it]
 57%|█████▋    | 41/72 [00:20<00:35,  1.14s/it]
 58%|█████▊    | 42/72 [00:20<00:25,  1.20it/s]
 60%|█████▉    | 43/72 [00:20<00:17,  1.62it/s]
 61%|██████    | 44/72 [00:20<00:13,  2.13it/s]
 62%|██████▎   | 45/72 [00:20<00:09,  2.75it/s]
 64%|██████▍   | 46/72 [00:21<00:07,  3.45it/s]
 65%|██████▌   | 47/72 [00:21<00:05,  4.21it/s]
 67%|██████▋   | 48/72 [00:21<00:04,  4.96it/s]
 68%|██████▊   | 49/72 [00:21<00:04,  5.67it/s]
 69%|██████▉   | 50/72 [00:21<00:03,  6.30it/s]
 71%|███████   | 51/72 [00:21<00:03,  6.84it/s]
 72%|███████▏  | 52/72 [00:21<00:02,  7.27it/s]
 74%|███████▎  | 53/72 [00:21<00:02,  7.59it/s]
 75%|███████▌  | 54/72 [00:21<00:02,  7.84it/s]
 76%|███████▋  | 55/72 [00:22<00:02,  8.03it/s]
 78%|███████▊  | 56/72 [00:22<00:01,  8.16it/s]
 79%|██

[36m(_objective pid=13782)[0m {'loss': 3.8239, 'grad_norm': 5.3583245277404785, 'learning_rate': 3.023360066252875e-06, 'epoch': 1.67}


[36m(_objective pid=13782)[0m  86%|████████▌ | 62/72 [00:22<00:01,  8.47it/s]
 88%|████████▊ | 63/72 [00:23<00:01,  8.49it/s]
 89%|████████▉ | 64/72 [00:23<00:00,  8.48it/s]
 90%|█████████ | 65/72 [00:23<00:00,  8.46it/s]
 92%|█████████▏| 66/72 [00:23<00:00,  8.48it/s]
 93%|█████████▎| 67/72 [00:23<00:00,  8.49it/s]
 94%|█████████▍| 68/72 [00:23<00:00,  8.51it/s]
 96%|█████████▌| 69/72 [00:23<00:00,  8.52it/s]
 97%|█████████▋| 70/72 [00:23<00:00,  8.52it/s]
 99%|█████████▊| 71/72 [00:23<00:00,  8.57it/s]
[36m(_objective pid=13782)[0m 
  0%|          | 0/24 [00:00<?, ?it/s][A
[36m(_objective pid=13782)[0m 
 21%|██        | 5/24 [00:00<00:00, 41.15it/s][A
[36m(_objective pid=13782)[0m 
 42%|████▏     | 10/24 [00:00<00:00, 35.80it/s][A
[36m(_objective pid=13782)[0m 
 58%|█████▊    | 14/24 [00:00<00:00, 34.95it/s][A
[36m(_objective pid=13782)[0m 
 75%|███████▌  | 18/24 [00:00<00:00, 33.26it/s][A
[36m(_objective pid=13782)[0m 
 92%|█████████▏| 22/24 [00:00<00:00, 33.44it

[36m(_objective pid=13782)[0m {'eval_loss': 3.440234422683716, 'eval_accuracy': 0.2962962962962963, 'eval_f1': 0.01038961038961039, 'eval_f1_weighted': 0.13544973544973543, 'eval_runtime': 1.7172, 'eval_samples_per_second': 110.063, 'eval_steps_per_second': 13.976, 'epoch': 2.0}

Trial _objective_7bcd7_00001 finished iteration 2 at 2025-01-29 16:55:23. Total running time: 2min 28s
+-----------------------------------------------------------+
| Trial _objective_7bcd7_00001 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000001 |
| time_this_iter_s                                 18.63797 |
| time_total_s                                     42.31058 |
| training_iteration                                      2 |
| epoch                                                  2. |
| eval_accuracy                                      0.2963 |
| eval_f1                                           0.01

[36m(_objective pid=13782)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/_objective_2025-01-29_16-52-54/_objective_7bcd7_00001_1_learning_rate=0.0000,num_train_epochs=2,per_device_train_batch_size=16,seed=7.0838_2025-01-29_16-52-54/checkpoint_000001)



Trial status: 1 TERMINATED | 1 RUNNING
Current time: 2025-01-29 16:55:24. Total running time: 2min 30s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_7bcd7_00001   RUNNING          1.56207e-05                    2   7.08379                       16        2            42.3106       3.44023          0.296296   0.0103896            

[36m(_objective pid=13782)[0m                                                100%|██████████| 72/72 [00:39<00:00,  8.57it/s]100%|██████████| 72/72 [00:39<00:00,  1.80it/s]
2025-01-29 16:55:27,940	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/_objective_2025-01-29_16-52-54' in 0.0058s.



Trial _objective_7bcd7_00001 completed after 2 iterations at 2025-01-29 16:55:27. Total running time: 2min 33s
[36m(_objective pid=13782)[0m {'train_runtime': 40.6792, 'train_samples_per_second': 27.778, 'train_steps_per_second': 1.77, 'train_loss': 4.057867897881402, 'epoch': 2.0}

Trial status: 2 TERMINATED
Current time: 2025-01-29 16:55:27. Total running time: 2min 33s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         learning_rate     num_train_epochs      seed     ..._train_batch_size     iter     total time (s)     eval_loss     eval_accuracy     eval_f1     eval_f1_weighted |
+-------------------------------------------------------------------------------------------------------------------------------

In [19]:
# Train and evaluate the model with the best hyperparameters
for n, v in best_run.hyperparameters.items():
    # Check the expected type of the attribute
    if n in ['seed']:  # Add more attribute names and their types as needed
        expected_type = int
    elif n in ['learning_rate', 'weight_decay']:
        expected_type = float
    else:
        expected_type = type(getattr(trainer_hp.args, n))  # Use the current type if not specified

    # Convert the value to the expected type
    try:
        v = expected_type(v)
    except (ValueError, TypeError):
        print(f"Warning: Could not convert hyperparameter '{n}' to type '{expected_type.__name__}'. Using the original value.")

    # Set the attribute with the converted value
    setattr(trainer_hp.args, n, v)

trainer_hp.train()

# Evaluate the model
### Evaluate the model
results = trainer_hp.evaluate(eval_dataset=encoded_test_dataset)
print("Evaluation Results on Test Set:", results)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.466244,0.0,0.0,0.0
2,No log,4.303544,0.005291,0.001633,0.010159
3,No log,4.165348,0.296296,0.010327,0.137698
4,4.361600,4.065954,0.296296,0.01039,0.13545
5,4.361600,4.029177,0.296296,0.01039,0.13545


Evaluation Results on Test Set: {'eval_loss': 4.103791236877441, 'eval_accuracy': 0.24867724867724866, 'eval_f1': 0.009052388289676425, 'eval_f1_weighted': 0.09904941260873465, 'eval_runtime': 1.4772, 'eval_samples_per_second': 127.944, 'eval_steps_per_second': 16.247, 'epoch': 5.0}


In [27]:
from sklearn.model_selection import KFold
import numpy as np

# Perform cross-validation on the previously trained model
def cross_validate_model(trainer, dataset, k=5):
    """
    Perform k-fold cross-validation on the given dataset using the provided trainer.

    Args:
        trainer (Trainer): The Hugging Face Trainer object.
        dataset (Dataset): The dataset to perform cross-validation on.
        k (int): The number of folds for cross-validation.

    Returns:
        dict: A dictionary containing the average metrics across all folds.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = []

    for train_index, val_index in kf.split(dataset):
        train_split = dataset.select(train_index)
        val_split = dataset.select(val_index)

        # Tokenize the data
        encoded_train_split = train_split.map(preprocess_function, batched=True)
        encoded_val_split = val_split.map(preprocess_function, batched=True)

        # Update the trainer's datasets
        trainer.train_dataset = encoded_train_split
        trainer.eval_dataset = encoded_val_split

        # Train the model
        trainer.train()

        # Evaluate the model
        eval_metrics = trainer.evaluate()
        metrics.append(eval_metrics)

    # Compute average metrics
    avg_metrics = {key: np.mean([metric[key] for metric in metrics]) for key in metrics[0].keys()}
    return avg_metrics

# Perform cross-validation
cv_results = cross_validate_model(trainer_hp, load_dataset("json", data_files=output_filename)['train'], k=3)
print("Cross-validation results:", cv_results)


Map:   0%|          | 0/628 [00:00<?, ? examples/s]

Map:   0%|          | 0/315 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.423577,0.006349,0.001032,0.001681
2,No log,4.286264,0.146032,0.006981,0.07538
3,4.332800,4.148175,0.215873,0.006456,0.076655
4,4.332800,4.060606,0.215873,0.006456,0.076655
5,4.332800,4.029868,0.215873,0.006456,0.076655


Map:   0%|          | 0/629 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.452911,0.003185,0.000115,2.1e-05
2,No log,4.29417,0.015924,0.001553,0.022666
3,4.359600,4.144004,0.248408,0.007106,0.100123
4,4.359600,4.044344,0.248408,0.007106,0.100123
5,4.359600,4.009496,0.251592,0.007179,0.101149


Map:   0%|          | 0/629 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Weighted
1,No log,4.432452,0.0,0.0,0.0
2,No log,4.240319,0.076433,0.006378,0.100958
3,4.343300,4.079532,0.308917,0.008322,0.146528
4,4.343300,3.975415,0.308917,0.008281,0.145815
5,4.343300,3.938385,0.308917,0.008281,0.145815


Cross-validation results: {'eval_loss': 3.9925831158955893, 'eval_accuracy': 0.2587941900043811, 'eval_f1': 0.007305485926664103, 'eval_f1_weighted': 0.10787288936147073, 'eval_runtime': 1.9527, 'eval_samples_per_second': 162.1113333333333, 'eval_steps_per_second': 20.631666666666668, 'epoch': 5.0}
