In [None]:
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio
%pip install pydantic
%pip install transformers[torch]
%pip install accelerate -U
%pip install scikit-learn
%pip install evaluate
%pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m

### Parse the movie script, and save it in json format

In [9]:
import json
import sys
from  collections import Counter

sys.path.append('..')

from src.data.parse_movie_script import MovieScriptParser

# Parse the dataset in structured format and output a json compatible to parse with Hugging face datasets library.
# The output will be a json file with the following structure:
# { "text": ..., "label": }
parsed_object = MovieScriptParser.from_text_file("../data/raw/5thelement.txt")

# Print the stats for the parsed script
print("Script basic stats:")
print(json.dumps(parsed_object.stats, indent=2))

# Print counts per class
class_map = parsed_object.get_vocabulary_to_label_mapping()
class_counter = Counter()
for scene in parsed_object.scenes:
    for entry in scene.entries:
        if entry.dialogue is not None:
            class_counter[entry.dialogue.character] += 1

print("Class counts (character speeches):")
for key, value in class_counter.most_common():
    print(f"{key} with class id {class_map[key]}: {class_counter[key]} ({class_counter[key] / parsed_object.stats['total_dialogues']})")

# Save the parsed script in json format
parsed_object.save_character_dialogue_dataset_in_json_format("../data/parsed/5thelement.json")

# Print a few lines of the saved file to see the format
with open("../data/parsed/5thelement.json", "r") as f:
    for i in range(5):
        print(f.readline().strip())

Script basic stats:
{
  "total_words": 21849,
  "total_dialogues": 943,
  "total_words_in_dialogues": 9457,
  "total_characters": 83,
  "total_scenes": 281
}
Class counts (character speeches):
KORBEN with class id 41: 244 (0.25874867444326616)
CORNELIUS with class id 21: 99 (0.10498409331919406)
ZORG with class id 82: 68 (0.07211028632025451)
LOC RHOD with class id 44: 52 (0.05514316012725345)
PRESIDENT with class id 59: 48 (0.05090137857900318)
LEELOO with class id 43: 48 (0.05090137857900318)
MUNRO with class id 55: 36 (0.03817603393425239)
PROFESSOR with class id 65: 24 (0.02545068928950159)
MACTILBURGH with class id 45: 16 (0.016967126193001062)
STAEDERT with class id 72: 15 (0.015906680805938492)
CAPTAIN with class id 8: 15 (0.015906680805938492)
RIGHT ARM with class id 66: 15 (0.015906680805938492)
PRIEST with class id 61: 14 (0.014846235418875928)
FINGER (V.O.) with class id 29: 13 (0.013785790031813362)
DAVID with class id 22: 12 (0.012725344644750796)
VOICE (O.S.) with class i

### Create a basic traind and test split of the dataset

In [22]:
from datasets import load_dataset

def split_dataset_from_json(json_file: str, test_size: float = 0.2):
    """
    Load a dataset from a json file and split it into train and test sets by using datasets library from Hugging Face.

    Args:
        json_file (str): The path to the json file containing the dataset.
        test_size (float): The proportion of the dataset to include in the test split.
    """
    # Load dataset from local json file 
    dataset = load_dataset("json", data_files=json_file)

    # Split the dataset into a simple train, validation and test sets
    train_test_split = dataset['train'].train_test_split(test_size=0.2)
    train_dataset = train_test_split['train']
    test_dataset = train_test_split['test']

    print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}")

    return train_dataset, test_dataset

train_dataset, test_dataset = split_dataset_from_json("../data/parsed/5thelement.json")

Train Size: 754, Test Size: 189


### Train a first exploratory Hugging face Transformer model with typical params and do basic evaluation

In [13]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

### Train model
model_name = "microsoft/deberta-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use <EOS> as <PAD>

# Tokenize the data
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["text"],  # raw text to be tokenized
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    tokenized["labels"] = examples["label"]  # add the labels to the tokenized input
    return tokenized

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Sanity check: print a few examples of the encoded dataset
print("Encoded Train Dataset:")
print(encoded_train_dataset["input_ids"][:2])
print(encoded_train_dataset["attention_mask"][:2])
print(encoded_train_dataset["labels"][:2])

# Load the model
num_labels = len(parsed_object.character_vocabulary)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

# Add a padding token if not present
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    logging_steps=30,
    warmup_steps=10,  # learning rate will be gradually increased during the first 10 steps
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")

    accuracy = metric_acc.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="macro")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
    }


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Map: 100%|██████████| 189/189 [00:00<00:00, 8727.84 examples/s]


Encoded Train Dataset:
[[1, 1116, 768, 4, 1437, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1640, 282, 13533, 154, 43, 2649, 4, 1437, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                               
 50%|█████     | 24/48 [00:21<00:16,  1.46it/s]

{'eval_loss': 4.14119291305542, 'eval_accuracy': 0.24338624338624337, 'eval_f1': 0.009321175278622088, 'eval_runtime': 3.9445, 'eval_samples_per_second': 47.914, 'eval_steps_per_second': 6.084, 'epoch': 1.0}


 62%|██████▎   | 30/48 [00:33<00:26,  1.45s/it]

{'loss': 4.2792, 'grad_norm': 4.887906074523926, 'learning_rate': 4.736842105263158e-06, 'epoch': 1.25}


                                               
100%|██████████| 48/48 [00:55<00:00,  1.45it/s]

{'eval_loss': 3.865752935409546, 'eval_accuracy': 0.24338624338624337, 'eval_f1': 0.009321175278622088, 'eval_runtime': 4.9899, 'eval_samples_per_second': 37.876, 'eval_steps_per_second': 4.81, 'epoch': 2.0}


100%|██████████| 48/48 [00:58<00:00,  1.22s/it]


{'train_runtime': 58.6032, 'train_samples_per_second': 25.732, 'train_steps_per_second': 0.819, 'train_loss': 4.1466875076293945, 'epoch': 2.0}


100%|██████████| 24/24 [00:03<00:00,  6.33it/s]


Evaluation Results on Test Set: {'eval_loss': 3.865752935409546, 'eval_accuracy': 0.24338624338624337, 'eval_f1': 0.009321175278622088, 'eval_runtime': 3.9302, 'eval_samples_per_second': 48.089, 'eval_steps_per_second': 6.107, 'epoch': 2.0}





### Compute confusion matrix and additional metrics per class

In [16]:
# Compute predictions on test set to compute additional metrics
def get_predictions_and_labels(test_dataset, trainer):
    import numpy as np

    # Predict on the evaluation dataset
    predictions = trainer.predict(test_dataset)

    # Extract predictions and true labels
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids
    return y_pred, y_true


def compute_precision_recall_per_class(y_pred, y_true, num_labels):

    from sklearn.metrics import confusion_matrix
    import numpy as np

    def compute_precision_recall(confusion_matrix):
        """
        Compute precision and recall for each class from a confusion matrix.
        
        :param confusion_matrix: 2D array, where rows are actual classes
                                and columns are predicted classes.
        :return: Dictionary with precision and recall for each class.
        """
        num_classes = confusion_matrix.shape[0]
        metrics = []
        
        for i in range(num_classes):
            # True Positives (diagonal element)
            TP = confusion_matrix[i, i]
            
            # False Positives (sum of column i, excluding TP)
            FP = np.sum(confusion_matrix[:, i]) - TP
            
            # False Negatives (sum of row i, excluding TP)
            FN = np.sum(confusion_matrix[i, :]) - TP
            
            # Precision and Recall
            precision = float(TP / (TP + FP)) if (TP + FP) > 0 else 0
            recall = float(TP / (TP + FN)) if (TP + FN) > 0 else 0
            
            metrics.append({'Class': i, 'Precision': precision, 'Recall': recall})
        
        return metrics

    print("# predictions in test set: {}".format(len(y_pred)))
    print("Predictions on test set:")
    print(y_pred)
    print("True labels:")
    print(y_true)

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=range(num_labels))

    # Compute precision and recall per class
    metrics_per_class = compute_precision_recall(cm)
    metrics_per_class = sorted(metrics_per_class, key=lambda x: x['Recall'], reverse=True)
    
    print("Precision and recall per class:")
    print(metrics_per_class)

y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

100%|██████████| 24/24 [00:03<00:00,  6.50it/s]

# predictions in test set: 189
Predictions on test set:
[41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41]
True labels:
[41 34 22 45  0 21 41 80 56 66 82 41 41 65 76 41 41 82 82 59 44 55 41 55
 21  1 41 73 66 71 59 61 82 21 72 68 41 59 55 64 21 66 21 43 41 59 24 76
 41 55 41 41 41 41  7 77 41 21 59 41 21 41 41 44  4 43 43 82 59 43 21 82
 80 55 68 82 41 79 21 41 43 21 82 59 41 73 44 41 41 40 41 29 61 21 41 80
 33 39 41 56 55 21 72 29 43 66 65 41 68 41 21 24 41 53 82 59 72




### Train and evaluate on oversampled dataset (oversample each class proportionally to max class size)

In [None]:
import random
from datasets import Dataset

def oversample_dataset(dataset: Dataset, class_count_threshold=0):
    """
    Oversamples the dataset to balance the class distribution. Ignore classes with count less or equal to class_count_threshold.
    Args:
        dataset (Dataset): A dataset object containing examples with a "label" field.
        class_count_threshold (int): The minimum count of a class to be considered for oversampling.
    Returns:
        Dataset: A new dataset object with balanced class distribution by oversampling the minority classes.
    """
    class_counts = Counter(dataset["label"])

    max_count = max(class_counts.values())
    examples_by_class = {label: [] for label in class_counts}
    
    for example in dataset:
        examples_by_class[example["label"]].append(example)
    
    balanced_examples = []
    for _, examples in examples_by_class.items():
        if len(examples) > class_count_threshold:
            balanced_examples.extend(random.choices(examples, k=max_count))
        else:
            balanced_examples.extend(examples)
    
    random.shuffle(balanced_examples)
    return Dataset.from_list(balanced_examples)

train_dataset_oversampled = oversample_dataset(train_dataset, class_count_threshold=2)

print("Training dataset size:", len(train_dataset))
print("Oversampled training dataset size:", len(train_dataset_oversampled))

encoded_train_dataset_oversampled = train_dataset_oversampled.map(preprocess_function, batched=True)

# Reset model weights to previous state
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset_oversampled,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the model
y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

Training dataset size: 754
Oversampled training dataset size: 7691


Map: 100%|██████████| 7691/7691 [00:00<00:00, 15532.54 examples/s]
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 16%|█▌        | 104/648 [16:37<07:33,  1.20it/s]

{'loss': 4.3557, 'grad_norm': 3.7209956645965576, 'learning_rate': 9.576271186440679e-06, 'epoch': 0.12}


                                                 
 16%|█▌        | 104/648 [17:00<07:33,  1.20it/s]

{'loss': 4.1152, 'grad_norm': 3.9727783203125, 'learning_rate': 8.940677966101694e-06, 'epoch': 0.25}


                                                 
 16%|█▌        | 104/648 [17:23<07:33,  1.20it/s]

{'loss': 3.9421, 'grad_norm': 4.489749908447266, 'learning_rate': 8.305084745762712e-06, 'epoch': 0.37}


                                                 
 16%|█▌        | 104/648 [17:48<07:33,  1.20it/s]

{'loss': 3.7317, 'grad_norm': 5.299050807952881, 'learning_rate': 7.66949152542373e-06, 'epoch': 0.5}


                                                 
 16%|█▌        | 104/648 [18:11<07:33,  1.20it/s]

{'loss': 3.5218, 'grad_norm': 4.993688106536865, 'learning_rate': 7.033898305084746e-06, 'epoch': 0.62}


                                                 
 16%|█▌        | 104/648 [18:37<07:33,  1.20it/s]

{'loss': 3.3312, 'grad_norm': 5.515802383422852, 'learning_rate': 6.398305084745763e-06, 'epoch': 0.75}


                                                 
 16%|█▌        | 104/648 [19:02<07:33,  1.20it/s]

{'loss': 3.1153, 'grad_norm': 5.577577590942383, 'learning_rate': 5.7627118644067805e-06, 'epoch': 0.87}


                                                 
 16%|█▌        | 104/648 [19:25<07:33,  1.20it/s]

{'loss': 2.9275, 'grad_norm': 6.456362724304199, 'learning_rate': 5.127118644067796e-06, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                              

 16%|█▌        | 104/648 [19:36<07:33,  1.20it/s]
[A
[A

{'eval_loss': 3.532827138900757, 'eval_accuracy': 0.2222222222222222, 'eval_f1': 0.24983444955427714, 'eval_runtime': 7.1677, 'eval_samples_per_second': 26.368, 'eval_steps_per_second': 3.348, 'epoch': 1.0}


                                                 
 16%|█▌        | 104/648 [20:17<07:33,  1.20it/s]

{'loss': 2.7448, 'grad_norm': 6.159979820251465, 'learning_rate': 4.491525423728814e-06, 'epoch': 1.12}


                                                 
 16%|█▌        | 104/648 [20:56<07:33,  1.20it/s]

{'loss': 2.6538, 'grad_norm': 5.831851482391357, 'learning_rate': 3.8559322033898315e-06, 'epoch': 1.24}


                                                 
 16%|█▌        | 104/648 [21:23<07:33,  1.20it/s]

{'loss': 2.527, 'grad_norm': 5.727634429931641, 'learning_rate': 3.2203389830508473e-06, 'epoch': 1.37}


                                                 
 16%|█▌        | 104/648 [21:53<07:33,  1.20it/s]

{'loss': 2.4035, 'grad_norm': 6.117085933685303, 'learning_rate': 2.5847457627118645e-06, 'epoch': 1.49}


                                                 
 16%|█▌        | 104/648 [22:19<07:33,  1.20it/s]

{'loss': 2.2741, 'grad_norm': 6.15022611618042, 'learning_rate': 1.9491525423728816e-06, 'epoch': 1.62}


                                                 
 16%|█▌        | 104/648 [22:49<07:33,  1.20it/s]

{'loss': 2.2912, 'grad_norm': 6.010318279266357, 'learning_rate': 1.3135593220338985e-06, 'epoch': 1.74}


                                                 
 16%|█▌        | 104/648 [23:15<07:33,  1.20it/s]

{'loss': 2.204, 'grad_norm': 5.977491855621338, 'learning_rate': 6.779661016949153e-07, 'epoch': 1.87}


                                                 
 16%|█▌        | 104/648 [23:44<07:33,  1.20it/s]

{'loss': 2.2232, 'grad_norm': 6.491508960723877, 'learning_rate': 4.237288135593221e-08, 'epoch': 1.99}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                              

 16%|█▌        | 104/648 [23:58<07:33,  1.20it/s]
[A
[A

{'eval_loss': 3.227496385574341, 'eval_accuracy': 0.26455026455026454, 'eval_f1': 0.31789557959770726, 'eval_runtime': 6.9893, 'eval_samples_per_second': 27.041, 'eval_steps_per_second': 3.434, 'epoch': 2.0}


                                                 
100%|██████████| 482/482 [07:55<00:00,  1.01it/s]


{'train_runtime': 475.0976, 'train_samples_per_second': 32.377, 'train_steps_per_second': 1.015, 'train_loss': 3.0197754875752936, 'epoch': 2.0}


100%|██████████| 24/24 [00:04<00:00,  5.77it/s]

# predictions in test set: 189
Predictions on test set:
[53 34 22 45  0 65 53 80 56 10 24 44 24 65 76 57 44 20 53 76 53 10 53 53
 68  1 34 73 10 24 24 61 24 44 15 68 24 76 24 10  2 76 40 24 56 53 24 76
 54 68 24 10 10 15 57 24 53 34 34  1 10 65 53 24 72 54 24 24 54 34 73 24
 80 53 68 44  1 79 24 10 34 73 10 73 29 73 34 15 34 14 10 29 61 24 10 80
 69 10 53 56 53 44 15 29 73 10 65 24 68 56 24 24 44 53 53 10 72 57 45 45
 40 24 24 14 29 68 57 22 17 54 24 44 10 17 34 44 53 57 34 65 15 57 53 65
 10 53 44 24 24 68 34 12 40 34 53 10 24 10 34 15 14 34 22  8 53 22 53 34
 73 22 20 61 24 34 57 24 61 53 44 44 45 10 72 53 24 24 55 57 65]
True labels:
[41 34 22 45  0 21 41 80 56 66 82 41 41 65 76 41 41 82 82 59 44 55 41 55
 21  1 41 73 66 71 59 61 82 21 72 68 41 59 55 64 21 66 21 43 41 59 24 76
 41 55 41 41 41 41  7 77 41 21 59 41 21 41 41 44  4 43 43 82 59 43 21 82
 80 55 68 82 41 79 21 41 43 21 82 59 41 73 44 41 41 40 41 29 61 21 41 80
 33 39 41 56 55 21 72 29 43 66 65 41 68 41 21 24 41 53 82 59 72




### Use weighted cross-entropy loss to simulate oversampling with less computational effort

In [None]:
import torch
import torch.nn as nn
import numpy as np

# Reset model weights to previous state
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

def compute_class_weights(dataset: Dataset, num_labels: int, min_count=0, top_k=None):
    """
    Compute class weights for
    Args:
        dataset (Dataset): A dataset object containing examples with a "label" field.
        num_labels (int): The number of classes in the dataset.
        min_count (int): The minimum count of a class to be considered for computing class weights.
    Returns:
        dict: A dictionary with class weights for each class compared to the class with maximum count.
    """
    class_counts = Counter(dataset["label"])

    if top_k is not None:
        class_counts = dict(class_counts.most_common(top_k))

    max_count = max(class_counts.values())
    class_weights = {label: np.power(max_count / count, 0.5) for label, count in class_counts.items()}
    class_weights_list = [1.0] * (num_labels)

    for label, weight in class_weights.items():
        if class_counts[label] > min_count:
            class_weights_list[label] = weight
            
    return class_weights_list

class_weights = compute_class_weights(train_dataset, num_labels=num_labels, min_count=2)
print(class_weights)
class_weights = torch.tensor(class_weights, dtype=torch.float32)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
class_weights = class_weights.to(device)

# Define the loss function with class weights
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Define custom compute_loss_func
def compute_loss_func(outputs, labels, num_items_in_batch=None):
    logits = outputs.logits
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return loss

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
    compute_loss_func=compute_loss_func,
)

trainer.train()

# Evaluate the model
y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)
compute_precision_recall_per_class(y_pred, y_true, num_labels)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[np.float64(2.2949985871139686), np.float64(1.9979123127758747), np.float64(2.2949985871139686), 1.0, 1.0, 1.0, 1.0, 1.0, np.float64(1.7698160790574151), 1.0, np.float64(1.9372562265956328), 1.0, np.float64(2.166679504533743), 1.0, np.float64(2.2949985871139686), np.float64(1.9372562265956328), 1.0, np.float64(1.9979123127758747), 1.0, 1.0, np.float64(2.166679504533743), np.float64(1.2023861220940364), np.float64(1.7698160790574151), 1.0, np.float64(1.8862040631540171), 1.0, 1.0, 1.0, 1.0, np.float64(1.8038759709114691), 1.0, np.float64(2.072109360404684), 1.0, 1.0, np.float64(1.8422907500573902), 1.0, 1.0, 1.0, 1.0, 1.0, np.float64(2.072109360404684), np.float64(1.0), 1.0, np.float64(1.3538071871541277), np.float64(1.3352548702231246), np.float64(1.7116619756873133), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.float64(1.9979123127758747), np.float64(2.2949985871139686), np.float64(1.4578977774089315), np.float64(2.166679504533743), np.float64(2.072109360404684), 1.0, np.float64(1.4294753731

  2%|▏         | 1/48 [10:48<8:27:43, 648.17s/it]
 50%|█████     | 24/48 [00:34<00:20,  1.19it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                               

 50%|█████     | 24/48 [00:38<00:20,  1.19it/s]
[A
[A

{'eval_loss': 4.191098213195801, 'eval_accuracy': 0.24338624338624337, 'eval_f1': 0.009321175278622088, 'eval_runtime': 4.1902, 'eval_samples_per_second': 45.105, 'eval_steps_per_second': 5.728, 'epoch': 1.0}


 62%|██████▎   | 30/48 [00:53<00:29,  1.64s/it]
 62%|██████▎   | 30/48 [00:53<00:29,  1.64s/it]

{'loss': 4.3127, 'grad_norm': 4.599071979522705, 'learning_rate': 4.736842105263158e-06, 'epoch': 1.25}


100%|██████████| 48/48 [01:15<00:00,  1.26s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                               

100%|██████████| 48/48 [01:28<00:00,  1.26s/it]
[A
[A

{'eval_loss': 3.955878973007202, 'eval_accuracy': 0.24338624338624337, 'eval_f1': 0.009321175278622088, 'eval_runtime': 7.8028, 'eval_samples_per_second': 24.222, 'eval_steps_per_second': 3.076, 'epoch': 2.0}



100%|██████████| 48/48 [01:40<00:00,  2.09s/it]


{'train_runtime': 100.5158, 'train_samples_per_second': 15.003, 'train_steps_per_second': 0.478, 'train_loss': 4.2025101979573565, 'epoch': 2.0}


100%|██████████| 24/24 [00:03<00:00,  6.63it/s]

# predictions in test set: 189
Predictions on test set:
[41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41]
True labels:
[41 34 22 45  0 21 41 80 56 66 82 41 41 65 76 41 41 82 82 59 44 55 41 55
 21  1 41 73 66 71 59 61 82 21 72 68 41 59 55 64 21 66 21 43 41 59 24 76
 41 55 41 41 41 41  7 77 41 21 59 41 21 41 41 44  4 43 43 82 59 43 21 82
 80 55 68 82 41 79 21 41 43 21 82 59 41 73 44 41 41 40 41 29 61 21 41 80
 33 39 41 56 55 21 72 29 43 66 65 41 68 41 21 24 41 53 82 59 72




### Hyperparameter tunning

In [None]:
! pip install optuna
! pip install ray[tune]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sqlalchemy>=1.4.2
  Downloading SQLAlchemy-2.0.36-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting colorlog
  Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Mako
  Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:

### Split the dataset into train, validation and test sets

from datasets import load_dataset

# Load dataset from local json file 
dataset = load_dataset("json", data_files="./data/parsed/5thelement.json")

# Split the dataset into a simple train, validation and test sets
train_test_split = dataset['train'].train_test_split(test_size=0.4)

train_dataset = train_test_split['train']

temp_split_dataset = train_test_split['test'].train_test_split(test_size=0.5)
val_dataset = temp_split_dataset['train']
test_dataset = temp_split_dataset['test']

print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}, Val Size: {len(val_dataset)}")

Train Size: 565, Test Size: 189, Val Size: 189


In [58]:
# Run hyperparameter search

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)
encoded_val_dataset = val_dataset.map(preprocess_function, batched=True)

trainer_hp = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    compute_metrics=compute_metrics,
)

best_run = trainer_hp.hyperparameter_search(n_trials=2, direction="maximize")

print(best_run)

Map: 100%|██████████| 565/565 [00:00<00:00, 7022.66 examples/s]
Map: 100%|██████████| 189/189 [00:00<00:00, 8330.78 examples/s]
Map: 100%|██████████| 189/189 [00:00<00:00, 8285.85 examples/s]
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-11-30 21:54:22,213] A new study created in memory with name: no-name-5662d24d-8aac-454c-971a-0841b2ea97d4
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
    

{'loss': 4.4345, 'grad_norm': 4.090267658233643, 'learning_rate': 8.854597805338979e-06, 'epoch': 0.21}


                                                 
 25%|██▌       | 189/756 [07:18<02:22,  3.99it/s]

{'loss': 4.3045, 'grad_norm': 9.708539009094238, 'learning_rate': 6.482830536051752e-06, 'epoch': 0.42}


                                                 
 25%|██▌       | 189/756 [07:25<02:22,  3.99it/s]

{'loss': 3.7897, 'grad_norm': 10.829245567321777, 'learning_rate': 4.111063266764526e-06, 'epoch': 0.63}


                                                 
 25%|██▌       | 189/756 [07:33<02:22,  3.99it/s]

{'loss': 3.6095, 'grad_norm': 12.190399169921875, 'learning_rate': 1.7392959974772993e-06, 'epoch': 0.85}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                              

 25%|██▌       | 189/756 [07:52<02:22,  3.99it/s]
[A
[A

{'eval_loss': 3.5981497764587402, 'eval_accuracy': 0.2222222222222222, 'eval_f1': 0.008658008658008658, 'eval_runtime': 7.1919, 'eval_samples_per_second': 26.28, 'eval_steps_per_second': 3.337, 'epoch': 1.0}


                                                 
100%|██████████| 142/142 [00:57<00:00,  2.49it/s]
[I 2024-11-30 21:55:21,377] Trial 0 finished with value: 0.23088023088023088 and parameters: {'learning_rate': 1.0435775984863796e-05, 'num_train_epochs': 1, 'seed': 7, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.23088023088023088.


{'train_runtime': 57.0963, 'train_samples_per_second': 9.896, 'train_steps_per_second': 2.487, 'train_loss': 3.969445242008693, 'epoch': 1.0}


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 25%|██▌       | 189/756 [08:09<02:22,  3.99it/s]

{'loss': 4.2952, 'grad_norm': 8.204130172729492, 'learning_rate': 8.606272816685223e-06, 'epoch': 0.42}


                                                 
 25%|██▌       | 189/756 [08:20<02:22,  3.99it/s]

{'loss': 3.7404, 'grad_norm': 10.060961723327637, 'learning_rate': 6.301021169358824e-06, 'epoch': 0.85}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

 25%|██▌       | 189/756 [08:29<02:22,  3.99it/s]
[A
[A

{'eval_loss': 3.646695613861084, 'eval_accuracy': 0.2222222222222222, 'eval_f1': 0.008658008658008658, 'eval_runtime': 5.448, 'eval_samples_per_second': 34.691, 'eval_steps_per_second': 4.405, 'epoch': 1.0}


                                                 
 25%|██▌       | 189/756 [08:41<02:22,  3.99it/s]

{'loss': 3.6168, 'grad_norm': 7.278314113616943, 'learning_rate': 3.9957695220324245e-06, 'epoch': 1.27}


                                                 
 25%|██▌       | 189/756 [08:50<02:22,  3.99it/s]

{'loss': 3.4228, 'grad_norm': 7.428894519805908, 'learning_rate': 1.6905178747060257e-06, 'epoch': 1.69}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                         
 25%|██▌       | 189/756 [09:06<02:22,  3.99it/s]
[A
[A

{'eval_loss': 3.4981696605682373, 'eval_accuracy': 0.2222222222222222, 'eval_f1': 0.008658008658008658, 'eval_runtime': 5.6005, 'eval_samples_per_second': 33.747, 'eval_steps_per_second': 4.285, 'epoch': 2.0}


                                                 
100%|██████████| 142/142 [01:12<00:00,  1.95it/s]
[I 2024-11-30 21:56:35,202] Trial 1 finished with value: 0.23088023088023088 and parameters: {'learning_rate': 1.0143107248236155e-05, 'num_train_epochs': 2, 'seed': 19, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.23088023088023088.


{'train_runtime': 72.7923, 'train_samples_per_second': 15.524, 'train_steps_per_second': 1.951, 'train_loss': 3.7333725405410982, 'epoch': 2.0}
BestRun(run_id='0', objective=0.23088023088023088, hyperparameters={'learning_rate': 1.0435775984863796e-05, 'num_train_epochs': 1, 'seed': 7, 'per_device_train_batch_size': 4}, run_summary=None)


In [None]:
# Train and evaluate the model with the best hyperparameters
for n, v in best_run.hyperparameters.items():
    setattr(trainer_hp.args, n, v)

trainer_hp.train()

# Evaluate the model
### Evaluate the model
results = trainer_hp.evaluate(eval_dataset=encoded_test_dataset)
print("Evaluation Results on Test Set:", results)

y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)

compute_precision_recall_per_class(y_pred, y_true, num_labels)

                                                 
 25%|██▌       | 189/756 [09:27<02:22,  3.99it/s]

{'loss': 4.0689, 'grad_norm': 11.085065841674805, 'learning_rate': 8.88268156424581e-06, 'epoch': 0.16}


                                                 
 25%|██▌       | 189/756 [09:36<02:22,  3.99it/s]

{'loss': 3.7027, 'grad_norm': 10.091877937316895, 'learning_rate': 7.206703910614526e-06, 'epoch': 0.32}


                                                 
 25%|██▌       | 189/756 [09:46<02:22,  3.99it/s]

{'loss': 3.4752, 'grad_norm': 9.614151954650879, 'learning_rate': 5.530726256983241e-06, 'epoch': 0.48}


                                                 
 25%|██▌       | 189/756 [09:56<02:22,  3.99it/s]

{'loss': 3.5799, 'grad_norm': 9.672201156616211, 'learning_rate': 3.854748603351956e-06, 'epoch': 0.63}


                                                 
 25%|██▌       | 189/756 [10:06<02:22,  3.99it/s]

{'loss': 3.525, 'grad_norm': 9.818577766418457, 'learning_rate': 2.1787709497206706e-06, 'epoch': 0.79}


                                                 
 25%|██▌       | 189/756 [10:19<02:22,  3.99it/s]

{'loss': 3.392, 'grad_norm': 11.35380744934082, 'learning_rate': 5.027932960893855e-07, 'epoch': 0.95}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                              

 25%|██▌       | 189/756 [10:34<02:22,  3.99it/s]
[A
[A

{'eval_loss': 3.304694652557373, 'eval_accuracy': 0.2804232804232804, 'eval_f1': 0.010186430905246973, 'eval_runtime': 8.0175, 'eval_samples_per_second': 23.573, 'eval_steps_per_second': 2.993, 'epoch': 1.0}


                                                 
100%|██████████| 189/189 [01:28<00:00,  2.13it/s]


{'train_runtime': 88.6759, 'train_samples_per_second': 8.503, 'train_steps_per_second': 2.131, 'train_loss': 3.6040518523524048, 'epoch': 1.0}


100%|██████████| 24/24 [00:06<00:00,  3.99it/s]


Evaluation Results on Test Set: {'eval_loss': 3.1965441703796387, 'eval_accuracy': 0.328042328042328, 'eval_f1': 0.01073965009527109, 'eval_runtime': 6.1758, 'eval_samples_per_second': 30.603, 'eval_steps_per_second': 3.886, 'epoch': 1.0}


100%|██████████| 24/24 [00:05<00:00,  4.46it/s]

# predictions in test set: 189
Predictions on test set:
[41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41]
True labels:
[66 21 43 41 41 59 43 45 54 41 44 61 21 41 74 82 41 41 34 21  7 21 82 61
 41 41 41 59 55 41 21 26 82 41 41 77  8 41 48 41 41 78 41 66 44 41 55 80
 44 41  3 21 41 21 41 41 69 41 41 20 44 41 41 21 72 21 41 44 45 41 21 44
 55 41 72 41  1 44 44 82 66 43 41 31 44 41 22 55 41 68 22 72 41 41 21 82
 41 43 77 55 59 50 34 29 41 61 68 21 12 43 10 17 41 41 82 41 21




### Train and evaluate on oversampled dataset

In [18]:
import random
from datasets import Dataset

def oversample_dataset(dataset):
    class_counts = Counter(dataset["label"])

    print(class_counts)

    max_count = max(class_counts.values())
    examples_by_class = {label: [] for label in class_counts}
    
    for example in dataset:
        examples_by_class[example["label"]].append(example)
    
    balanced_examples = []
    for _, examples in examples_by_class.items():
        balanced_examples.extend(random.choices(examples, k=max_count))
    
    random.shuffle(balanced_examples)
    return Dataset.from_list(balanced_examples)

train_dataset, test_dataset = split_dataset_from_json("./data/parsed/5thelement.json", test_size=0.2)

train_dataset = oversample_dataset(train_dataset)

print("Extended dataset size:", len(train_dataset))

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

setattr(trainer_hp.args, "num_train_epochs", 1)

trainer.train()

# Evaluate the model
### Evaluate the model
results = trainer.evaluate(eval_dataset=encoded_test_dataset)
print("Evaluation Results on Test Set:", results)

y_pred, y_true = get_predictions_and_labels(encoded_test_dataset, trainer)

compute_precision_recall_per_class(y_pred, y_true, num_labels)

FileNotFoundError: Unable to find '/Users/mcelikik/Workspace/movie_script/notebooks/./data/parsed/5thelement.json'