In [None]:
import wandb

wandb.login()

# setup wandb environment variables
%env WANDB_ENTITY=meghanadh27/btp_sa
%env WANDB_PROJECT=finetune_distilbert_iemocap_text

In [1]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

dataset = load_dataset("Zahra99/IEMOCAP_Text")
dataset = concatenate_datasets([dataset["session1"], dataset["session2"], dataset["session3"], dataset["session4"], dataset["session5"]])

Found cached dataset parquet (/Users/meghanadhpulivarthi/.cache/huggingface/datasets/Zahra99___parquet/Zahra99--IEMOCAP_Text-96d9699f03987401/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
dataset.features["label"] 

ClassLabel(names=['ang', 'hap', 'neu', 'sad'], id=None)

In [3]:
# 90% train, 10% test + validation
train_test_dataset = dataset.train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_dataset['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'].select([0, 1, 2, 3, 4]),
    'validation': test_valid['train'],
    'test': test_valid['test']})

In [4]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 553
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 554
    })
})

In [5]:
train_test_valid_dataset["train"][0]

{'text': "Penny slots. That's what he plays.", 'label': 2}

In [6]:
id2label_fn = train_test_valid_dataset["train"].features["label"].int2str
id2label_fn(train_test_valid_dataset["train"][0]["label"])

'neu'

In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
   return tokenizer(examples["text"], truncation=True)
    
tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/553 [00:00<?, ? examples/s]

Map:   0%|          | 0/554 [00:00<?, ? examples/s]

In [8]:
import evaluate
import numpy as np
from datasets import load_metric

def compute_metrics(eval_preds):
#     metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels, average="weighted")
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
  
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [9]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
!huggingface-cli whoami

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


meghanadh


In [11]:
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(tokenized_datasets["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}
id2label["2"]

'neu'

In [12]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [13]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    "finetune_bert_iemocap_text", 
    evaluation_strategy="epoch", 
    num_train_epochs=10,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=40,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # use_mps_device=True,
    overwrite_output_dir=True,
    push_to_hub=True
)

checkpoint = "/Users/meghanadhpulivarthi/Desktop/BTProject/Notebooks/iemocap_text_only/finetune_bert_iemocap_text/checkpoint-2212"
num_labels = len(id2label)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, 
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3 )],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.400554,0.179024,0.103222
2,No log,1.411433,0.200723,0.119306
3,No log,1.428031,0.18264,0.107722
4,No log,1.445064,0.189873,0.109053
5,No log,1.485121,0.173285,0.089996


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

KeyboardInterrupt: 

In [14]:
trainer.evaluate()

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01112462546653761, max=1.0)…

{'eval_loss': 0.49315324425697327,
 'eval_accuracy': 0.8752260397830018,
 'eval_f1': 0.8755214168723158,
 'eval_runtime': 12.3909,
 'eval_samples_per_second': 44.629,
 'eval_steps_per_second': 5.649}

In [15]:
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.4672025144100189,
 'eval_accuracy': 0.868231046931408,
 'eval_f1': 0.8691378208703979,
 'eval_runtime': 9.9975,
 'eval_samples_per_second': 55.414,
 'eval_steps_per_second': 7.002}

In [28]:
test_preds = trainer.predict(tokenized_datasets["test"])


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [29]:
preds = np.argmax(test_preds.predictions, axis=-1)
preds

array([3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 0, 0, 3, 3, 3, 0, 3, 0, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3,
       3, 0, 3, 3, 0, 3, 3, 3, 0, 0, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 3, 3,
       0, 0, 3, 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 0,
       3, 0, 3, 3, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
       0, 3, 0, 3, 0, 0, 0, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 0, 3, 0, 3, 3, 3, 3,
       0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3,
       3, 3, 3, 3, 3, 0, 0, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 0,
       3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 0, 3, 3, 0, 0, 0, 3, 3, 3, 0, 3, 3,
       3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3,
       3, 3, 3, 0, 0, 0, 3, 0, 3, 3, 3, 0, 3, 3, 3,

In [46]:
id2label_fn(test_preds.label_ids)

['hap',
 'neu',
 'sad',
 'hap',
 'sad',
 'neu',
 'neu',
 'hap',
 'sad',
 'neu',
 'hap',
 'neu',
 'ang',
 'sad',
 'neu',
 'sad',
 'ang',
 'neu',
 'hap',
 'hap',
 'sad',
 'hap',
 'ang',
 'sad',
 'sad',
 'sad',
 'ang',
 'sad',
 'ang',
 'neu',
 'hap',
 'hap',
 'neu',
 'hap',
 'hap',
 'neu',
 'neu',
 'neu',
 'hap',
 'hap',
 'neu',
 'neu',
 'ang',
 'neu',
 'neu',
 'ang',
 'neu',
 'neu',
 'hap',
 'sad',
 'neu',
 'sad',
 'neu',
 'neu',
 'sad',
 'hap',
 'ang',
 'neu',
 'sad',
 'neu',
 'ang',
 'neu',
 'hap',
 'ang',
 'hap',
 'neu',
 'sad',
 'sad',
 'hap',
 'hap',
 'ang',
 'hap',
 'neu',
 'sad',
 'neu',
 'neu',
 'hap',
 'neu',
 'sad',
 'neu',
 'sad',
 'neu',
 'neu',
 'ang',
 'sad',
 'hap',
 'ang',
 'hap',
 'neu',
 'ang',
 'hap',
 'sad',
 'ang',
 'sad',
 'neu',
 'sad',
 'neu',
 'ang',
 'neu',
 'ang',
 'ang',
 'neu',
 'neu',
 'hap',
 'hap',
 'ang',
 'hap',
 'hap',
 'hap',
 'neu',
 'hap',
 'sad',
 'neu',
 'hap',
 'hap',
 'hap',
 'neu',
 'hap',
 'neu',
 'sad',
 'hap',
 'sad',
 'sad',
 'hap',
 'neu',


In [41]:
import pandas as pd
df = pd.DataFrame(list(zip(tokenized_datasets["test"]["text"], preds)))
df.head()

Unnamed: 0,0,1
0,Thank you,3
1,Uh huh. I didn't come here get in yelling matc...,0
2,I got this idea watching them go down. Everyth...,0
3,Charles. That was his name. He did wriggle so ...,3
4,I can't be that strong. A whole year.,3


In [16]:
trainer.push_to_hub()

.DS_Store:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1710916704.Meghanadhs-MacBook-Pro-2.local.98982.0:   0%|          | 0.00/690 [00:00<?, ?B/…

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/meghanadh/finetune_bert_iemocap_text/commit/107a0fe11f551ee8b721b1d81e8d49c103ba8eff', commit_message='End of training', commit_description='', oid='107a0fe11f551ee8b721b1d81e8d49c103ba8eff', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
# Use a pipeline as a high-level helper
from transformers import pipeline

hub_model = pipeline("text-classification", model="meghanadh/finetune_bert_iemocap_text")

config.json:   0%|          | 0.00/783 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

In [23]:
tokenized_datasets["test"][0]["inp"]

KeyError: 'inp'

In [33]:
dd = hub_model(tokenized_datasets["test"]["text"])

In [34]:
dd

[{'label': 'ang', 'score': 0.2622804045677185},
 {'label': 'ang', 'score': 0.27050065994262695},
 {'label': 'ang', 'score': 0.27191266417503357},
 {'label': 'ang', 'score': 0.26439133286476135},
 {'label': 'ang', 'score': 0.2613983750343323},
 {'label': 'sad', 'score': 0.2626360058784485},
 {'label': 'ang', 'score': 0.2643706798553467},
 {'label': 'ang', 'score': 0.26567575335502625},
 {'label': 'ang', 'score': 0.2673676908016205},
 {'label': 'ang', 'score': 0.2669679820537567},
 {'label': 'ang', 'score': 0.26372230052948},
 {'label': 'ang', 'score': 0.2647041976451874},
 {'label': 'ang', 'score': 0.26163816452026367},
 {'label': 'ang', 'score': 0.2686363756656647},
 {'label': 'ang', 'score': 0.27395957708358765},
 {'label': 'ang', 'score': 0.2717868387699127},
 {'label': 'ang', 'score': 0.2636277675628662},
 {'label': 'ang', 'score': 0.26871436834335327},
 {'label': 'ang', 'score': 0.2615692913532257},
 {'label': 'sad', 'score': 0.2707075774669647},
 {'label': 'ang', 'score': 0.258087

In [43]:
tokenized_datasets["test"]["label"]

[1,
 2,
 3,
 1,
 3,
 2,
 2,
 1,
 3,
 2,
 1,
 2,
 0,
 3,
 2,
 3,
 0,
 2,
 1,
 1,
 3,
 1,
 0,
 3,
 3,
 3,
 0,
 3,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 2,
 0,
 2,
 2,
 1,
 3,
 2,
 3,
 2,
 2,
 3,
 1,
 0,
 2,
 3,
 2,
 0,
 2,
 1,
 0,
 1,
 2,
 3,
 3,
 1,
 1,
 0,
 1,
 2,
 3,
 2,
 2,
 1,
 2,
 3,
 2,
 3,
 2,
 2,
 0,
 3,
 1,
 0,
 1,
 2,
 0,
 1,
 3,
 0,
 3,
 2,
 3,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 3,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 3,
 1,
 3,
 3,
 1,
 2,
 0,
 2,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 3,
 1,
 0,
 3,
 2,
 0,
 2,
 1,
 1,
 3,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 1,
 3,
 3,
 2,
 1,
 0,
 2,
 0,
 1,
 1,
 2,
 2,
 3,
 1,
 1,
 3,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 2,
 3,
 0,
 3,
 0,
 2,
 2,
 3,
 1,
 1,
 2,
 0,
 3,
 0,
 2,
 2,
 2,
 1,
 3,
 3,
 1,
 0,
 2,
 3,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 3,
 1,
 0,
 2,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 1,
 3,
 3,
 2,
 2,
 2,
 1,
 1,
 0,
 1,
 3,
 2,
 2,
 2,
 1,
 3,
 3,
 3,
 0,
 1,
 1,
 1,
 3,
 2,
 0,
 2,
 1,
 0,
 2,


In [42]:
import pandas as pd
df = pd.DataFrame.from_records(dd)
print(df)

    label     score
0     ang  0.262280
1     ang  0.270501
2     ang  0.271913
3     ang  0.264391
4     ang  0.261398
..    ...       ...
549   sad  0.256456
550   sad  0.275943
551   ang  0.276991
552   ang  0.267923
553   sad  0.263829

[554 rows x 2 columns]


In [None]:
hub_model(tokenized_datasets["test"][0]["text"])[0]["label"]

In [None]:
for i in range(277):
    actual_emo = id2label_fn(tokenized_datasets['test'][i]['label'])
    pred_emo = id2label_fn(hub_model(tokenized_datasets["test"][i]["text"])[0]["label"])
    print(f"Text: {tokenized_datasets['test'][i]['text']}\nActual Label: {actual_emo}\nPredicted Label: {pred_emo}\n")
    