In [1]:
import json
import wandb
# Authenticate with Weights & Biases and open the run used by this notebook.
wandb.login()
wandb.init(project='hinenglish', entity='pallavikailas')

# Decode the file as UTF-8 explicitly; without `encoding`, open() falls back to
# the locale's preferred encoding, which can differ between machines.
with open('/kaggle/input/task-1-semeval/MaSaC_train_erc.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

for batch in data:
    # Each pass rebinds these four names, so once the loop finishes they hold
    # only the final record's fields (a missing key yields None).
    episode = batch.get('episode', None)
    speakers = batch.get('speakers', None)
    utterances = batch.get('utterances', None)
    emotions = batch.get('emotions', None)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpallavikailas[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import json
from datasets import Dataset
from transformers import BertTokenizer

# Load the JSON data. The encoding is given explicitly so decoding does not
# depend on the locale's default.
with open('/kaggle/input/task-1-semeval/MaSaC_train_erc.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the per-item 'utterances' / 'emotions' lists into two parallel lists.
utterances = []
emotions = []
for item_index, item in enumerate(data):
    item_utterances = item['utterances']
    item_emotions = item['emotions']
    # The two lists are paired by position. A length mismatch inside one item
    # would shift every later label onto the wrong utterance, so fail loudly.
    if len(item_utterances) != len(item_emotions):
        raise ValueError(
            f"Item {item_index}: {len(item_utterances)} utterances "
            f"but {len(item_emotions)} emotions"
        )
    utterances.extend(item_utterances)
    emotions.extend(item_emotions)

# Mapping from emotion name to the integer class id used for training.
emotion_to_label = {
    'neutral': 0,
    'joy': 1,
    'contempt': 2,
    'anger': 3,
    'surprise': 4,
    'fear': 5,
    'disgust': 6,
    'sadness': 7
}

# Report every emotion missing from the mapping at once, instead of stopping at
# the first one with a bare KeyError.
unknown_emotions = sorted({repr(emotion) for emotion in emotions if emotion not in emotion_to_label})
if unknown_emotions:
    raise KeyError(f"Emotions missing from emotion_to_label: {', '.join(unknown_emotions)}")

# Apply the mapping to the emotions data
labels = [emotion_to_label[emotion] for emotion in emotions]

# `utterances` and `labels` become the two columns of the dataset
dataset_dict = {
    'utterances': utterances,
    'labels': labels
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(dataset_dict)

# Tokenizer used by tokenize_and_format
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_format(examples):
    """Tokenize the 'utterances' column and carry the 'labels' column along.

    Args:
        examples: Mapping with an 'utterances' entry (passed to the tokenizer)
            and a 'labels' entry (copied into the result unchanged).

    Returns:
        The tokenizer's output with an added 'labels' entry.
    """
    encoded = tokenizer(
        examples['utterances'],
        truncation=True,        # cut inputs longer than max_length
        padding='max_length',   # pad shorter inputs up to max_length
        max_length=128,
    )
    encoded['labels'] = examples['labels']
    return encoded

# Tokenize and format the dataset.
# tokenize_and_format is applied through `map` with batched=True, so it is
# expected to receive several rows per call — see the datasets `map` docs.
tokenized_datasets = dataset.map(tokenize_and_format, batched=True)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [3]:
from sklearn.metrics import accuracy_score
import numpy as np

def calculate_accuracy(preds, labels):
    """Return the accuracy of model outputs against the true labels.

    Args:
        preds: Array-like model outputs. A 2-D input with more than one column
            is reduced with argmax over axis 1; a 2-D input with exactly one
            column is thresholded at 0; any other shape is used as-is.
        labels: Array-like of true class ids, aligned with ``preds``.

    Returns:
        Whatever ``accuracy_score(labels, preds)`` returns for the reduced
        predictions.
    """
    # Plain lists/tuples have no `.ndim` / `.shape`; normalise to an ndarray first.
    preds = np.asarray(preds)

    if preds.ndim > 1 and preds.shape[1] > 1:
        # Several scores per row: the arg-max is the predicted class. No softmax
        # is needed because it would not change which column is largest.
        preds = np.argmax(preds, axis=1)
    elif preds.ndim > 1 and preds.shape[1] == 1:
        # One score per row: positive means class 1, otherwise class 0.
        preds = np.squeeze((preds > 0).astype(int), axis=1)

    # Calculate the accuracy comparing against the true labels
    return accuracy_score(labels, preds)



In [4]:
# Hold out 10% of the rows for validation. A fixed seed makes the split
# identical across kernel restarts, so runs can be compared.
train_val_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
# Pick the splits by name rather than relying on the order of `.values()`.
train_dataset, val_dataset = train_val_split['train'], train_val_split['test']

In [5]:
from transformers import BertForSequenceClassification

# One output class per entry in emotion_to_label
num_labels = len(emotion_to_label)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    # Store the emotion names in the model config so a saved copy can translate
    # predicted class ids back to emotions without this notebook.
    id2label={index: emotion for emotion, index in emotion_to_label.items()},
    label2id=dict(emotion_to_label),
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments, Trainer, TrainerCallback, EvalPrediction
from pytorch_lightning.callbacks import Callback
import shutil
import os
import warnings
import torch
from torch import nn

warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.parallel._functions")


def compute_metrics(p: EvalPrediction):
    """Compute accuracy from an evaluation result.

    Args:
        p: Object exposing ``predictions`` (model outputs, or a tuple whose
            first element is the model outputs) and ``label_ids`` (true labels).

    Returns:
        ``{"accuracy": <accuracy_score(labels, predicted classes)>}``.
    """
    preds = p.predictions
    labels = p.label_ids

    # `predictions` can be a tuple when the model returns more than one output;
    # a tuple has no `.ndim`. Assumes the class scores come first — confirm if
    # the model returns a custom output layout.
    if isinstance(preds, tuple):
        preds = preds[0]

    if preds.ndim > 1 and preds.shape[1] > 1:
        # Several scores per row: the arg-max is the predicted class.
        preds = np.argmax(preds, axis=1)
    elif preds.ndim > 1 and preds.shape[1] == 1:
        # One score per row: positive means class 1, otherwise class 0.
        preds = np.squeeze((preds > 0).astype(int), axis=1)

    # Calculate the accuracy comparing against the true labels
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}


# Modify the WandbValidationCallback to include accuracy
class WandbValidationCallback(TrainerCallback):
    """
    A custom callback that logs the validation loss and accuracy to wandb after each evaluation.
    """
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log ``loss`` and ``accuracy`` to wandb at the current global step.

        Nothing is logged when ``metrics`` is None.
        """
        if metrics is None:
            return

        # The evaluation metrics shown by this notebook are prefixed with
        # "eval_" (e.g. 'eval_accuracy'), so look that key up first. The bare
        # 'accuracy' key is kept as a fallback for unprefixed metric dicts.
        accuracy = metrics.get('eval_accuracy', metrics.get('accuracy', 0))
        wandb.log({"loss": metrics['eval_loss'], "accuracy": accuracy}, step=state.global_step)

class SaveBestModelCallback(TrainerCallback):
    """
    A custom callback that, when training ends, saves the model's state_dict into the
    output directory and removes the intermediate checkpoint directories.

    It saves whatever weights the model holds at that moment. With
    load_best_model_at_end=True these are presumably the best checkpoint's
    weights — confirm for the installed transformers version.
    """
    def on_train_end(self, args, state, control, **kwargs):
        """Save ``pytorch_model.bin`` to ``args.output_dir`` and delete ``checkpoint*`` directories.

        Only runs when ``state.is_local_process_zero`` is true.
        """
        if not state.is_local_process_zero:
            return

        # Prefer the model handed to the callback over the notebook-global
        # `model`, so the callback does not depend on hidden kernel state.
        # Fall back to the global when no model is passed.
        trained_model = kwargs.get('model')
        if trained_model is None:
            trained_model = model

        torch.save(trained_model.state_dict(), os.path.join(args.output_dir, 'pytorch_model.bin'))

        # Clean up the checkpoint directories. Plain files whose names merely
        # start with "checkpoint" are skipped: rmtree would raise on them.
        for name in os.listdir(args.output_dir):
            path = os.path.join(args.output_dir, name)
            if name.startswith("checkpoint") and os.path.isdir(path):
                shutil.rmtree(path)
            
            
training_args = TrainingArguments(
    output_dir='./results',
    # A checkpoint is written every `save_steps` steps (10 here). Cap how many
    # are kept so the output directory does not grow with every save. The
    # checkpoint needed by load_best_model_at_end is expected to be retained on
    # top of this limit — see the TrainingArguments docs.
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and save on the same 10-step schedule, so each evaluation has a
    # matching checkpoint.
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    # Select the best checkpoint by the "accuracy" value from compute_metrics
    # (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="wandb",
    run_name="run1"
)

# Wire the model, training arguments, both data splits, the metric function
# and the two custom callbacks into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[WandbValidationCallback(), SaveBestModelCallback()],
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned value is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
10,2.2007,2.184168,0.059929
20,2.1365,2.07374,0.085781
30,2.0484,1.978463,0.309048
40,1.9673,1.889552,0.444183
50,1.8547,1.829509,0.477086
60,1.8497,1.773251,0.480611
70,1.7343,1.711656,0.480611
80,1.7291,1.668067,0.481786
90,1.6484,1.641875,0.482961
100,1.6847,1.631017,0.486486


TrainOutput(global_step=2400, training_loss=0.8485129996885856, metrics={'train_runtime': 2676.4372, 'train_samples_per_second': 28.601, 'train_steps_per_second': 0.897, 'total_flos': 5035559080857600.0, 'train_loss': 0.8485129996885856, 'epoch': 10.0})

In [7]:
# Evaluate the trainer's current model on the validation split; the returned
# metrics dict is displayed as the cell output.
trainer.evaluate(eval_dataset=val_dataset)

{'eval_loss': 1.4030647277832031,
 'eval_accuracy': 0.5346650998824912,
 'eval_runtime': 3.7396,
 'eval_samples_per_second': 227.564,
 'eval_steps_per_second': 1.872,
 'epoch': 10.0}

In [8]:
# Write the model and the tokenizer files into ./results, the same directory
# that is passed as `output_dir` to TrainingArguments.
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json')