##b. Clone or update competition repository

After cloning, under MyDrive, you will see afrisenti-semeval-2023 folder with all the the data for the afrisenti shared task (training and dev)

# Installation of Librararies


In [1]:
!pip install  transformers==4.45.2
!pip install evaluate

Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
Successfully installed transformers-4.45.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully in

##c. Install required libraries

- Set the project dire
ctory in the cell below, where the requirements file should also be located, and run the cell

# Training setup

##a. Set project parameters

For a list of models that be used for fine-tuning, you can check [HERE](https://huggingface.co/models).

In [2]:
import os
import shutil

directory_path = "NustTitans"

# Check if the directory exists, delete it if it does, and then recreate it
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)  # This deletes the directory and its contents

# Now, create the directory again
os.mkdir("NustTitans")
print(f"Directory '{directory_path}' has been recreated.")

# Create an empty file called "empty_file.txt"
file_name = "NustTitans/run_textclass.py"

# Open the file in write mode ("w") — this creates the file if it doesn't exist
with open(file_name, "w") as file:
    # Write some Python code to the file
    pass

Directory 'NustTitans' has been recreated.


In [8]:
%%writefile /kaggle/working/NustTitans/adaptive_pretraining.py
#!/usr/bin/env python
# coding=utf-8
import transformers, torch
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import pandas as pd
from datasets import load_dataset
import evaluate
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    AutoModelForMaskedLM,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Features, Value, ClassLabel, Dataset
from transformers import DataCollatorForLanguageModeling

# Ensure required version of datasets library
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)

@dataclass
class DataTrainingArguments:
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated."
        }
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={"help": "Whether to pad all samples to `max_seq_length`."}
    )

    domain_corpus: Optional[str] = field(
        default=None, metadata={"help": "Path to domain-specific corpus for adaptive pre-training"}
    )


@dataclass
class ModelArguments:

    model_name_or_path: str = field(default=None, metadata={"help": "Path to pretrained model or model identifier"})
    config_name: Optional[str] = field(default=None, metadata={"help": "Pretrained config name or path"})
    tokenizer_name: Optional[str] = field(default=None, metadata={"help": "Pretrained tokenizer name or path"})
    data_dir: Optional[str] = field(default=None, metadata={"help": "Path to dataset"})
    cache_dir: Optional[str] = field(default=None, metadata={"help": "Directory to store the pretrained models"})
    do_lower_case: Optional[bool] = field(default=False, metadata={"help": "Lowercase tokenizer"})
    use_fast_tokenizer: bool = field(default=True, metadata={"help": "Use fast tokenizer"})
    model_revision: str = field(default="main", metadata={"help": "Model version"})
    use_auth_token: bool = field(default=False, metadata={"help": "Use auth token for private models"})
    ignore_mismatched_sizes: bool = field(default=False, metadata={"help": "Enable loading model with mismatched head sizes"})
    epochs: Optional[int] = field(default=3, metadata={"help": "Number of training epochs"})
    lr: Optional[float] = field(default=2e-5, metadata={"help": "Learning rate for training"})


def main():
    # Parse arguments from command line
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    logger.setLevel(training_args.get_process_log_level())
    datasets.utils.logging.set_verbosity(training_args.get_process_log_level())
    transformers.utils.logging.set_verbosity(training_args.get_process_log_level())

    logger.info(f"Training/evaluation parameters {training_args}")

    # Handle last checkpoint detection
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint:
            logger.info(f"Checkpoint detected, resuming from {last_checkpoint}.")
        elif len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(f"Output directory ({training_args.output_dir}) is not empty.")

    # Set seed for reproducibility
    set_seed(training_args.seed)

    num_labels = 6

    # Model configuration
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path, use_fast=model_args.use_fast_tokenizer)
    model = AutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path)

    def preprocess_function(examples):
        label_list = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
        texts = (examples['text'],)
        result = tokenizer(*texts, padding="max_length", max_length=data_args.max_seq_length, truncation=True)
        if "label" in examples:
            result["label"] = [label_list.index(l) for l in examples["label"]]
        return result

    if data_args.domain_corpus:
        # Metric setup
        metric = evaluate.load("accuracy")

        def compute_metrics(p: EvalPrediction):
            preds = np.argmax(p.predictions, axis=1)
            return metric.compute(predictions=preds, references=p.label_ids)

        # Load your domain-specific corpus
        domain_corpus = load_dataset(data_args.domain_corpus)  # Use the `domain_corpus` argument

        # Tokenization for MLM
        def preprocess_function(examples):
            result = tokenizer(examples['text'], padding=True, truncation=True, max_length=data_args.max_seq_length)
            return result

        # Split domain_corpus into 'train' and 'test'
        domain_corpus = domain_corpus.remove_columns("label")
        domain_corpus = domain_corpus["train"].train_test_split(test_size=0.9)
        train_dataset = domain_corpus.map(preprocess_function, batched=True)["train"]
        eval_dataset =  domain_corpus.map(preprocess_function, batched=True)["test"]

        # Setup Data Collator for MLM
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=0.15,  # 15% of the tokens will be masked during training
        )

        # Fine-tuning the model on domain-specific corpus (Adaptive Pre-Training)
        domain_train_args = TrainingArguments(
            output_dir=training_args.output_dir,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            num_train_epochs=model_args.epochs,  # Using command-line argument for epochs
            save_steps=1,
            logging_steps=1,
            evaluation_strategy="steps",
            logging_dir="./logs",
            save_total_limit=1,
            disable_tqdm=True,
            report_to="none",
            save_strategy="no",
            learning_rate=model_args.lr,  # Using command-line argument for learning rate
        )

        trainer = Trainer(
            model=model,
            args=domain_train_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        # Start training
        train_result = trainer.train()
        print("Done adaptive pre-training!")
        metrics = train_result.metrics
        trainer.save_model(training_args.output_dir)  # Saves the tokenizer too for easy upload



if __name__ == "__main__":
    main()


Overwriting /kaggle/working/NustTitans/adaptive_pretraining.py


In [None]:
!python /kaggle/working/NustTitans/adaptive_pretraining.py \
--model_name_or_path 'Davlan/afro-xlmr-large' \
--domain_corpus "DGurgurov/igbo_sa" \
--lr 5e-6 \
--epochs 1 \
--output_dir '/kaggle/working/output/'

Map: 100%|████████████████████████| 5116/5116 [00:00<00:00, 11567.86 examples/s]
{'loss': 3.5439, 'grad_norm': 90.11295318603516, 'learning_rate': 4.929577464788733e-06, 'epoch': 0.014084507042253521}


In [6]:
%%writefile /kaggle/working/NustTitans/fine_tuning.py
#!/usr/bin/env python
# coding=utf-8
import transformers,torch
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import pandas as pd
from datasets import load_dataset
import evaluate
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Features, Value, ClassLabel, Dataset

os.environ['WANDB_DISABLED'] = 'true'

# Ensure required version of datasets library
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)

import re
import string

def preprocess_text(text):
    """
    Preprocess the input text by removing emojis, punctuation, extra spaces, etc.
    """

    # Remove emojis using regex (Unicode characters for emojis)
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"
                               "\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF"
                               "\U0001F700-\U0001F77F"
                               "\U0001F780-\U0001F7FF"
                               "\U0001F800-\U0001F8FF"
                               "\U0001F900-\U0001F9FF"
                               "\U0001FA00-\U0001FA6F"
                               "\U0001FA70-\U0001FAFF"
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = re.sub(emoji_pattern, '', text)  # Remove emojis

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove mentions (e.g., @username)
    text = re.sub(r"@\w+", '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace (e.g., multiple spaces, newlines)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


@dataclass
class DataTrainingArguments:
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated."
        }
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={"help": "Whether to pad all samples to `max_seq_length`."}
    )
    max_train_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of training examples."}
    )
    max_eval_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of evaluation examples."}
    )
    max_predict_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of prediction examples."}
    )
@dataclass
class ModelArguments:

    model_name_or_path: str = field(default=None, metadata={"help": "Path to pretrained model or model identifier"})
    config_name: Optional[str] = field(default=None, metadata={"help": "Pretrained config name or path"})
    tokenizer_name: Optional[str] = field(default=None, metadata={"help": "Pretrained tokenizer name or path"})
    data_dir: Optional[str] = field(default=None, metadata={"help": "Path to dataset"})
    cache_dir: Optional[str] = field(default=None, metadata={"help": "Directory to store the finetuned models"})
    do_lower_case: Optional[bool] = field(default=False, metadata={"help": "Lowercase tokenizer"})
    use_fast_tokenizer: bool = field(default=True, metadata={"help": "Use fast tokenizer"})
    model_revision: str = field(default="main", metadata={"help": "Model version"})
    use_auth_token: bool = field(default=False, metadata={"help": "Use auth token for private models"})
    ignore_mismatched_sizes: bool = field(default=False, metadata={"help": "Enable loading model with mismatched head sizes"})
    lr: Optional[float] = field(default=2e-5, metadata={"help": "Learning rate for training"})
    epochs: Optional[int] = field(default=3, metadata={"help": "Number of training epochs"})
def main():
    # Parse arguments from command line
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    logger.setLevel(training_args.get_process_log_level())
    datasets.utils.logging.set_verbosity(training_args.get_process_log_level())
    transformers.utils.logging.set_verbosity(training_args.get_process_log_level())

    logger.info(f"Training/evaluation parameters {training_args}")

    # Handle last checkpoint detection
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint:
            logger.info(f"Checkpoint detected, resuming from {last_checkpoint}.")
        elif len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(f"Output directory ({training_args.output_dir}) is not empty.")

    # Set seed for reproducibility
    set_seed(training_args.seed)
    num_labels = 6
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path, use_fast=model_args.use_fast_tokenizer)
    # Load the fine-tuning dataset (for example, a sentiment analysis dataset)
    df = pd.read_csv(model_args.data_dir)  # Replace with your fine-tuning data
    # Convert labels to float32 for multi-label classification
    df['labels'] = df[['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']].astype('float32').values.tolist()
 # Multi-label column
    df['text'] = df['text'].apply(preprocess_text)  # Apply text preprocessing

    # Convert dataset into Hugging Face Dataset format
    from datasets import Dataset
    fine_tuning_dataset = Dataset.from_pandas(df)
    # Ensure labels are of float32 type
    fine_tuning_dataset = fine_tuning_dataset.map(
        lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)},
        batched=True
    )

    # Preprocessing function for fine-tuning (same tokenizer)
    def fine_tune_preprocess(examples):
        return tokenizer(examples['text'], padding=True, truncation=True, max_length=128)

    # Tokenize the fine-tuning dataset
    fine_tuning_dataset = fine_tuning_dataset.map(fine_tune_preprocess, batched=True)

    # Convert to PyTorch format (input_ids, attention_mask, labels)
    #fine_tuning_dataset = fine_tuning_dataset.rename_column("label", "labels")
    fine_tuning_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer)
    # Load the model for fine-tuning
    model_for_fine_tuning = AutoModelForSequenceClassification.from_pretrained(
        num_labels=6,  # For multi-label classification (number of labels)
        problem_type="multi_label_classification",  # Multi-label classification setup
        pretrained_model_name_or_path = model_args.model_name_or_path,

    )

    # Fine-tuning training arguments
    fine_tuning_args = TrainingArguments(
        output_dir=training_args.output_dir,
        evaluation_strategy="epoch",
        learning_rate=model_args.lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=model_args.epochs,
        weight_decay=0.01,
        save_strategy="no",

        
    )

    # Set up Trainer for fine-tuning
    trainer_fine_tune = Trainer(
        model=model_for_fine_tuning,
        args=fine_tuning_args,
        train_dataset=fine_tuning_dataset,
        eval_dataset=fine_tuning_dataset,  # You can use separate validation set
        data_collator = data_collator,
    )

    # Start fine-tuning the model
    trainer_results = trainer_fine_tune.train()
    metrics = trainer_results.metrics
    max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(fine_tuning_dataset))
    metrics["train_samples"] = min(max_train_samples, len(fine_tuning_dataset))

    trainer_fine_tune.save_model(training_args.output_dir)  # Saves the tokenizer too for easy upload
    tokenizer.save_pretrained(training_args.output_dir)
    trainer_fine_tune.log_metrics("train", metrics)
    trainer_fine_tune.save_metrics("train", metrics)
    trainer_fine_tune.save_state()



if __name__ == "__main__":
    main()


Writing /kaggle/working/NustTitans/fine_tuning.py


In [None]:
%bash !/usr/bin/env python
# coding=utf-8
import transformers,torch
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import pandas as pd
from datasets import load_dataset
import evaluate
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Features, Value, ClassLabel, Dataset

os.environ['WANDB_DISABLED'] = 'true'

# Ensure required version of datasets library
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)

import re
import string

def preprocess_text(text):
    """
    Preprocess the input text by removing emojis, punctuation, extra spaces, etc.
    """

    # Remove emojis using regex (Unicode characters for emojis)
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"
                               "\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF"
                               "\U0001F700-\U0001F77F"
                               "\U0001F780-\U0001F7FF"
                               "\U0001F800-\U0001F8FF"
                               "\U0001F900-\U0001F9FF"
                               "\U0001FA00-\U0001FA6F"
                               "\U0001FA70-\U0001FAFF"
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = re.sub(emoji_pattern, '', text)  # Remove emojis

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove mentions (e.g., @username)
    text = re.sub(r"@\w+", '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace (e.g., multiple spaces, newlines)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


@dataclass
class DataTrainingArguments:
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated."
        }
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={"help": "Whether to pad all samples to `max_seq_length`."}
    )
    max_train_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of training examples."}
    )
    max_eval_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of evaluation examples."}
    )
    max_predict_samples: Optional[int] = field(
        default=None, metadata={"help": "For debugging purposes, truncate the number of prediction examples."}
    )
@dataclass
class ModelArguments:

    model_name_or_path: str = field(default=None, metadata={"help": "Path to pretrained model or model identifier"})
    config_name: Optional[str] = field(default=None, metadata={"help": "Pretrained config name or path"})
    tokenizer_name: Optional[str] = field(default=None, metadata={"help": "Pretrained tokenizer name or path"})
    data_dir: Optional[str] = field(default=None, metadata={"help": "Path to dataset"})
    cache_dir: Optional[str] = field(default=None, metadata={"help": "Directory to store the finetuned models"})
    do_lower_case: Optional[bool] = field(default=False, metadata={"help": "Lowercase tokenizer"})
    use_fast_tokenizer: bool = field(default=True, metadata={"help": "Use fast tokenizer"})
    model_revision: str = field(default="main", metadata={"help": "Model version"})
    use_auth_token: bool = field(default=False, metadata={"help": "Use auth token for private models"})
    ignore_mismatched_sizes: bool = field(default=False, metadata={"help": "Enable loading model with mismatched head sizes"})
    lr: Optional[float] = field(default=2e-5, metadata={"help": "Learning rate for training"})
    epochs: Optional[int] = field(default=3, metadata={"help": "Number of training epochs"})
def main():
    # Parse arguments from command line
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    logger.setLevel(training_args.get_process_log_level())
    datasets.utils.logging.set_verbosity(training_args.get_process_log_level())
    transformers.utils.logging.set_verbosity(training_args.get_process_log_level())

    logger.info(f"Training/evaluation parameters {training_args}")

    # Handle last checkpoint detection
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint:
            logger.info(f"Checkpoint detected, resuming from {last_checkpoint}.")
        elif len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(f"Output directory ({training_args.output_dir}) is not empty.")

    # Set seed for reproducibility
    set_seed(training_args.seed)
    num_labels = 6
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path, use_fast=model_args.use_fast_tokenizer)
    # Load the fine-tuning dataset (for example, a sentiment analysis dataset)
    df = pd.read_csv(model_args.data_dir)  # Replace with your fine-tuning data
    # Convert labels to float32 for multi-label classification
    df['labels'] = df[['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']].astype('float32').values.tolist()
 # Multi-label column
    df['text'] = df['text'].apply(preprocess_text)  # Apply text preprocessing

    # Convert dataset into Hugging Face Dataset format
    from datasets import Dataset
    fine_tuning_dataset = Dataset.from_pandas(df)
    # Ensure labels are of float32 type
    fine_tuning_dataset = fine_tuning_dataset.map(
        lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)},
        batched=True
    )

    # Preprocessing function for fine-tuning (same tokenizer)
    def fine_tune_preprocess(examples):
        return tokenizer(examples['text'], padding=True, truncation=True, max_length=128)

    # Tokenize the fine-tuning dataset
    fine_tuning_dataset = fine_tuning_dataset.map(fine_tune_preprocess, batched=True)

    # Convert to PyTorch format (input_ids, attention_mask, labels)
    #fine_tuning_dataset = fine_tuning_dataset.rename_column("label", "labels")
    fine_tuning_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorWithPadding(tokenizer)
    # Load the model for fine-tuning
    model_for_fine_tuning = AutoModelForSequenceClassification.from_pretrained(
        num_labels=6,  # For multi-label classification (number of labels)
        problem_type="multi_label_classification",  # Multi-label classification setup
        pretrained_model_name_or_path = model_args.model_name_or_path,

    )

    # Fine-tuning training arguments
    fine_tuning_args = TrainingArguments(
        output_dir=training_args.output_dir,
        evaluation_strategy="epoch",
        learning_rate=model_args.lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=model_args.epochs,
        weight_decay=0.01,
        save_strategy="no",

        
    )

    # Set up Trainer for fine-tuning
    trainer_fine_tune = Trainer(
        model=model_for_fine_tuning,
        args=fine_tuning_args,
        train_dataset=fine_tuning_dataset,
        eval_dataset=fine_tuning_dataset,  # You can use separate validation set
        data_collator = data_collator,
    )

    # Start fine-tuning the model
    trainer_results = trainer_fine_tune.train()
    metrics = trainer_results.metrics
    max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(fine_tuning_dataset))
    metrics["train_samples"] = min(max_train_samples, len(fine_tuning_dataset))

    trainer_fine_tune.save_model(training_args.output_dir)  # Saves the tokenizer too for easy upload
    tokenizer.save_pretrained(training_args.output_dir)
    trainer_fine_tune.log_metrics("train", metrics)
    trainer_fine_tune.save_metrics("train", metrics)
    trainer_fine_tune.save_state()



if __name__ == "__main__":
    main()


In [None]:
#Fine_tuning
!python /kaggle/working/NustTitans/fine_tuning.py \
  --model_name_or_path '/kaggle/working/output'  \
  --output_dir '/kaggle/working/finetuning1' \
  --data_dir '/kaggle/input/final-dataset-semeval2025/public_data_test/track_a/train/amh.csv' \
  --do_train \
  --per_device_train_batch_size 8 \
  --lr 5e-6 \
  --epochs 20 \
  --max_seq_length 128 \
  --save_steps -10                                                       

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Map: 100%|████████████████████████| 3549/3549 [00:00<00:00, 97081.38 examples/s]
Map: 100%|█████████████████████████| 3549/3549 [00:00<00:00, 9095.65 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/output and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  5%|██                                      | 444/8880 [03:06<54:

In [23]:
import re
import string
def preprocess_text(text):
    """
    Preprocess the input text by removing emojis, punctuation, extra spaces, etc.
    """
    
    # Remove emojis using regex (Unicode characters for emojis)
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"
                               "\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF"
                               "\U0001F700-\U0001F77F"
                               "\U0001F780-\U0001F7FF"
                               "\U0001F800-\U0001F8FF"
                               "\U0001F900-\U0001F9FF"
                               "\U0001FA00-\U0001FA6F"
                               "\U0001FA70-\U0001FAFF"
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = re.sub(emoji_pattern, '', text)  # Remove emojis
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    
    # Remove mentions (e.g., @username)
    text = re.sub(r"@\w+", '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace (e.g., multiple spaces, newlines)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


As you may observe, the training loss is very large. As a start, you can tune the training parameters and model to get a competitive result.

You can observe also, there is no validation metrics (e.g., accuracy, loss etc) since we are only training without validtaion

# Prediction on Eval dataset

1. F1-score (macro)
2. F1-score (micro)

Evaluation based in emotions seperetaley

In [28]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset
import pandas as pd

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/finetuning1")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuning1/")

# If you're predicting on a test set, load it (example below)
test_data = pd.read_csv('/kaggle/input/final-dataset-semeval2025/public_data_test/track_a/dev/amh.csv')  # Replace with your test data path
test_data['text'] = test_data['text'].apply(preprocess_text)  # Apply the same preprocessing as for training

# Convert to Hugging Face Dataset format
test_dataset = Dataset.from_pandas(test_data)

# Tokenize the test dataset
def predict_preprocess(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, max_length=128)

test_dataset = test_dataset.map(predict_preprocess, batched=True)

# Set format for PyTorch
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Define Trainer for prediction (no need for a training loop)
trainer = Trainer(
    model=model,  # The fine-tuned model
    args=TrainingArguments(output_dir='/kaggle/working/finetuning', do_train=False, do_eval=False),  # Empty args for prediction
    tokenizer=tokenizer
)

# Make predictions
predictions, labels, _ = trainer.predict(test_dataset)

# Get the predicted labels (for multi-label classification, we can threshold predictions)
predicted_labels = (torch.sigmoid(torch.tensor(predictions)) > .35).int()  # For multi-label, use sigmoid activation and threshold

# Print out or process the predictions
print(predicted_labels)


Map:   0%|          | 0/592 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


tensor([[1, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ...,
        [0, 1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0]], dtype=torch.int32)


In [29]:
from sklearn.metrics import f1_score

# 11. Get the true labels for all classes and convert to 0 or 1
true_labels = test_data[["anger", "disgust", "fear", "joy", "sadness", "surprise"]].fillna(0).applymap(lambda x: 1 if x > 0 else 0)
print(len(true_labels),len(predicted_labels))
# 12. Calculate the F1 score for each class
f1_scores = {}
for idx, label in enumerate(true_labels.columns):
    f1 = f1_score(true_labels[label], predicted_labels[:, idx].numpy(), average='binary')
    f1_scores[label] = f1

# 13. Print F1 scores for each label
for label, score in f1_scores.items():
    print(f"F1 Score for {label}: {score}")

# Micro-average F1 score: Flatten both true labels and predicted labels
micro_f1 = f1_score(true_labels.values.flatten(), predicted_labels.flatten(), average='micro')

# Macro-average F1 score: F1 score per label, then take the average
macro_f1 = f1_score(true_labels.values, predicted_labels, average='micro')

# 13. Print out the results
print(f"Micro-average F1 Score: {micro_f1}")
print(f"Macro-average F1 Score: {macro_f1}")

592 592
F1 Score for anger: 0.6981981981981982
F1 Score for disgust: 0.7874720357941835
F1 Score for fear: 0.5116279069767442
F1 Score for joy: 0.6989247311827957
F1 Score for sadness: 0.6923076923076923
F1 Score for surprise: 0.627450980392157
Micro-average F1 Score: 0.8859797297297297
Macro-average F1 Score: 0.7169811320754718


  true_labels = test_data[["anger", "disgust", "fear", "joy", "sadness", "surprise"]].fillna(0).applymap(lambda x: 1 if x > 0 else 0)


# Prediction on test Dataset

In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset
os.environ['WANDB_DISABLED'] = 'true'
# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/finetuning1")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuning1")

# If you're predicting on a test set, load it (example below)
test_data = pd.read_csv('/kaggle/input/final-dataset-semeval2025/public_data_test/track_a/dev/amh.csv')  # Replace with your test data path
test_data['text'] = test_data['text'].apply(preprocess_text)  # Apply the same preprocessing as for training

# Convert to Hugging Face Dataset format
test_dataset = Dataset.from_pandas(test_data)

# Tokenize the test dataset
def predict_preprocess(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, max_length=128)

test_dataset = test_dataset.map(predict_preprocess, batched=True)

# Set format for PyTorch
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Define Trainer for prediction (no need for a training loop)
trainer = Trainer(
    model=model,  # The fine-tuned model
    args=TrainingArguments(output_dir='/tmp', do_train=False, do_eval=False),  # Empty args for prediction
    tokenizer=tokenizer
)

# Make predictions
predictions, labels, _ = trainer.predict(test_dataset)

# Get the predicted labels (for multi-label classification, we can threshold predictions)
predicted_labels = (torch.sigmoid(torch.tensor(predictions)) > 0.35).int()  # For multi-label, use sigmoid activation and threshold

# Print out or process the predictions
print(predicted_labels)


Map:   0%|          | 0/592 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


tensor([[1, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ...,
        [0, 1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0]], dtype=torch.int32)


In [38]:
# 1. Convert predicted labels to a DataFrame
predicted_labels_df = pd.DataFrame(predictions, columns=["anger", "disgust", "fear", "joy", "sadness", "surprise"])

# 2. Create a DataFrame that includes the original IDs and the predicted labels
output_df = test_data[['id']].copy()  # Assuming 'id' column exists in the test data
output_df = output_df.join(predicted_labels_df)

# 3. Save the DataFrame to a CSV file
output_df.to_csv('/kaggle/working/test_amh.csv', index=False)

print("Predicted labels have been saved to predicted_labels.csv")

Predicted labels have been saved to predicted_labels.csv


# Push Code to Github

In [None]:
!apt-get install git


In [None]:
!git config --global user.name "mhm930"
!git config --global user.email "mhmkhan80@gmail.com"


In [None]:
!git init


In [None]:
!git remote add origin https://github.com/mhm930/NustTitans.git


In [None]:
!git add .


In [None]:
!git commit -m "Add my code from Kaggle"


In [None]:
!git config --global user.name "mhm930"
!git config --global user.password "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"


In [None]:
!git push origin master mhm930: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


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

In [None]:
!git push https://mhm930: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@github.com/mhm930/NustTitans.git
