In [18]:
!pip install datasets numpy transformers scikit-learn numpy pandas duckdb matplotlib itables



In [8]:
import os
from functools import partial
from itables import show
from itertools import chain
from pathlib import Path
from typing import Dict, List, Tuple

import duckdb
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)

In [9]:
# Checkpoint names to fine-tune, keyed by architecture family.
# Within each family the names run from the smallest variant to the largest.
MODELS = dict(
    bert=[
        "lyeonii/bert-tiny",
        "lyeonii/bert-small",
        "lyeonii/bert-medium",
        "google-bert/bert-base-uncased",
        "google-bert/bert-large-uncased",
    ],
    roberta=[
        "smallbenchnlp/roberta-small",
        "JackBAI/roberta-medium",
        "FacebookAI/roberta-base",
        "FacebookAI/roberta-large",
    ],
)

# Datasets to run on, keyed by a short name. Each entry is loaded when this
# cell executes, so running the cell triggers the download/cache lookup.
DATASETS = dict(
    toxic_spans=load_dataset("heegyu/toxic-spans"),
    # paired_sentiment_datasets=load_dataset("BoringAnt1793/paired_sentiment_datasets"),
)


def preprocess_function(examples, tokenizer):
    """Tokenize the ``text_of_post`` entry of ``examples``.

    The tokenizer is asked to truncate and to pad to ``max_length=256``, to
    return the attention mask, and is passed ``return_tensors=None``.

    Args:
        examples: Mapping with a ``"text_of_post"`` entry holding the text(s)
            to encode.
        tokenizer: Callable that accepts the text(s) plus the keyword options
            listed below.

    Returns:
        Whatever ``tokenizer`` returns for the given text(s).
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
        "return_attention_mask": True,
        "return_tensors": None,
    }
    posts = examples["text_of_post"]
    return tokenizer(posts, **tokenizer_options)

def preprocess_labels(examples):
    """Expose the ``toxic`` entry of ``examples`` under the key ``labels``.

    ``examples`` is modified in place and the same object is returned; the
    original ``toxic`` entry is left untouched.
    """
    toxic_values = examples["toxic"]
    examples["labels"] = toxic_values
    return examples

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [16]:
def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Unpacks into ``(predictions, labels)``. ``predictions``
            holds per-class scores with the classes along axis 1. If it is a
            tuple (a model returning several outputs), its first element is
            assumed to be those scores.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average='binary'``.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits, in which case the
    # predictions arrive as a tuple; argmax over the raw tuple would be wrong.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)

    # zero_division=0 reports the same 0.0 as the default when a metric is
    # undefined (e.g. no positive predictions early in training), but without
    # emitting an UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    accuracy = accuracy_score(labels, predicted_classes)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def _pick_supported_kwarg(target, preferred: str, fallback: str) -> str:
    """Return ``preferred`` if ``target`` accepts it as a keyword, else ``fallback``."""
    import inspect

    try:
        accepted = inspect.signature(target).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: stay with the older keyword.
        return fallback
    return preferred if preferred in accepted else fallback


def train_and_evaluate_model(model_name: str, dataset_name: str) -> dict:
    """Fine-tune one checkpoint on one dataset and persist the results.

    The dataset is tokenized, a 2-label sequence classifier is trained for
    10 epochs with evaluation and checkpointing after each epoch, and the
    model, tokenizer and final evaluation metrics are written to
    ``<cwd>/results/<model_name with '/' replaced by '_'>_<dataset_name>``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        dataset_name: Key into the module-level ``DATASETS`` mapping. The
            selected dataset must provide ``train`` and ``test`` splits and
            the columns referenced below.

    Returns:
        The dict returned by ``trainer.evaluate()``, extended with the keys
        ``model_name`` and ``dataset_name``.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``DATASETS``.
    """
    torch.cuda.empty_cache()

    # `torchscript=True` was dropped here: that flag configures the model for
    # TorchScript export/tracing, which fine-tuning with `Trainer` does not need.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    ).to(device)

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        model_max_length=256
    )

    tokenized_datasets = DATASETS[dataset_name].map(
        partial(preprocess_function, tokenizer=tokenizer),
        batched=True,
        batch_size=1000,
        num_proc=4
    )

    tokenized_datasets = tokenized_datasets.map(
        preprocess_labels,
        batched=True,
        num_proc=4
    )

    # Drop the raw dataset columns so only model inputs and labels remain.
    columns_to_remove = [
        "text_of_post", "toxic", "probability", "position",
        "type", "support", "position_probability"
    ]
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)
    tokenized_datasets.set_format("torch")

    use_cuda = torch.cuda.is_available()

    # Scale the batch size with GPU memory (32 per 8 GiB, capped at 128).
    # The floor of 1 guards against the scaled value truncating to 0.
    optimal_batch_size = 32
    if use_cuda:
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        optimal_batch_size = max(1, int(min(32 * (gpu_memory / 8), 128)))

    output_dir = Path.cwd() / "results" / f"{model_name.replace('/', '_')}_{dataset_name}"
    # Make sure the directory exists before anything is written into it.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    eval_strategy_kwarg = _pick_supported_kwarg(
        TrainingArguments, "eval_strategy", "evaluation_strategy"
    )
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        save_strategy="epoch",
        learning_rate=1e-5,
        per_device_train_batch_size=optimal_batch_size,
        per_device_eval_batch_size=optimal_batch_size * 2,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir=str(output_dir / "logs"),
        logging_steps=10,
        save_total_limit=2,
        fp16=use_cuda,
        gradient_checkpointing=True,
        dataloader_num_workers=4,
        # Pinned memory only helps host-to-GPU copies; on CPU it just warns.
        dataloader_pin_memory=use_cuda,
        **{eval_strategy_kwarg: "epoch"},
    )

    # Likewise `Trainer(tokenizer=...)` was superseded by `processing_class=...`.
    tokenizer_kwarg = _pick_supported_kwarg(Trainer, "processing_class", "tokenizer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating model...")
    eval_results = trainer.evaluate()

    # The deprecated `save_config` keyword is no longer passed; the config is
    # written by default.
    model.save_pretrained(
        output_dir,
        safe_serialization=True  # Use safe serialization for better compatibility
    )
    tokenizer.save_pretrained(output_dir)

    # Tag the metrics with the run identity and store them as a one-row table.
    eval_results["model_name"] = model_name
    eval_results["dataset_name"] = dataset_name
    eval_stats = pd.DataFrame([eval_results])
    eval_stats.to_parquet(output_dir / "EvalStats.parquet")

    return eval_results

In [15]:
# Fine-tune every model on every dataset. The sweep is best-effort: a failing
# run is reported and recorded, then the next combination is tried.
all_eval_results = []  # metrics dict of every run that finished
failed_runs = []       # (model_name, dataset_name, repr(error)) for every run that raised
for dataset_name in DATASETS:
    for model_name in chain.from_iterable(MODELS.values()):
        try:
            all_eval_results.append(train_and_evaluate_model(model_name, dataset_name))
        except Exception as e:
            print(f"model {model_name}, dataset {dataset_name} failed with error: {e}")
            failed_runs.append((model_name, dataset_name, repr(e)))

# One-line summary so a fully failed sweep is obvious without reading the log.
print(f"{len(all_eval_results)} run(s) succeeded, {len(failed_runs)} run(s) failed")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at lyeonii/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



model lyeonii/bert-tiny, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at lyeonii/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



model lyeonii/bert-small, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at lyeonii/bert-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model lyeonii/bert-medium, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model google-bert/bert-base-uncased, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



model google-bert/bert-large-uncased, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at smallbenchnlp/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



model smallbenchnlp/roberta-small, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at JackBAI/roberta-medium and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model JackBAI/roberta-medium, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



model FacebookAI/roberta-base, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

model FacebookAI/roberta-large, dataset toxic_spans failed with error: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`


