In [1]:
from typing import List
from dao.attribute import DAOAttributePL
from models.attribute import AttributePLInDB
from datasets import Dataset
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForMaskedLM


dao_attribute: DAOAttributePL = DAOAttributePL(collection_name="attributes-24-12-16-recalc-24-12-22.1-pgryka")

attributes_generated: List[AttributePLInDB] = dao_attribute.find_many_by_query({"is_generated": True})
attributes_real: List[AttributePLInDB] = dao_attribute.find_many_by_query({"is_generated": False})

dicts_generated = [{"text": attribute.stylometrix_metrics.text, "label": 1} for attribute in attributes_generated]
dicts_real = [{"text": attribute.stylometrix_metrics.text, "label": 0} for attribute in attributes_real]
combined = dicts_generated + dicts_real
dataset_whole = Dataset.from_list(combined)

In [5]:
split_dataset = dataset_whole.train_test_split(test_size=0.3)

# Extract the train and test subsets
train_dataset_before_tokenizer = split_dataset["train"]
test_dataset_before_tokenizer = split_dataset["test"]

In [3]:
model_name = "sdadas/polish-roberta-large-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [20]:
def tokenize_function(example):
    encoding = tokenizer(
        example['text'],
        max_length=512,
        truncation=True,
        stride=256,
        return_overflowing_tokens=True,
        padding="max_length",
        return_tensors="pt"
    )
    n_chunks = encoding["input_ids"].shape[0]

    return {
        "input_ids": [encoding["input_ids"][i].tolist() for i in range(n_chunks)],
        "attention_mask": [encoding["attention_mask"][i].tolist() for i in range(n_chunks)],
        "label": [example["label"]] * n_chunks
    }
def tokenize_function_batch(batch):
    """
    Processes a batch of examples and tokenizes each text using a sliding window.
    Each long text may produce several chunks; this function flattens them so that
    every chunk becomes a separate example with its own scalar label.

    Args:
        batch (dict): A dictionary with keys "text" and "label", where each value is a list.

    Returns:
        dict: A dictionary with keys "input_ids", "attention_mask", and "label",
              where each value is a list of length equal to the total number of chunks produced.
    """
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    # Loop over each example in the batch.
    for text, label in zip(batch["text"], batch["label"]):
        # Use the built-in sliding window functionality.
        encoding = tokenizer(
            text,
            max_length=512,
            truncation=True,
            stride=256,
            return_overflowing_tokens=True,
            padding="max_length",
            return_tensors="pt"
        )
        # Number of chunks produced for this text.
        num_chunks = encoding["input_ids"].shape[0]
        # For each chunk, add its token ids, attention mask, and the same scalar label.
        for i in range(num_chunks):
            all_input_ids.append(encoding["input_ids"][i].tolist())
            all_attention_masks.append(encoding["attention_mask"][i].tolist())
            all_labels.append(label)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "label": all_labels
    }

train_dataset = train_dataset_before_tokenizer.map(tokenize_function_batch, batched=True, remove_columns=["text"])
test_dataset = test_dataset_before_tokenizer.map(tokenize_function_batch, batched=True, remove_columns=["text"])

Map:   0%|          | 0/3329 [00:00<?, ? examples/s]

Map:   0%|          | 0/1427 [00:00<?, ? examples/s]

In [21]:
train_dataset = train_dataset.with_format("torch")
test_dataset = test_dataset.with_format("torch")


In [26]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # MLM stands for masked language modeling
    mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    """Compute accuracy or other metrics after each evaluation."""
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = (predictions == labels).float().mean()
    return {"accuracy": accuracy.item()}

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Rename 'label' to 'labels' if present
        if 'label' in inputs:
            inputs['labels'] = inputs.pop('label')
        return super().compute_loss(model, inputs, return_outputs=return_outputs)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 13927
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5223
  Number of trainable parameters = 435091457


In [None]:
results = trainer.evaluate(test_dataset)
print(results)

In [1]:
from transformers_interpret import SequenceClassificationExplainer

In [None]:
# Load the tokenizer and model you just fine-tuned
# If you've saved locally, pass the local path of your checkpoint
tokenizer = AutoTokenizer.from_pretrained("./results")
model = AutoModelForMaskedLM.from_pretrained("./results")

# Create the explainer
cls_explainer = SequenceClassificationExplainer(
    model=model,
    tokenizer=tokenizer
)

text = "To jest przykładowe zdanie do sprawdzenia."

# Get word attributions
word_attributions = cls_explainer(text)

# word_attributions is a list of tuples: [(token_1, attribution_score_1), (token_2, ...), ...]
print(word_attributions)
