In [1]:
import random
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

In [2]:
# Augmentation settings (read by the cells below)
random_seed = 42
model_name = "allegro/herbert-base-cased"  # checkpoint handed to the tokenizer and masked-LM loaders
augmentation_factor = 2  # how many augmented copies are produced for every original example
mask_prob = 0.15  # share of whitespace-separated words masked in each copy (at least one word is masked)

# Seed NumPy, Python's `random` and PyTorch so repeated runs draw the same masks.
# torch.manual_seed stays last: its return value is what the cell displays.
np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)

<torch._C.Generator at 0x1392f7010>

In [3]:
# Load tokenizer and masked language model
tokenizer = AutoTokenizer.from_pretrained(model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)

# Choose the best available device instead of hard-coding GPU 0, so the
# notebook also runs on machines without an accelerator.
if torch.cuda.is_available():
    pipeline_device = 0  # first CUDA GPU
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    pipeline_device = "mps"  # Apple-silicon GPU
else:
    pipeline_device = -1  # CPU

# Set up a pipeline for masked language modeling
mlm_pipeline = pipeline("fill-mask", model=mlm_model, tokenizer=tokenizer, device=pipeline_device)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
def augment_text_with_word_mask(text, augmentation_factor=1, mask_prob=0.15,
                                fill_mask=None, mask_token=None):
    """Create augmented copies of ``text`` by masking words and refilling them with an MLM.

    For every copy, a random subset of the whitespace-separated words is
    replaced by the mask token, the masked sentence is passed to the fill-mask
    callable, and the top-ranked prediction for each mask is written back at
    the position of that mask.

    Args:
        text: Input sentence. Non-string or whitespace-only input yields ``[]``
            because there is nothing to mask.
        augmentation_factor: Number of augmented copies to generate.
        mask_prob: Fraction of words masked in each copy. At least one word is
            always masked, and never more than the number of words.
        fill_mask: Callable taking the masked sentence and returning fill-mask
            predictions (a list of candidate dicts for a single mask, or one
            such list per mask). Defaults to the notebook-level ``mlm_pipeline``.
        mask_token: Placeholder string understood by ``fill_mask``. Defaults to
            ``tokenizer.mask_token``.

    Returns:
        A list with one augmented string per requested copy. A masked position
        for which no usable prediction is available keeps its original word,
        so the result never contains a literal mask token.
    """
    if fill_mask is None:
        fill_mask = mlm_pipeline
    if mask_token is None:
        mask_token = tokenizer.mask_token

    words = text.split() if isinstance(text, str) else []
    if not words:
        return []

    # Clamp to [1, len(words)] so random.sample never asks for more items
    # than exist (mask_prob > 1) and always masks something.
    num_words_to_mask = min(len(words), max(1, int(len(words) * mask_prob)))
    augmented_texts = []

    for _ in range(augmentation_factor):
        # Predictions come back in left-to-right mask order, so the sampled
        # positions must be sorted before they are paired with predictions.
        mask_indices = sorted(random.sample(range(len(words)), num_words_to_mask))

        masked_words = list(words)
        for idx in mask_indices:
            masked_words[idx] = mask_token

        predictions = fill_mask(" ".join(masked_words))

        # A single mask yields a flat list of candidate dicts; normalise it to
        # the multi-mask shape (one candidate list per mask).
        if isinstance(predictions, dict):
            predictions = [[predictions]]
        elif predictions and isinstance(predictions[0], dict):
            predictions = [predictions]

        # Start from the original words so that any position without a usable
        # prediction falls back to the original word.
        new_words = list(words)
        for idx, candidates in zip(mask_indices, predictions):
            top = candidates[0] if isinstance(candidates, list) and candidates else candidates
            token = top.get("token_str", "") if isinstance(top, dict) else ""
            token = token.strip() if isinstance(token, str) else ""
            if token:
                new_words[idx] = token

        augmented_texts.append(" ".join(new_words))

    return augmented_texts


def augment_dataset(df, text_column="text", label_column="label", augmentation_factor=1, mask_prob=0.15):
    """Return ``df`` followed by MLM-augmented copies of each of its rows.

    Args:
        df: Source DataFrame containing ``text_column`` and ``label_column``.
        text_column: Name of the column holding the text to augment.
        label_column: Name of the column holding the label, copied unchanged
            onto every augmented row.
        augmentation_factor: Number of augmented copies requested per row.
        mask_prob: Fraction of words masked per copy, forwarded to
            ``augment_text_with_word_mask``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the original rows first,
        then the augmented rows. Augmented rows carry only the text and label
        columns; any other column of ``df`` is left missing for them.
    """
    augmented_rows = []

    # Iterate column-wise: cheaper than iterrows(), which builds a Series per row.
    for original_text, label in zip(df[text_column], df[label_column]):
        # Missing (NaN/None) or blank texts have no words to mask; skip them
        # instead of letting one bad row abort the whole augmentation run.
        if not isinstance(original_text, str) or not original_text.strip():
            continue

        augmented_texts = augment_text_with_word_mask(
            original_text, augmentation_factor=augmentation_factor, mask_prob=mask_prob
        )
        augmented_rows.extend(
            {text_column: aug_text, label_column: label} for aug_text in augmented_texts
        )

    if not augmented_rows:
        # Nothing to append: skip concatenating with an empty frame and just
        # return a re-indexed copy of the input.
        return df.reset_index(drop=True)

    augmented_df = pd.DataFrame(augmented_rows, columns=[text_column, label_column])
    return pd.concat([df, augmented_df], ignore_index=True)

In [None]:
# Load dataset
data_path = "all_texts.csv"
output_path = "augmented_texts.csv"
df = pd.read_csv(data_path)

# Augment the dataset
augmented_df = augment_dataset(df, text_column="text", label_column="label", augmentation_factor=augmentation_factor, mask_prob=mask_prob)

# Save the augmented dataset; the message is built from the same path that is
# written, so the two cannot drift apart.
augmented_df.to_csv(output_path, index=False)
print(f"Augmented dataset saved to '{output_path}'.")

Augmented dataset saved to 'augmented_texts.csv'.
