In [1]:
import time

from datasets import Dataset, Features
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# --- 1. Translation pipelines for backtranslation ---
def _load_translator(task, checkpoint):
    """Build a translation pipeline for `task` from `checkpoint` on device index 0."""
    return pipeline(task, model=checkpoint, device=0)

# Spanish pair (English -> Spanish, Spanish -> English)
en2es = _load_translator('translation_en_to_es', 'Helsinki-NLP/opus-mt-en-es')
es2en = _load_translator('translation_es_to_en', 'Helsinki-NLP/opus-mt-es-en')
# German pair (English -> German, German -> English)
en2de = _load_translator('translation_en_to_de', 'Helsinki-NLP/opus-mt-en-de')
de2en = _load_translator('translation_de_to_en', 'Helsinki-NLP/opus-mt-de-en')
# French pair (English -> French, French -> English)
en2fr = _load_translator('translation_en_to_fr', 'Helsinki-NLP/opus-mt-en-fr')
fr2en = _load_translator('translation_fr_to_en', 'Helsinki-NLP/opus-mt-fr-en')

def safe_translate(pipe, text, retries=3, delay=1.0):
    """Translate `text` with `pipe`, retrying when the call raises.

    Args:
        pipe: Callable translation pipeline. `pipe(text)` must return a
            sequence whose first item is a dict with a 'translation_text' key.
        text: Source text to translate.
        retries: Maximum number of attempts (default 3).
        delay: Seconds to wait between two attempts (default 1.0).

    Returns:
        The translated string, or `text` unchanged when it is a blank string
        or when every attempt raised.
    """
    # A blank string has nothing to translate; skip the model call entirely.
    if isinstance(text, str) and not text.strip():
        return text
    for attempt in range(1, retries + 1):
        try:
            return pipe(text)[0]['translation_text']
        except Exception as e:
            if attempt < retries:
                print("Retrying due to:", e)
                # Only pause when another attempt will actually follow.
                time.sleep(delay)
            else:
                print("Translation failed, returning input unchanged:", e)
    return text  # fallback

def backtranslate(text, lang):
    """Produce backtranslated variants of `text` for the given language code.

    Each variant is obtained by translating `text` into a pivot language and
    back again via `safe_translate`.

    Args:
        text: Sentence to backtranslate.
        lang: Integer language code. The mapping is assumed to be
            0 = 'en', 1 = 'fr', 2 = 'de', 3 = 'es' (TODO confirm against the
            dataset's `lang` column).

    Returns:
        A list of backtranslated strings: two for code 0 (Spanish pivot, then
        German pivot), one for codes 1-3 (English pivot), and an empty list
        for any other value.
    """
    def round_trip(outbound, inbound):
        # Translate away from the source language, then back into it.
        pivot_text = safe_translate(outbound, text)
        return safe_translate(inbound, pivot_text)

    if lang == 0:  # assumed English
        via_spanish = round_trip(en2es, es2en)
        via_german = round_trip(en2de, de2en)
        return [via_spanish, via_german]
    if lang == 3:  # assumed Spanish
        return [round_trip(es2en, en2es)]
    if lang == 2:  # assumed German
        return [round_trip(de2en, en2de)]
    if lang == 1:  # assumed French
        return [round_trip(fr2en, en2fr)]
    # Unknown language code: nothing to add.
    return []

# --- 2. Paraphrase for English (optional, since EN has good models) ---
_PARAPHRASE_CHECKPOINT = "Vamsi/T5_Paraphrase_Paws"
# Tokenizer and seq2seq model are both loaded from the same checkpoint.
paraphrase_tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASE_CHECKPOINT)

def paraphrase_en(text, num_return_sequences=1):
    """Generate paraphrases of `text` with the module-level paraphrase model.

    Args:
        text: Sentence to paraphrase.
        num_return_sequences: Number of candidates to return (default 1).

    Returns:
        A list of decoded candidate strings with special tokens stripped.
    """
    input_text = f"paraphrase: {text} </s>"
    features = paraphrase_tokenizer([input_text], return_tensors='pt', truncation=True)
    # Beam search cannot return more sequences than it has beams, so widen the
    # beam when the caller asks for more than the default 10 candidates.
    num_beams = max(10, num_return_sequences)
    # No `temperature` argument: it only takes effect when sampling
    # (do_sample=True), so under plain beam search it was ignored and merely
    # triggered a generation-config warning.
    output = paraphrase_model.generate(
        **features,
        max_length=64,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return [paraphrase_tokenizer.decode(o, skip_special_tokens=True) for o in output]

# --- 3. Apply pipeline to Hugging Face Dataset ---

def augment_row(row):
    """Expand one row into its original sentence plus backtranslated variants.

    Args:
        row: Mapping with 'sentence', 'lang' and 'labels' entries. 'lang' is
            forwarded to `backtranslate`, which compares it against the
            integer codes 0-3 (not language-name strings).

    Returns:
        A dict of three equal-length lists: 'augmented_sentences' (original
        sentence first), plus 'labels' and 'lang' repeating the row's values
        once per sentence.
    """
    sentence = row['sentence']
    language = row['lang']
    # The original sentence leads, followed by any backtranslations.
    # English paraphrasing is currently disabled; to enable it, also append
    # paraphrase_en(sentence, num_return_sequences=2) when language == 0.
    variants = [sentence, *backtranslate(sentence, language)]
    count = len(variants)
    return {
        'augmented_sentences': variants,
        'labels': [row['labels']] * count,
        'lang': [language] * count,
    }

# Assuming your Hugging Face dataset is loaded as "dataset".
# Label values treated as minority classes: augment_dataset() augments only
# rows whose 'labels' value appears in this list and copies all other rows
# through unchanged.
minority_labels = [1, 2]  # Adjust as needed

def augment_dataset(dataset, labels_to_augment=None):
    """Build a new Dataset in which minority-class rows are augmented.

    Rows whose 'labels' value is in `labels_to_augment` are expanded with
    `augment_row` (original sentence plus backtranslations); all other rows
    are copied through once. Only the 'sentence', 'labels' and 'lang' columns
    are carried over to the result.

    Args:
        dataset: Iterable of row mappings, normally a `datasets.Dataset`.
        labels_to_augment: Collection of label values to augment. Defaults to
            the module-level `minority_labels`.

    Returns:
        A `datasets.Dataset` with 'sentence', 'labels' and 'lang' columns.
        When `dataset` exposes `features`, the feature types of those three
        columns are reused so the column schema is not re-inferred.

    Raises:
        KeyError: If `dataset` exposes `column_names` and one of the required
            columns is absent.
    """
    if labels_to_augment is None:
        labels_to_augment = minority_labels

    required = ('sentence', 'labels', 'lang')
    # Fail up front with an actionable message instead of a bare KeyError
    # raised from the middle of the loop.
    columns = getattr(dataset, 'column_names', None)
    if columns is not None:
        missing = [name for name in required if name not in columns]
        if missing:
            raise KeyError(
                f"dataset is missing required column(s) {missing}; "
                f"available columns: {list(columns)}"
            )

    new_data = {name: [] for name in required}
    for row in dataset:
        if row['labels'] in labels_to_augment:
            aug = augment_row(row)
            new_data['sentence'].extend(aug['augmented_sentences'])
            new_data['labels'].extend(aug['labels'])
            new_data['lang'].extend(aug['lang'])
        else:
            # Other rows are kept as-is, without augmentation.
            for name in required:
                new_data[name].append(row[name])

    # Reuse the source feature types when available; without them
    # Dataset.from_dict infers plain types from the Python values.
    source_features = getattr(dataset, 'features', None)
    if source_features is not None:
        features = Features({name: source_features[name] for name in required})
        return Dataset.from_dict(new_data, features=features)
    return Dataset.from_dict(new_data)

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from datasets import load_dataset
# dataset = load_dataset("nojedag/financial-tweets-sentiment-multilingual")
dataset = load_dataset("nojedag/financial_phrasebank_multilingual")

# Keep a small shuffled sample of each split: 80 train rows and 20 test rows.
for split_name, sample_size in (("train", 80), ("test", 20)):
    dataset[split_name] = dataset[split_name].shuffle(seed=42).select(range(sample_size))

def prepare_dataset(dataset):
    """Return `dataset` with its "sentiment" column renamed to "labels"."""
    return dataset.rename_column("sentiment", "labels")

# Rename the label column so that augment_dataset() can read row['labels'].
dataset = prepare_dataset(dataset)

In [None]:
# Replace each split with its augmented version.
for split_name in ('train', 'test'):
    dataset[split_name] = augment_dataset(dataset[split_name])

KeyError: 'labels'

In [None]:
# Push the augmented dataset to the Hugging Face Hub under the repo id below.
dataset.push_to_hub("nojedag/financial_phrasebank_multilingual_augmented")