# Tema 7: Aumento de datos

## Ejercicio 1
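Utilizar aumento de datos para mejorar la clasificación de sentimientos en un dataset desbalanceado artificialmente (rotten_tomatoes, que originalmente está balanceado y cuya clase positiva se reduce en el apartado a).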

### Apartado a
Cargar dataset y obtener estadísticas.

In [1]:
from datasets import load_dataset
import pandas as pd

# Load every split of the Rotten Tomatoes dataset.
dataset = load_dataset("rotten_tomatoes")

# Class names, positioned by their integer label id.
labels = dataset['train'].features['label'].names
print('Labels:', labels)
print()

# Mapping used to show class names instead of integer ids in the counts.
id_to_name = dict(enumerate(labels))

# Report the total size and the per-class counts of each split.
for split, split_data in dataset.items():
    df = pd.DataFrame(split_data)
    class_counts = df['label'].value_counts().rename(index=id_to_name)
    print(f"--- {split} ---", f"Total: {len(df)}", sep="\n")
    print(class_counts)
    print()

Labels: ['neg', 'pos']

--- train ---
Total: 8530
label
pos    4265
neg    4265
Name: count, dtype: int64

--- validation ---
Total: 1066
label
pos    533
neg    533
Name: count, dtype: int64

--- test ---
Total: 1066
label
pos    533
neg    533
Name: count, dtype: int64



Dividir en train/test y desbalancear el conjunto de entrenamiento.

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Seed reused by the stochastic steps of this notebook.
SEED = 42

# Only the dataset's 'train' split is used here; it is re-split below into
# our own train/test partitions.
train_split = dataset['train']
texts = train_split['text']
labels_arr = train_split['label']

# Stratified 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels_arr,
    test_size=0.3,
    stratify=labels_arr,
    random_state=SEED,
)

train_distribution = pd.Series(y_train).value_counts().to_dict()
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"Train distribución: {train_distribution}")

Train: 5971, Test: 2559
Train distribución: {1: 2986, 0: 2985}


In [3]:
# Imbalance the training set: keep every negative (0) example and only a
# fraction of the positive (1) examples.
minority_ratio = 0.2

# Separate the (text, label) pairs by class.
neg_pairs = [(text, label) for text, label in zip(X_train, y_train) if label == 0]
X_train_neg = [text for text, _ in neg_pairs]
y_train_neg = [label for _, label in neg_pairs]

pos_pairs = [(text, label) for text, label in zip(X_train, y_train) if label == 1]
X_train_pos = [text for text, _ in pos_pairs]
y_train_pos = [label for _, label in pos_pairs]

# Draw, without replacement, the positions of the positives that are kept.
np.random.seed(SEED)
n_keep = int(len(X_train_pos) * minority_ratio)
indices = np.random.choice(len(X_train_pos), size=n_keep, replace=False)
X_train_pos_reduced = []
y_train_pos_reduced = []
for i in indices:
    X_train_pos_reduced.append(X_train_pos[i])
    y_train_pos_reduced.append(y_train_pos[i])

# Imbalanced training set: all negatives followed by the retained positives.
X_train_imb = [*X_train_neg, *X_train_pos_reduced]
y_train_imb = [*y_train_neg, *y_train_pos_reduced]

print(f"Train desbalanceado: {len(X_train_imb)}")
print(f"Negativos (clase mayoritaria): {len(X_train_neg)}")
print(f"Positivos (clase minoritaria): {len(X_train_pos_reduced)}")

Train desbalanceado: 3582
Negativos (clase mayoritaria): 2985
Positivos (clase minoritaria): 597


### Apartado b
Clasificación baseline con TF-IDF y SVM.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report


def train_and_evaluate(X_train, y_train, X_test, y_test, description):
    """Train a TF-IDF + linear-kernel SVM and print its classification report.

    The TF-IDF vocabulary (at most 10,000 features) is fitted on ``X_train``
    only and then applied to ``X_test``.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_test: Test texts.
        y_test: Test labels, aligned with ``X_test``.
        description: Title printed above the report.

    Returns:
        The predictions of the fitted SVM for ``X_test``.

    Note:
        Reads the module-level names ``SEED`` and ``labels``.
    """
    tfidf = TfidfVectorizer(max_features=10000)
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)

    classifier = SVC(kernel='linear', random_state=SEED)
    classifier.fit(train_matrix, y_train)
    y_pred = classifier.predict(test_matrix)

    # Banner around the description, then the per-class metrics.
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{description}")
    print(f"{banner}")
    print(classification_report(y_test, y_pred, target_names=labels))

    return y_pred

In [5]:
# Baseline: train on the imbalanced set without any augmentation.
y_pred_baseline = train_and_evaluate(
    X_train=X_train_imb,
    y_train=y_train_imb,
    X_test=X_test,
    y_test=y_test,
    description="Baseline: TF-IDF + SVM (dataset desbalanceado)",
)


Baseline: TF-IDF + SVM (dataset desbalanceado)
              precision    recall  f1-score   support

         neg       0.53      1.00      0.69      1280
         pos       0.98      0.10      0.19      1279

    accuracy                           0.55      2559
   macro avg       0.75      0.55      0.44      2559
weighted avg       0.75      0.55      0.44      2559



### Apartado c
Aumento de datos: sustitución de sinónimos (WordNet).

In [6]:
import nlpaug.augmenter.word as naw

import random

# Seed the random generators before any augmentation so that a fresh
# "Restart & Run All" regenerates the same augmented texts.
# NOTE(review): this assumes nlpaug draws from Python's `random` module and
# NumPy's global RNG — confirm against the installed nlpaug version.
random.seed(SEED)
np.random.seed(SEED)

# Word-level augmenter configured with WordNet as its synonym source.
aug_synonym = naw.SynonymAug(aug_src='wordnet')

# Augmentation example on the first retained positive text.
example = X_train_pos_reduced[0]
print("Original:", example)
print("Aumentado:", aug_synonym.augment(example))

Original: a coming-of-age tale from new zealand whose boozy , languid air is balanced by a rich visual clarity and deeply felt performances across the board .
Aumentado: ['a come - of - eld tale from new zealand whose boozy, languorous air equal balanced by a rich visual clarity and profoundly feel performances across the board.']


In [7]:
def augment_texts(texts, labels, augmenter, n_aug=1):
    """Generate augmented variants of each text, keeping labels aligned.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of labels, paired positionally with ``texts``.
        augmenter: Object exposing ``augment(text)``. The result may be a
            string or a list of strings; for a list only the first element
            is used.
        n_aug: Number of ``augment`` calls made per text.

    Returns:
        A tuple ``(augmented_texts, augmented_labels)`` of two lists with the
        same length. Calls that yield nothing (an empty list or ``None``) are
        skipped, so fewer than ``len(texts) * n_aug`` items may be returned.
    """
    augmented_texts = []
    augmented_labels = []
    for text, label in zip(texts, labels):
        for _ in range(n_aug):
            aug_text = augmenter.augment(text)
            if isinstance(aug_text, list):
                # An empty list means no augmentation was produced (e.g. for
                # an empty input); indexing it would raise IndexError.
                if not aug_text:
                    continue
                aug_text = aug_text[0]
            if aug_text is None:
                # Never feed a missing text into the training data.
                continue
            augmented_texts.append(aug_text)
            augmented_labels.append(label)
    return augmented_texts, augmented_labels

In [8]:
# Augment only the minority class (positives), with n_aug=3 calls per text.
aug_texts_syn, aug_labels_syn = augment_texts(
    texts=X_train_pos_reduced,
    labels=y_train_pos_reduced,
    augmenter=aug_synonym,
    n_aug=3,
)

# Append the augmented positives to the imbalanced training set.
X_train_syn = [*X_train_imb, *aug_texts_syn]
y_train_syn = [*y_train_imb, *aug_labels_syn]

print(f"Train con sinónimos: {len(X_train_syn)}")
print(f"Negativos: {y_train_syn.count(0)}, Positivos: {y_train_syn.count(1)}")

Train con sinónimos: 5373
Negativos: 2985, Positivos: 2388


In [9]:
# Evaluate the model trained on the synonym-augmented set.
y_pred_synonym = train_and_evaluate(
    X_train=X_train_syn,
    y_train=y_train_syn,
    X_test=X_test,
    y_test=y_test,
    description="Aumento con sinónimos (WordNet)",
)


Aumento con sinónimos (WordNet)
              precision    recall  f1-score   support

         neg       0.60      0.92      0.73      1280
         pos       0.83      0.39      0.53      1279

    accuracy                           0.65      2559
   macro avg       0.71      0.65      0.63      2559
weighted avg       0.71      0.65      0.63      2559



Aumento de datos: operaciones aleatorias sobre palabras (swap, delete).

In [10]:
# Random-word augmenter configured with the 'swap' action.
aug_swap = naw.RandomWordAug(action='swap')

# Preview on the first retained positive text.
example = X_train_pos_reduced[0]
print("Original:", example)
swap_preview = aug_swap.augment(example)
print("Swap:", swap_preview)

Original: a coming-of-age tale from new zealand whose boozy , languid air is balanced by a rich visual clarity and deeply felt performances across the board .
Swap: ['A coming - of - age tale from new boozy zealand whose, air languid is a balanced by rich visual clarity and deeply across felt performances the board.']


In [11]:
# Augment only the positives with the swap augmenter (n_aug=3 calls per text).
aug_texts_swap, aug_labels_swap = augment_texts(
    texts=X_train_pos_reduced,
    labels=y_train_pos_reduced,
    augmenter=aug_swap,
    n_aug=3,
)

# Append the augmented positives to the imbalanced training set.
X_train_swap = [*X_train_imb, *aug_texts_swap]
y_train_swap = [*y_train_imb, *aug_labels_swap]

print(f"Train con swap: {len(X_train_swap)}")
print(f"Negativos: {y_train_swap.count(0)}, Positivos: {y_train_swap.count(1)}")

Train con swap: 5373
Negativos: 2985, Positivos: 2388


In [12]:
# Evaluate the model trained on the swap-augmented set.
y_pred_swap = train_and_evaluate(
    X_train=X_train_swap,
    y_train=y_train_swap,
    X_test=X_test,
    y_test=y_test,
    description="Aumento con random word swap",
)


Aumento con random word swap
              precision    recall  f1-score   support

         neg       0.61      0.90      0.73      1280
         pos       0.81      0.42      0.55      1279

    accuracy                           0.66      2559
   macro avg       0.71      0.66      0.64      2559
weighted avg       0.71      0.66      0.64      2559



In [13]:
# Random-word augmenter configured with the 'delete' action.
aug_delete = naw.RandomWordAug(action='delete')

# Preview on the first retained positive text.
example = X_train_pos_reduced[0]
print("Original:", example)
delete_preview = aug_delete.augment(example)
print("Delete:", delete_preview)

Original: a coming-of-age tale from new zealand whose boozy , languid air is balanced by a rich visual clarity and deeply felt performances across the board .
Delete: ['a - of - age tale from new whose, languid is a visual clarity deeply felt performances across the.']


In [14]:
# Augment only the positives with the delete augmenter (n_aug=3 calls per text).
aug_texts_del, aug_labels_del = augment_texts(
    texts=X_train_pos_reduced,
    labels=y_train_pos_reduced,
    augmenter=aug_delete,
    n_aug=3,
)

# Append the augmented positives to the imbalanced training set.
X_train_del = [*X_train_imb, *aug_texts_del]
y_train_del = [*y_train_imb, *aug_labels_del]

print(f"Train con delete: {len(X_train_del)}")
print(f"Negativos: {y_train_del.count(0)}, Positivos: {y_train_del.count(1)}")

Train con delete: 5373
Negativos: 2985, Positivos: 2388


In [15]:
# Evaluate the model trained on the delete-augmented set.
y_pred_delete = train_and_evaluate(
    X_train=X_train_del,
    y_train=y_train_del,
    X_test=X_test,
    y_test=y_test,
    description="Aumento con random word delete",
)


Aumento con random word delete
              precision    recall  f1-score   support

         neg       0.61      0.91      0.73      1280
         pos       0.83      0.43      0.56      1279

    accuracy                           0.67      2559
   macro avg       0.72      0.67      0.65      2559
weighted avg       0.72      0.67      0.65      2559



Aumento de datos contextual con embeddings (BERT).

Nota: es costoso computacionalmente, por lo que las celdas de esta sección se dejan comentadas.

In [16]:
# aug_contextual = naw.ContextualWordEmbsAug(
#     model_path='bert-base-uncased',
#     action='substitute'
# )
# 
# example = X_train_pos_reduced[0]
# print("Original:", example)
# print("Contextual:", aug_contextual.augment(example))

In [17]:
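# NOTE(review): n_aug=1 here, whereas the other strategies use n_aug=3, so the
# resulting class balance is not directly comparable — confirm this is intended.
# aug_texts_ctx, aug_labels_ctx = augment_texts(
#     X_train_pos_reduced, y_train_pos_reduced, aug_contextual, n_aug=1
# )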
# 
# X_train_ctx = X_train_imb + aug_texts_ctx
# 
# y_train_ctx = y_train_imb + aug_labels_ctx
# 
# print(f"Train con contextual: {len(X_train_ctx)}")
# print(f"Negativos: {y_train_ctx.count(0)}, Positivos: {y_train_ctx.count(1)}")

In [18]:
# y_pred_contextual = train_and_evaluate(
#     X_train_ctx, y_train_ctx, X_test, y_test,
#     "Aumento contextual (BERT)"
# )

Aumento de datos con back-translation.

Nota: es costoso computacionalmente, por lo que las celdas de esta sección se dejan comentadas.

In [19]:
# aug_backtranslation = naw.BackTranslationAug(
#     from_model_name='facebook/wmt19-en-de',
#     to_model_name='facebook/wmt19-de-en'
# )
# 
# example = X_train_pos_reduced[0]
# print("Original:", example)
# print("Back-translation:", aug_backtranslation.augment(example))

In [20]:
# aug_texts_bt, aug_labels_bt = augment_texts(
#     X_train_pos_reduced, y_train_pos_reduced, aug_backtranslation, n_aug=3
# )
# 
# X_train_bt = X_train_imb + aug_texts_bt
# y_train_bt = y_train_imb + aug_labels_bt
# 
# print(f"Train con back-translation: {len(X_train_bt)}")
# print(f"Negativos: {y_train_bt.count(0)}, Positivos: {y_train_bt.count(1)}")

In [21]:
# y_pred_bt = train_and_evaluate(
#     X_train_bt, y_train_bt, X_test, y_test,
#     "Aumento con back-translation (EN->DE->EN)"
# )

Comparar resultados de todas las estrategias.

In [22]:
from sklearn.metrics import f1_score

# Test-set predictions of each strategy, keyed by the name shown in the table.
results = {
    "Baseline (desbalanceado)": y_pred_baseline,
    "Sinónimos (WordNet)": y_pred_synonym,
    "Random swap": y_pred_swap,
    "Random delete": y_pred_delete,
#     "Contextual (BERT)": y_pred_contextual,
#     "Back-translation": y_pred_bt,
}

# Table header followed by a separator line.
header = f"{'Estrategia':<30} {'F1 macro':>10} {'F1 pos':>10} {'F1 neg':>10}"
print(header)
print("-" * 62)

# One row per strategy: macro F1, then F1 of class 1 (pos) and class 0 (neg).
for strategy, predictions in results.items():
    macro_f1 = f1_score(y_test, predictions, average='macro')
    per_class_f1 = f1_score(y_test, predictions, average=None)
    row = f"{strategy:<30} {macro_f1:>10.4f} {per_class_f1[1]:>10.4f} {per_class_f1[0]:>10.4f}"
    print(row)

Estrategia                       F1 macro     F1 pos     F1 neg
--------------------------------------------------------------
Baseline (desbalanceado)           0.4396     0.1893     0.6899
Sinónimos (WordNet)                0.6286     0.5303     0.7268
Random swap                        0.6408     0.5534     0.7281
Random delete                      0.6484     0.5626     0.7343
