In [None]:
import os
import sys

# Location of the local sugardata checkout. Set the SUGARDATA_PATH environment
# variable to run the notebook on another machine; the original path is kept
# as the default so existing setups behave exactly as before.
SUGARDATA_PATH = os.environ.get("SUGARDATA_PATH", "/home/yeniguno/projects/sugardata")
if SUGARDATA_PATH not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(SUGARDATA_PATH)

import sugardata as su


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback

BASE MODEL

In [None]:
# Checkpoint identifier passed to every `from_pretrained` / `pipeline` call
# that needs the untouched base model.
BASE_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

In [None]:
# Load the base checkpoint together with its tokenizer.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL)

# Both directions of the class mapping stored in the model config:
# label name -> class id, and class id -> label name.
label2id, id2label = model.config.label2id, model.config.id2label

print(f"label2id: {label2id}")
print(f"id2label: {id2label}")

In [None]:
# (model label name, sentiment name) pairs for the base checkpoint.
_MODEL_LABEL_PAIRS = (
    ("LABEL_0", "Negative"),
    ("LABEL_1", "Neutral"),
    ("LABEL_2", "Positive"),
)

# Bidirectional lookup: model label name -> sentiment name, followed by
# sentiment name -> model label name.
MODEL_LABEL_MAPPING = {
    **{model_label: sentiment for model_label, sentiment in _MODEL_LABEL_PAIRS},
    **{sentiment: model_label for model_label, sentiment in _MODEL_LABEL_PAIRS},
}

In [None]:
# Smoke-test the base checkpoint through a text-classification pipeline
# using three one-word inputs.
pipe = pipeline("text-classification", model=BASE_MODEL, device=0)

pred_good, pred_bad, pred_neutral = (pipe(word) for word in ("Good", "Bad", "Neutral"))

print(
    f"Positive prediction: {pred_good}",
    f"Negative prediction: {pred_bad}",
    f"Neutral prediction: {pred_neutral}",
    sep="\n",
)

DATASET

In [None]:
# Dataset identifier handed to `load_dataset` below.
DATASET = "zeroshot/twitter-financial-news-sentiment"

In [None]:
# Load the dataset named by DATASET and print it to inspect what was loaded.
ds = load_dataset(DATASET)
print(ds)

In [None]:
# Carve a fine-tuning validation set out of the original training split and
# repurpose the dataset's own "validation" split as the held-out test set.
#
# The guard keeps this cell idempotent: `ds` is rebound here, so an unguarded
# re-run would split the already reduced train set a second time and promote
# the previous fine-tuning validation split to "test".
# NOTE(review): assumes the freshly loaded dataset has no "test" split of its
# own (the cell below creates it) -- confirm against the printed `ds` above.
if "test" not in ds:
    split_ds = ds["train"].train_test_split(test_size=0.1, seed=42, shuffle=True)

    ds = DatasetDict({
        "train": split_ds["train"],
        "validation": split_ds["test"],  # 10% of train, for validation during fine-tuning
        "test": ds["validation"],        # the dataset's original validation split, used for testing
    })

In [None]:
def plot_feature_distribution(ds, feature_name: str):
    """Print and plot how often each value of a column occurs in every split.

    Args:
        ds: Mapping of split name -> dataset. Each dataset must provide
            ``to_pandas()``; the notebook passes a ``DatasetDict``.
        feature_name: Column whose value counts are compared across splits.

    Prints the categories-by-splits count table, then shows a grouped bar
    chart with one group per split and one bar per category.
    """
    # Value counts of the column, keyed by split name.
    per_split = {
        name: subset.to_pandas()[feature_name].value_counts()
        for name, subset in ds.items()
    }

    # Every category seen in any split, in sorted order.
    categories = sorted(set().union(*(vc.index.tolist() for vc in per_split.values())))

    # Rows are categories, columns are splits; a category missing from a split counts 0.
    df_counts = pd.DataFrame(
        {name: vc.reindex(categories, fill_value=0) for name, vc in per_split.items()},
        index=categories,
    )
    print("Counts DataFrame:\n", df_counts)

    # Flip so that splits sit on the x-axis and each category becomes one bar series.
    df_plot = df_counts.transpose()
    n_splits, n_categories = df_plot.shape
    positions = np.arange(n_splits)
    bar_width = 0.8 / n_categories

    fig, ax = plt.subplots()
    for offset, category in enumerate(df_plot.columns):
        ax.bar(positions + offset * bar_width, df_plot[category], bar_width, label=category)

    # Centre each tick under its group of bars.
    ax.set_xticks(positions + bar_width * (n_categories - 1) / 2)
    ax.set_xticklabels(df_plot.index, rotation=0)
    ax.set(
        xlabel="Split",
        ylabel="Count",
        title=f"Distribution of `{feature_name}` across splits",
    )
    ax.legend(title=feature_name)
    plt.tight_layout()
    plt.show()

plot_feature_distribution(ds, "label")

In [None]:
# Sentiment name -> label id as used by the dataset.
_DATASET_NAME_TO_ID = {"Positive": 1, "Negative": 0, "Neutral": 2}

# Bidirectional lookup: sentiment name -> dataset id, followed by
# dataset id -> sentiment name (ids in ascending order).
DATASET_LABEL_MAPPING = {
    **_DATASET_NAME_TO_ID,
    **{
        label_id: name
        for name, label_id in sorted(_DATASET_NAME_TO_ID.items(), key=lambda item: item[1])
    },
}

# Model label names for class ids 0, 1 and 2, in that order. Used as the fixed
# class order for confusion matrices and their axis tick labels.
DATASET_LABELS = [id2label.get(class_id) for class_id in (0, 1, 2)]

In [None]:
# DATASET TO MODEL LABEL MAPPING
# label2id: {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}
# id2label: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
#
# Dataset id -> sentiment name -> model label -> model id
#     0      ->   Negative     ->   LABEL_0   ->    0
#     1      ->   Positive     ->   LABEL_2   ->    2
#     2      ->   Neutral      ->   LABEL_1   ->    1
#
# WARNING: the remap swaps ids 1 and 2, so it is not idempotent. Running this
# cell twice on the same `ds` restores the dataset's original ids.
print(ds["train"][144])  # sample row before remapping


def encode_labels(example):
    """Rewrite ``example['label']`` from the dataset's id to the model's class id.

    The translation goes dataset id -> sentiment name (DATASET_LABEL_MAPPING)
    -> model label name (MODEL_LABEL_MAPPING) -> model class id (label2id).
    A label that cannot be translated ends in a ``KeyError`` from ``label2id``.
    """
    sentiment_name = DATASET_LABEL_MAPPING.get(example['label'])
    model_label_name = MODEL_LABEL_MAPPING.get(sentiment_name)
    example['label'] = label2id[model_label_name]
    return example


ds = ds.map(encode_labels)

print(ds["train"][144])  # same row after remapping

BASE MODEL EVALUATION

In [None]:
def evaluate_sentiment_model(
        pipe: pipeline,
        data: Dataset,
        text_field: str,
        true_label_field: str,
        verbose: bool = True):
    """Run a text-classification pipeline over a dataset and score its predictions.

    Relies on the notebook globals ``id2label`` (class id -> model label name)
    and ``DATASET_LABELS`` (model label names in class-id order).

    Args:
        pipe: Pipeline called on one text at a time; the ``"label"`` of its
            first result is taken as the prediction.
        data: Iterable of rows supporting ``row[field]`` lookups.
        text_field: Name of the field holding the input text.
        true_label_field: Name of the field holding the true class id.
        verbose: When true, print a progress line every 1000 rows.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``f1_weighted``, weighted
        ``precision`` and ``recall``, ``confusion_matrix`` (rows/columns in
        ``DATASET_LABELS`` order) and ``f1_per_class_0/1/2``.
    """
    y_true = []
    y_pred = []

    for idx, row in enumerate(data):
        text = row[text_field]
        if isinstance(text, str) and len(text) > 0:
            # Trim whitespace and cap the input at 2000 characters.
            text = text.strip()
            text = text[:2000]

        pred = pipe(text)[0]["label"]
        true = id2label[row[true_label_field]]

        y_true.append(true)
        y_pred.append(pred)

        if verbose and idx % 1000 == 0:
            print(f"Processed {idx} rows: {pred} vs {true}")

    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=DATASET_LABELS)

    # Per-class F1 (optional, for analysis). The class list is pinned so that
    # the result always has three entries in DATASET_LABELS order; without it
    # a class missing from both y_true and y_pred shortens the array and the
    # fixed indices below would raise IndexError or refer to the wrong class.
    f1_per_class = f1_score(
        y_true, y_pred, labels=DATASET_LABELS, average=None, zero_division=0
    )

    return {
        "accuracy": acc,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm,
        "f1_per_class_0": f1_per_class[0],
        "f1_per_class_1": f1_per_class[1],
        "f1_per_class_2": f1_per_class[2],
    }

# Dedicated pipeline for the untouched base checkpoint.
base_pipe = pipeline("text-classification", model=BASE_MODEL, device=0)

# Baseline metrics on the held-out test split. Use the pipeline built just
# above; the original passed the earlier smoke-test `pipe` and left
# `base_pipe` unused.
base_test_evals = evaluate_sentiment_model(base_pipe, ds["test"], "text", "label")

In [None]:
# Show the baseline metrics dictionary computed on the test split.
print(base_test_evals)

In [None]:
def plot_cm(cm, labels):
    """Show a confusion matrix as an annotated heatmap.

    Args:
        cm: Matrix of integer counts; rows are labelled as actual classes and
            columns as predicted classes.
        labels: Tick labels applied to both axes, in matrix order.
    """
    plt.figure(figsize=(8, 6))

    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )
    ax = sns.heatmap(cm, **heatmap_options)

    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('Actual Label')
    ax.set_title('Confusion Matrix')
    plt.show()

plot_cm(base_test_evals["confusion_matrix"], DATASET_LABELS)

In [None]:
# Same evaluation on the training split, using the `pipe` pipeline that was
# created from BASE_MODEL earlier in the notebook.
base_train_evals = evaluate_sentiment_model(pipe, ds["train"], "text", "label")

print(base_train_evals)

In [None]:
def get_failed_instances(data, pipe, text_field, true_label_field):
    """Find the rows a pipeline misclassifies.

    Relies on the notebook globals ``id2label`` and ``label2id``.

    Args:
        data: Iterable of rows supporting ``row[field]`` lookups.
        pipe: Pipeline called on one text at a time.
        text_field: Name of the field holding the input text.
        true_label_field: Name of the field holding the true class id.

    Returns:
        ``(row_indices, preds)``: positions of the misclassified rows and, in
        the same order, the class ids the pipeline predicted for them.
    """
    misclassified = []  # (row position, predicted class id)

    for position, record in enumerate(data):
        text = record[text_field]
        if isinstance(text, str) and text:
            # Trim whitespace and cap the input at 2000 characters.
            text = text.strip()[:2000]

        predicted = pipe(text)[0]["label"]
        if predicted == id2label[record[true_label_field]]:
            continue
        misclassified.append((position, label2id[predicted]))

    row_indices = [position for position, _ in misclassified]
    preds = [class_id for _, class_id in misclassified]
    return row_indices, preds

failed_train_indices, failed_train_preds = get_failed_instances(ds["train"], pipe, "text", "label")
print(f"Number of failed instances: {len(failed_train_indices)}")

In [None]:
# Misclassified training rows as a DataFrame, with the predicted class id
# attached in a `preds` column.
df_train_failed = (
    ds["train"]
    .select(failed_train_indices)
    .to_pandas()
    .assign(preds=failed_train_preds)
)

df_train_failed.head(10)

FIRST FINE TUNE 

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 512 tokens.

    No padding is applied here. Sequences keep their own length and are padded
    per batch by the ``DataCollatorWithPadding`` handed to the ``Trainer``.
    Padding every text to 512 tokens up front would make that collator a no-op
    and spend most of each forward pass on padding.
    """
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512)


def _prepare_split(split):
    """Tokenize one split, drop the raw text column and expose torch tensors."""
    tokenized = split.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(['text'])
    tokenized.set_format('torch')
    return tokenized


train_dataset = ds['train']
test_dataset = ds['test']
validation_dataset = ds['validation']

tokenized_train = _prepare_split(train_dataset)
tokenized_test = _prepare_split(test_dataset)
tokenized_validation = _prepare_split(validation_dataset)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics for the ``Trainer`` evaluation loop.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict with ``accuracy``, ``f1`` (macro F1), ``f1_weighted``, weighted
        ``precision`` and ``recall``, and ``f1_per_class_0/1/2``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)
    precision_weighted = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall_weighted = recall_score(labels, predictions, average='weighted', zero_division=0)

    # Per-class F1 (optional, for analysis). The class ids are pinned so the
    # array always has three entries in id order; otherwise a class absent
    # from both labels and predictions shortens it and the fixed indices
    # below raise IndexError or refer to the wrong class.
    f1_per_class = f1_score(
        labels, predictions, labels=[0, 1, 2], average=None, zero_division=0
    )

    return {
        "accuracy": acc,
        "f1": f1_macro,
        "f1_weighted": f1_weighted,
        "precision": precision_weighted,
        "recall": recall_weighted,
        "f1_per_class_0": f1_per_class[0],
        "f1_per_class_1": f1_per_class[1],
        "f1_per_class_2": f1_per_class[2],
    }


In [None]:
# Directory used as the Trainer output_dir and as the save/reload location
# for the first fine-tuning round.
output_dir = "round_1"

In [None]:
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,  # conservative learning rate for fine-tuning
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,  # upper bound; early stopping below can end training sooner
    weight_decay=0.1,  # weight decay for regularization
    save_strategy="epoch",
    eval_strategy="epoch",
    push_to_hub=False,
    load_best_model_at_end=True,
    gradient_accumulation_steps=1,  # no accumulation: one optimizer step per batch
    lr_scheduler_type="linear",  # linear learning rate schedule
    logging_steps=500,  # log every 500 steps
    warmup_steps=500,  # warmup phase to stabilize learning at the start
    gradient_checkpointing=True,  # gradient checkpointing to save memory
    max_grad_norm=1.0,  # gradient clipping
    fp16=True,  # mixed precision
    dataloader_pin_memory=True,
    dataloader_num_workers=12,
    optim="adamw_torch_fused",
    metric_for_best_model="f1",  # the macro F1 reported by compute_metrics
    greater_is_better=True,
)

# Early stopping: give up after 3 consecutive evaluations (one per epoch)
# in which the monitored metric does not improve by more than 0.01.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# Early stopping and best-checkpoint selection are driven by the fine-tuning
# validation split. The test split must stay unseen during training so the
# final test metrics are not biased by model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)
trainer.train()

In [None]:
# Write the trainer's model and the tokenizer side by side into output_dir
# so they can be reloaded together from that directory.
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Reload the saved model and tokenizer from output_dir under new names,
# leaving `model` and `tokenizer` untouched.
new_model = AutoModelForSequenceClassification.from_pretrained(output_dir)
new_tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [None]:
# Wrap the reloaded fine-tuned model and tokenizer in a text-classification
# pipeline (device=0, like the base pipeline).
new_pipe = pipeline(
    "text-classification",
    model=new_model,
    tokenizer=new_tokenizer,
    device=0
)

# Score it on the test split with the same routine used for the baseline.
new_test_evals = evaluate_sentiment_model(new_pipe, ds["test"], "text", "label")

In [None]:
# Show the test-split metrics of the fine-tuned model.
print(new_test_evals)

In [None]:
# Confusion matrix of the fine-tuned model on the test split.
plot_cm(new_test_evals["confusion_matrix"], DATASET_LABELS)

SUGAR: SYNTHETIC DATA GENERATION WITH SUGARDATA

In [None]:
# Collect the training rows that the fine-tuned pipeline (new_pipe) gets
# wrong. This rebinds failed_train_indices / failed_train_preds, replacing the
# lists computed earlier with the base pipeline.
failed_train_indices, failed_train_preds = get_failed_instances(ds["train"], new_pipe, "text", "label")
print(f"Number of failed instances: {len(failed_train_indices)}")

# Those rows as a DataFrame, with the predicted class id in a `preds` column.
df_train_failed = ds["train"].select(failed_train_indices).to_pandas()
df_train_failed["preds"] = failed_train_preds

df_train_failed.tail(10)

In [None]:
# Texts of the misclassified training rows whose true label is not 1.
# Class id 1 is LABEL_1, which MODEL_LABEL_MAPPING names "Neutral", so only
# non-neutral failures are kept.
is_not_neutral = df_train_failed["label"] != 1
examples = df_train_failed.loc[is_not_neutral, "text"].tolist()

print(f"Number of examples: {len(examples)}")

In [None]:
# Preview the first five collected texts.
examples[:5]

In [None]:
# Ask sugardata for synthetic sentiment samples restricted to the
# Positive/Negative labels, passing the misclassified texts as `examples`.
# NOTE(review): presumably `examples` serve as seed/style texts and
# `batch_size` controls generation batching -- confirm against the sugardata
# documentation.
results = su.generate_sentiment_data(
    language="en",
    examples=examples,
    label_options=["Positive", "Negative"],
    batch_size=64,
)

print(f"Number of generated examples: {len(results)}")

In [None]:
# Preview the first five generated items.
results[0:5]

In [None]:
def append_to_train(results, dataset=None):
    """Append generated examples to the training split.

    Args:
        results: Iterable of dicts with a ``"label"`` (sentiment name, a key
            of ``MODEL_LABEL_MAPPING``) and a ``"generated_text"``.
        dataset: Dataset dictionary whose ``"train"`` split is extended.
            Defaults to the notebook-level ``ds``.

    Returns:
        The same dataset dictionary, with ``"train"`` replaced by the
        concatenation of the old split and the new examples. It is returned
        unchanged when ``results`` is empty.

    Raises:
        ValueError: If a generated label cannot be translated to a model class
            id. Failing here avoids writing ``None`` labels into the training
            split, where they would only surface later during training.
    """
    target = ds if dataset is None else dataset

    new_examples = []
    for res in results:
        label = label2id.get(MODEL_LABEL_MAPPING.get(res["label"]))
        if label is None:
            raise ValueError(
                f"Cannot map generated label {res['label']!r} to a model class id"
            )
        new_examples.append({"text": res["generated_text"], "label": label})

    if not new_examples:
        # Nothing to add; leave the training split as it is.
        return target

    new_dataset = Dataset.from_list(new_examples)
    target["train"] = concatenate_datasets([target["train"], new_dataset])
    return target

ds = append_to_train(results)

In [None]:
# Label distribution per split after appending the generated examples.
plot_feature_distribution(ds, "label")

In [None]:
# Texts of all training rows with class id 0 (LABEL_0 / "Negative").
examples = [instance["text"] for instance in ds["train"] if instance["label"] == 0]

print(f"Number of negative examples: {len(examples)}")

In [None]:
# Generate synthetic samples with "Negative" as the only label option,
# passing the negative training texts as `examples`.
# NOTE(review): how sugardata uses `examples` is not visible here -- confirm
# against its documentation.
results = su.generate_sentiment_data(
    language="en",
    examples=examples,
    label_options=["Negative"],
    batch_size=64,
)

print(f"Number of generated examples: {len(results)}")

In [None]:
# Add the generated negative samples to the training split and inspect the result.
ds = append_to_train(results)

print(ds)

In [None]:
# Texts of all training rows with class id 2 (LABEL_2 / "Positive").
examples = [instance["text"] for instance in ds["train"] if instance["label"] == 2]

print(f"Number of positive examples: {len(examples)}")

In [None]:
# Generate synthetic samples with "Positive" as the only label option,
# passing the positive training texts as `examples`.
# NOTE(review): how sugardata uses `examples` is not visible here -- confirm
# against its documentation.
results = su.generate_sentiment_data(
    language="en",
    examples=examples,
    label_options=["Positive"],
    batch_size=64,
)

print(f"Number of generated examples: {len(results)}")

In [None]:
# Add the generated positive samples to the training split and inspect the result.
ds = append_to_train(results)

print(ds)

In [None]:
# Label distribution per split after all augmentation rounds.
plot_feature_distribution(ds, "label")

SECOND FINE TUNE

In [None]:
# Rebind `tokenizer` and `model` to freshly loaded copies of BASE_MODEL, so
# the second fine-tuning round starts from the base checkpoint rather than
# from the round-1 weights.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 512 tokens.

    No padding is applied here. Sequences keep their own length and are padded
    per batch by the ``DataCollatorWithPadding`` handed to the ``Trainer``.
    Padding every text to 512 tokens up front would make that collator a no-op
    and spend most of each forward pass on padding.
    """
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512)


def _prepare_split(split):
    """Tokenize one split, drop the raw text column and expose torch tensors."""
    tokenized = split.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(['text'])
    tokenized.set_format('torch')
    return tokenized


# Re-tokenize every split; the training split now includes the generated examples.
train_dataset = ds['train']
test_dataset = ds['test']
validation_dataset = ds['validation']

tokenized_train = _prepare_split(train_dataset)
tokenized_test = _prepare_split(test_dataset)
tokenized_validation = _prepare_split(validation_dataset)

In [None]:
# Directory used as the Trainer output_dir and as the save/reload location
# for the second fine-tuning round.
output_dir = "round_2"

In [None]:
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,  # conservative learning rate for fine-tuning
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,  # upper bound; early stopping below can end training sooner
    weight_decay=0.1,  # weight decay for regularization
    save_strategy="epoch",
    eval_strategy="epoch",
    push_to_hub=False,
    load_best_model_at_end=True,
    gradient_accumulation_steps=1,  # no accumulation: one optimizer step per batch
    lr_scheduler_type="linear",  # linear learning rate schedule
    logging_steps=500,  # log every 500 steps
    warmup_steps=500,  # warmup phase to stabilize learning at the start
    gradient_checkpointing=True,  # gradient checkpointing to save memory
    max_grad_norm=1.0,  # gradient clipping
    fp16=True,  # mixed precision
    dataloader_pin_memory=True,
    dataloader_num_workers=12,
    optim="adamw_torch_fused",
    metric_for_best_model="f1",  # the macro F1 reported by compute_metrics
    greater_is_better=True,
)

# Early stopping: give up after 3 consecutive evaluations (one per epoch)
# in which the monitored metric does not improve by more than 0.01.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# Early stopping and best-checkpoint selection are driven by the fine-tuning
# validation split. The test split must stay unseen during training so the
# final test metrics are not biased by model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)
trainer.train()

In [None]:
# Write the trainer's model and the tokenizer side by side into output_dir
# so they can be reloaded together from that directory.
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Reload the round-2 model and tokenizer from output_dir, rebinding
# new_model / new_tokenizer (previously the round-1 artifacts).
new_model = AutoModelForSequenceClassification.from_pretrained(output_dir)
new_tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [None]:
# Wrap the reloaded round-2 model and tokenizer in a text-classification
# pipeline (device=0, like the earlier pipelines).
new_pipe = pipeline(
    "text-classification",
    model=new_model,
    tokenizer=new_tokenizer,
    device=0
)

# Score it on the test split; this overwrites the round-1 `new_test_evals`.
new_test_evals = evaluate_sentiment_model(new_pipe, ds["test"], "text", "label")

print(new_test_evals)

In [None]:
# Confusion matrix of the round-2 model on the test split.
plot_cm(new_test_evals["confusion_matrix"], DATASET_LABELS)