## Infrastructure Setup

In [4]:
# Import all the modules
import nltk
from datasets import load_dataset, concatenate_datasets, Dataset
import importlib.resources
import os
from pydantic_ai import Agent
from pydantic_ai.models.google import GoogleModel
from pydantic_ai.providers.google import GoogleProvider
from pydantic import BaseModel, Field
import getpass
import nest_asyncio
import pandas as pd
from tqdm.notebook import tqdm
import random
import asyncio
import time
import re
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, Seq2SeqTrainer, T5Tokenizer, T5ForConditionalGeneration
import evaluate
import numpy as np

# Ensure the necessary NLTK resources are downloaded
nltk.download("punkt_tab")

# Check if the api key is set
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Avoid event loop issues in Jupyter notebooks
nest_asyncio.apply()

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Prepare data for training (one-time setup)

### Trim paragraphs into sentences

In [5]:
# Load the raw dataset
dataset = load_dataset("brando/small-c4-dataset")
dataset = concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)

# Split text into sentences
sentences = []
for data in tqdm(dataset):
    text = data["text"]
    for sentence in nltk.tokenize.sent_tokenize(text, language="english"):
        sentence_length = len(sentence.strip())
        if sentence_length < 50 or sentence_length > 100:
            continue
        if not re.match(r"^[a-zA-Z0-9.,!? '\"]*$", sentence):
            continue
        sentences.append(sentence)
sentences = random.sample(sentences, 50000)
print(f"Total sentences: {len(sentences)}")
print(f"Sample sentences: {sentences[:5]}")

README.md:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

Total sentences: 50000
Sample sentences: ['For a particular post, also take notes from existing and previous post holders.', 'The highly anticipated PBS Kids in the Park festival is tomorrow, Saturday, June 21.', 'Taliesin almost beat it out because of that issue.', 'Zinc includes 285,000 tonnes of refined zinc and 595,000 tonnes of zinc contained in concentrates.', 'Our Cartridges for Epson Expression Premium XP700 are great value with super fast delivery!']


### Generate training data for the model

In [None]:
# Define the Pydantic model for the antithesis generation
class Antithesis(BaseModel):
    result: str = Field(..., description="The antithesis of the original sentence.")
    reasoning: str = Field(
        ..., description="The reasoning behind the antithesis generation."
    )

# Initialize the LLM agent
model = GoogleModel("gemini-2.0-flash", provider=GoogleProvider())
agent = Agent(
    model=model,
    output_type=Antithesis,
    system_prompt=importlib.resources.read_text("prompts", "generate_antithesis.txt"),
)

# Function to generate antithesis for a given sentence
coroutines = []
for sentence in tqdm(sentences):
    coroutines.append(
        agent.run(f"Generate an antithesis for the following sentence: {sentence}")
    )

# Run all the coroutines concurrently
dataset = {"sentence": [], "antithesis": []}
for i in tqdm(range(0, len(coroutines), 50)):
    coroutines_batch = coroutines[i : i + 50]
    sentences_batch = sentences[i : i + 50]
    try:
        antithesis_batch = asyncio.run(
            asyncio.gather(*coroutines_batch, return_exceptions=True)
        )
        antithesis_batch = [result.output.result for result in antithesis_batch]
        dataset["sentence"].extend(sentences_batch)
        dataset["antithesis"].extend(antithesis_batch)
    except Exception as e:
        print(f"Error processing batch {i // 50}: {e}")
        continue
    time.sleep(2)

# Save the dataset to a CSV file
dataset = pd.DataFrame(dataset)
dataset.to_csv("data/antithesis_dataset.csv", index=False)

  0%|          | 0/50000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Error processing batch 142: 'UnexpectedModelBehavior' object has no attribute 'output'
Error processing batch 318: 'UnexpectedModelBehavior' object has no attribute 'output'
Error processing batch 473: 'UnexpectedModelBehavior' object has no attribute 'output'
Error processing batch 798: 'UnexpectedModelBehavior' object has no attribute 'output'
Error processing batch 928: 'UnexpectedModelBehavior' object has no attribute 'output'


## Train the model

### Load the training data and split it into train and test sets

In [None]:
# Load the dataset from the CSV file
dataset = pd.read_csv("data/antithesis_dataset.csv")
dataset.dropna(inplace=True)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(dataset)

# Split the dataset into train, validation, and test sets
dataset = dataset.train_test_split(test_size=0.2)
print(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")

Train size: 39797, Test size: 9950


### Tokenize the training data

In [7]:
# Define the prefix and tokenizer for the T5 model
prefix = "antithesis : "
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function to tokenize the dataset
def tokenize(data):
    inputs = [prefix + sentence for sentence in data["sentence"]]
    targets = [antithesis for antithesis in data["antithesis"]]
    return tokenizer(inputs, text_target=targets, max_length=512, truncation=True)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/39797 [00:00<?, ? examples/s]

Map:   0%|          | 0/9950 [00:00<?, ? examples/s]

### Define the evaluation metric

In [8]:
# Load the evaluation metric
metric = evaluate.load("sacrebleu")

# Function to postprocess the text for evaluation
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Function to compute the evaluation metrics
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds_sanitized = np.where(preds < 0, tokenizer.pad_token_id, preds)
    decoded_preds = tokenizer.batch_decode(preds_sanitized, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred_seq != tokenizer.pad_token_id) for pred_seq in preds_sanitized]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

### Define model and trainer

In [9]:
# Create a data collator for the T5 model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

# Create training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="antithesis_finetuned_checkpoints",
    eval_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none",
)

# Create the model for training
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Train the model

In [10]:
# Train the T5 model
trainer.train()

# Save the trained model
trainer.save_model("antithesis_finetuned")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.2308,1.962099,21.6761,17.1343
2,1.9214,1.861403,22.9548,17.0759
3,1.6792,1.8432,23.5149,17.0648




## Test the model

In [None]:
# Define constants
model_path = "antithesis_finetuned"
prefix = "antithesis : "

# Define tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(model_path, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Function to generate antithesis for a given sentence
def generate_antithesis(sentence):
    input_text = prefix + sentence
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)
    antithesis = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return antithesis

# Test the given text
texts = [
    "Love builds strong bridges.",
    "The sun shines brightly.",
    "Donald Trump is insane.",
    "High mountains are always covered in snow."
]
for text in texts:
    antithesis = generate_antithesis(text)
    print(f"Original: {text}\nAntithesis: {antithesis}\n")

Original: Love builds strong bridges.
Antithesis: Hate destroys weak bridges

Original: The sun shines brightly.
Antithesis: The moon darkens darkly

Original: Donald Trump is insane.
Antithesis: Donald Trump is wise

Original: High mountains are always covered in snow.
Antithesis: Low mountains are never exposed to sunshine



## Evaluate the performance

### Define the evaluator

In [None]:
# Define the Pydantic model for the score
class Score(BaseModel):
    score: float = Field(..., description="The score of the antithesis generation.")
    explanation: str = Field(..., description="The explanation of the score.")


# Initialize the LLM agent
agent = Agent(
    model=GoogleModel("gemini-2.0-flash", provider=GoogleProvider()),
    output_type=Score,
    system_prompt=importlib.resources.read_text("prompts", "evaluate_antithesis.txt"),
)


# Function to evaluate the antithesis generation
async def evaluate_antithesis(sentence, antithesis):
    prompt = f"Evaluate the score quantifies how perfectly they achieve structural antithesis between sentence1 and sentence2.\n\n"
    prompt += f"sentence1: {sentence}\n"
    prompt += f"sentence2: {antithesis}\n"
    result = await agent.run(prompt)
    return result.output.score, result.output.explanation

### Run evaluation

In [21]:
# Evaluate the antithesis generation on a small evaluation set
evaluation_set = random.sample(dataset["test"].to_list(), 100)

# Run the evaluation for the finetuned model
average_score = 0
for data in tqdm(evaluation_set):
    sentence = data["sentence"]
    antithesis = generate_antithesis(sentence)
    score, explanation = asyncio.run(evaluate_antithesis(sentence, antithesis))
    average_score += score / len(evaluation_set)
print(f"Average score for the finetuned model: {average_score:.4f}")

# Run the evaluation for the original model
average_score = 0
for data in tqdm(evaluation_set):
    sentence = data["sentence"]
    antithesis = data["antithesis"]
    score, explanation = asyncio.run(evaluate_antithesis(sentence, antithesis))
    average_score += score / len(evaluation_set)
print(f"Average score for the original model: {average_score:.4f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Average score for the finetuned model: 0.7785


  0%|          | 0/100 [00:00<?, ?it/s]

Average score for the original model: 0.7703
