<a href="https://colab.research.google.com/github/llw0111/Assignment-1-Development-/blob/main/SentSimplificationBART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

# Load the train/validation CSVs as Hugging Face datasets.
# preprocess() below reads the 'Normal' (source) and 'Simple' (target) columns.
train_dataset = Dataset.from_csv("/content/newsela_train.csv")
val_dataset = Dataset.from_csv("/content/newsela_test.csv")

# Task prefix prepended to every source sentence by preprocess().
# NOTE(review): this is a T5-style convention; the model loaded in this cell is
# facebook/bart-base, which presumably does not need it — confirm before relying on it.
prefix = "simplify: "

# Preprocess function
def preprocess(examples): # Changed to handle batches
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(..., batched=True)``.
            ``examples['Normal']`` holds the source sentences and
            ``examples['Simple']`` the target simplifications.

    Returns:
        The tokenizer output for the prefixed sources (truncated to 128
        tokens) with the target token ids stored under ``'labels'``.
    """
    inputs = [prefix + ex for ex in examples['Normal']]

    # No padding here: the DataCollatorForSeq2Seq used downstream pads every
    # batch to its own longest sequence. Padding inside map() would instead pad
    # each row to the longest sentence of the whole map chunk and store it.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Labels stay padded to a common length with the pad token so that they
    # can be decoded as-is. Positions holding the pad token should be masked
    # to -100 before a loss is computed on them.
    labels = tokenizer(text_target=examples['Simple'], max_length=128, truncation=True, padding=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Load tokenizer and model
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenize both splits; batched=True hands preprocess() a dict of column lists.
train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


def _collate_for_training(features, _collate=data_collator, _pad_token_id=tokenizer.pad_token_id):
    """Collate ``features`` and hide pad-token label positions from the loss.

    preprocess() stores labels padded with the tokenizer's pad token. The loss
    only ignores the value -100, so without this step padding would be scored
    like real target tokens. Masking happens after collation so that anything
    the collator derives from the labels is left untouched.

    The collator and pad id are bound as defaults so that re-running another
    cell that reassigns the globals does not change this trainer's behavior.
    """
    batch = _collate(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == _pad_token_id, -100)
    return batch


# Define training args
# NOTE(review): the output directory says "BERT" although the checkpoint is BART;
# the path is kept unchanged so existing checkpoints are still found.
training_args = TrainingArguments(
    output_dir="./BERT-simplification",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable reporting to services like W&B
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=_collate_for_training,
)

trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Map:   0%|          | 0/102711 [00:00<?, ? examples/s]

Map:   0%|          | 0/1414 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Notebook shell command: install the packages used by the evaluation cells
# (textstat for FKGL, evaluate for SARI).
# NOTE(review): sacrebleu and sacremoses are presumably dependencies of the
# `evaluate` SARI metric — confirm.
!pip install textstat evaluate sacrebleu sacremoses
import nltk
# Download the 'punkt_tab' tokenizer data.
# NOTE(review): presumably required by nltk's word_tokenize, which the metrics cell uses — confirm.
nltk.download('punkt_tab')

Collecting textstat
  Downloading textstat-0.7.8-py3-none-any.whl.metadata (15 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading textstat-0.7.8-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 k

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import textstat
import numpy as np
from evaluate import load
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq # Import DataCollatorForSeq2Seq

# Load the SARI metric from the evaluate library. It is kept as a module-level
# global because calculate_all_metrics_in_batches() below reads `sari_metric` directly.
sari_metric = load("sari")

# No data collator is created at cell level: calculate_all_metrics_in_batches()
# builds its own DataCollatorForSeq2Seq from the tokenizer and trainer.model it is given.

# Function to calculate all metrics (BLEU, FKGL, SARI) in batches
def calculate_all_metrics_in_batches(trainer, dataset, tokenizer, batch_size=8,
                                     source_column='Normal', reference_column='Simple'):
    """Generate simplifications for ``dataset`` and score them with BLEU, FKGL and SARI.

    Relies on the module-level ``sari_metric`` object for the SARI computation.

    Args:
        trainer: Object whose ``model`` attribute is used for generation.
        dataset: Tokenized dataset providing the ``input_ids``,
            ``attention_mask`` and ``labels`` columns plus the raw-text
            ``source_column`` and ``reference_column``.
        tokenizer: Tokenizer used to collate batches and to decode token ids.
        batch_size: Number of examples generated per step.
        source_column: Column holding the original (complex) sentences.
        reference_column: Column holding the reference simplifications.

    Returns:
        Tuple ``(avg_bleu, avg_fkgl, avg_sari)``. BLEU is the mean sentence-level
        score of each prediction against its decoded label, FKGL the mean
        Flesch-Kincaid grade of the predictions that could be scored, and SARI
        the score over all examples. Each value is 0 when nothing was scored.
    """
    # Imported locally so the function does not depend on a later cell-level import.
    import torch

    model = trainer.model

    # Select only the columns needed for the model input and labels
    dataset_for_dataloader = dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    # shuffle=False keeps batch i aligned with rows [i * batch_size, (i + 1) * batch_size)
    # of the raw-text columns read below.
    dataloader = DataLoader(dataset_for_dataloader, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

    # SARI needs the untokenized source and reference sentences.
    original_complex_sentences = dataset[source_column]
    reference_simple_sentences = dataset[reference_column]

    all_bleu_scores = []
    all_fkgl_scores = []
    sari_sources = []
    sari_predictions = []
    sari_references = []

    # Generation must run in eval mode: after an interrupted or unfinished
    # training run the model can still be in train mode with dropout active.
    was_training = model.training
    model.eval()
    try:
        for i, batch in enumerate(dataloader):
            # generate() only needs the encoder inputs, not the labels.
            inputs = {k: batch[k].to(model.device) for k in ('input_ids', 'attention_mask')}
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=128,
                    num_beams=4,
                    early_stopping=True
                )
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # The collator pads labels with -100, which is not a valid token id;
            # map it back to the pad token before decoding.
            labels = batch['labels']
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            start = i * batch_size
            batch_original_complex = list(original_complex_sentences[start:start + batch_size])
            batch_reference_simple = list(reference_simple_sentences[start:start + batch_size])

            # SARI is computed once after the loop so that every example
            # carries the same weight even when the last batch is shorter.
            sari_sources.extend(batch_original_complex)
            sari_predictions.extend(decoded_preds)
            sari_references.extend([ref] for ref in batch_reference_simple)

            for pred, label in zip(decoded_preds, decoded_labels):
                # Unsmoothed sentence-level BLEU: it evaluates to 0 (with an NLTK
                # warning) when a prediction shares no higher-order n-gram with
                # its reference.
                reference_tokens = [word_tokenize(label)]
                candidate_tokens = word_tokenize(pred)
                all_bleu_scores.append(sentence_bleu(reference_tokens, candidate_tokens))

                try:
                    fkgl = textstat.flesch_kincaid_grade(pred)
                except Exception:
                    # Best effort: skip predictions textstat cannot score, but
                    # let KeyboardInterrupt/SystemExit propagate.
                    continue
                if not np.isnan(fkgl):
                    all_fkgl_scores.append(fkgl)
    finally:
        # Leave the model in the mode the caller had it in.
        if was_training:
            model.train()

    avg_bleu = np.mean(all_bleu_scores) if all_bleu_scores else 0
    avg_fkgl = np.mean(all_fkgl_scores) if all_fkgl_scores else 0
    if sari_predictions:
        # The metric expects a list of references for each prediction.
        avg_sari = sari_metric.compute(
            sources=sari_sources,
            predictions=sari_predictions,
            references=sari_references,
        )['sari']
    else:
        avg_sari = 0

    return avg_bleu, avg_fkgl, avg_sari

# Calculate all metrics on the validation set in batches.
# The function above uses torch (torch.no_grad); importing it before the call
# is sufficient because the name is resolved when the function runs.
import torch
# trainer, val_dataset and tokenizer come from the training cell, which must have been run first.
avg_bleu, avg_fkgl, avg_sari = calculate_all_metrics_in_batches(trainer, val_dataset, tokenizer)

# Report the three averaged scores.
print(f"Average BLEU score on validation set: {avg_bleu}")
print(f"Average FKGL score on validation set: {avg_fkgl}")
print(f"Average SARI score on validation set: {avg_sari}")

Downloading builder script: 0.00B [00:00, ?B/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score on validation set: 0.27139280506351515
Average FKGL score on validation set: 7.803708293059663
Average SARI score on validation set: 41.792413403768094


In [None]:
import ast

import pandas as pd


def _first_simplification(raw):
    """Return the first reference sentence from one 'simplifications' cell.

    Assumes the cell holds a stringified Python list such as
    ``"['first reference', 'second reference']"`` (the previous code stripped a
    leading ``['``) — TODO confirm against the CSV. Parsing the literal keeps
    commas inside the sentence and handles items quoted with double quotes,
    whereas splitting on "," cut the sentence at its first comma or left a
    trailing quote behind.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to the previous heuristic.
        return raw.split(",")[0].removeprefix("['")
    if isinstance(parsed, (list, tuple)):
        return str(parsed[0]) if parsed else ""
    return str(parsed)


df = pd.read_csv("/content/wikilarge_test.csv")

# Keep a single reference per source sentence and use the lower-case column
# names that the WikiLarge training and metrics cells read.
df['simple'] = df['simplifications'].apply(_first_simplification)
df['normal'] = df['original']

df[['simple','normal']].to_csv("/content/wikilarge_test1.csv", index=False)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset

# Load the WikiLarge train CSV and the validation CSV as Hugging Face datasets.
# wikilarge_test1.csv is the file written by the pandas cell above ('simple'/'normal' columns).
train_dataset = Dataset.from_csv("/content/wikilarge_train.csv")
val_dataset = Dataset.from_csv("/content/wikilarge_test1.csv")

# Preprocess function
def preprocess(examples): # Changed to handle batches
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(..., batched=True)``.
            ``examples['normal']`` holds the source sentences and
            ``examples['simple']`` the target simplifications.

    Returns:
        The tokenizer output for the sources (truncated to 128 tokens) with
        the target token ids stored under ``'labels'``.
    """
    inputs = list(examples['normal'])

    # No padding here: the DataCollatorForSeq2Seq used downstream pads every
    # batch to its own longest sequence. Padding inside map() would instead pad
    # each row to the longest sentence of the whole map chunk and store it.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Labels stay padded to a common length with the pad token so that they
    # can be decoded as-is. Positions holding the pad token should be masked
    # to -100 before a loss is computed on them.
    labels = tokenizer(text_target=examples['simple'], max_length=128, truncation=True, padding=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Load tokenizer and model
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenize both splits; batched=True hands preprocess() a dict of column lists.
train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


def _collate_for_training(features, _collate=data_collator, _pad_token_id=tokenizer.pad_token_id):
    """Collate ``features`` and hide pad-token label positions from the loss.

    preprocess() stores labels padded with the tokenizer's pad token. The loss
    only ignores the value -100, so without this step padding would be scored
    like real target tokens. Masking happens after collation so that anything
    the collator derives from the labels is left untouched.

    The collator and pad id are bound as defaults so that re-running another
    cell that reassigns the globals does not change this trainer's behavior.
    """
    batch = _collate(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == _pad_token_id, -100)
    return batch


# Define training args
training_args = TrainingArguments(
    output_dir="./bart-simplification_wikilarge",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable reporting to services like W&B
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=_collate_for_training,
)

trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/296402 [00:00<?, ? examples/s]

Map:   0%|          | 0/359 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import textstat
import numpy as np
from evaluate import load
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq # Import DataCollatorForSeq2Seq

# Load the SARI metric from the evaluate library. It is kept as a module-level
# global because calculate_all_metrics_in_batches() below reads `sari_metric` directly.
sari_metric = load("sari")

# No data collator is created at cell level: calculate_all_metrics_in_batches()
# builds its own DataCollatorForSeq2Seq from the tokenizer and trainer.model it is given.

# Function to calculate all metrics (BLEU, FKGL, SARI) in batches
def calculate_all_metrics_in_batches(trainer, dataset, tokenizer, batch_size=8,
                                     source_column='normal', reference_column='simple'):
    """Generate simplifications for ``dataset`` and score them with BLEU, FKGL and SARI.

    Relies on the module-level ``sari_metric`` object for the SARI computation.

    Args:
        trainer: Object whose ``model`` attribute is used for generation.
        dataset: Tokenized dataset providing the ``input_ids``,
            ``attention_mask`` and ``labels`` columns plus the raw-text
            ``source_column`` and ``reference_column``.
        tokenizer: Tokenizer used to collate batches and to decode token ids.
        batch_size: Number of examples generated per step.
        source_column: Column holding the original (complex) sentences.
        reference_column: Column holding the reference simplifications.

    Returns:
        Tuple ``(avg_bleu, avg_fkgl, avg_sari)``. BLEU is the mean sentence-level
        score of each prediction against its decoded label, FKGL the mean
        Flesch-Kincaid grade of the predictions that could be scored, and SARI
        the score over all examples. Each value is 0 when nothing was scored.
    """
    # Imported locally so the function does not depend on a later cell-level import.
    import torch

    model = trainer.model

    # Select only the columns needed for the model input and labels
    dataset_for_dataloader = dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    # shuffle=False keeps batch i aligned with rows [i * batch_size, (i + 1) * batch_size)
    # of the raw-text columns read below.
    dataloader = DataLoader(dataset_for_dataloader, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

    # SARI needs the untokenized source and reference sentences.
    original_complex_sentences = dataset[source_column]
    reference_simple_sentences = dataset[reference_column]

    all_bleu_scores = []
    all_fkgl_scores = []
    sari_sources = []
    sari_predictions = []
    sari_references = []

    # Generation must run in eval mode: after an interrupted or unfinished
    # training run the model can still be in train mode with dropout active.
    was_training = model.training
    model.eval()
    try:
        for i, batch in enumerate(dataloader):
            # generate() only needs the encoder inputs, not the labels.
            inputs = {k: batch[k].to(model.device) for k in ('input_ids', 'attention_mask')}
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=128,
                    num_beams=4,
                    early_stopping=True
                )
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # The collator pads labels with -100, which is not a valid token id;
            # map it back to the pad token before decoding.
            labels = batch['labels']
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            start = i * batch_size
            batch_original_complex = list(original_complex_sentences[start:start + batch_size])
            batch_reference_simple = list(reference_simple_sentences[start:start + batch_size])

            # SARI is computed once after the loop so that every example
            # carries the same weight even when the last batch is shorter.
            sari_sources.extend(batch_original_complex)
            sari_predictions.extend(decoded_preds)
            sari_references.extend([ref] for ref in batch_reference_simple)

            for pred, label in zip(decoded_preds, decoded_labels):
                # Unsmoothed sentence-level BLEU: it evaluates to 0 (with an NLTK
                # warning) when a prediction shares no higher-order n-gram with
                # its reference.
                reference_tokens = [word_tokenize(label)]
                candidate_tokens = word_tokenize(pred)
                all_bleu_scores.append(sentence_bleu(reference_tokens, candidate_tokens))

                try:
                    fkgl = textstat.flesch_kincaid_grade(pred)
                except Exception:
                    # Best effort: skip predictions textstat cannot score, but
                    # let KeyboardInterrupt/SystemExit propagate.
                    continue
                if not np.isnan(fkgl):
                    all_fkgl_scores.append(fkgl)
    finally:
        # Leave the model in the mode the caller had it in.
        if was_training:
            model.train()

    avg_bleu = np.mean(all_bleu_scores) if all_bleu_scores else 0
    avg_fkgl = np.mean(all_fkgl_scores) if all_fkgl_scores else 0
    if sari_predictions:
        # The metric expects a list of references for each prediction.
        avg_sari = sari_metric.compute(
            sources=sari_sources,
            predictions=sari_predictions,
            references=sari_references,
        )['sari']
    else:
        avg_sari = 0

    return avg_bleu, avg_fkgl, avg_sari

# Calculate all metrics on the validation set in batches.
# The function above uses torch (torch.no_grad); importing it before the call
# is sufficient because the name is resolved when the function runs.
import torch
# trainer, val_dataset and tokenizer come from the training cell, which must have been run first.
avg_bleu, avg_fkgl, avg_sari = calculate_all_metrics_in_batches(trainer, val_dataset, tokenizer)

# Report the three averaged scores.
print(f"Average BLEU score on validation set: {avg_bleu}")
print(f"Average FKGL score on validation set: {avg_fkgl}")
print(f"Average SARI score on validation set: {avg_sari}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score on validation set: 0.11059985909211038
Average FKGL score on validation set: 10.480386073999606
Average SARI score on validation set: 48.32810403017347
