In [2]:
# !pip install -q transformers==v4.37.1 sentencepiece datasets accelerate

In [3]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from datasets import load_dataset, load_metric
from transformers import MT5ForConditionalGeneration, AutoTokenizer, DataCollatorForSeq2Seq

In [7]:
# Fetch the grammar-correction corpus by its Hugging Face dataset id.
dataset = load_dataset("Owishiboo/grammar-correction")

Downloading readme:   0%|          | 0.00/108 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Inspect the column names and row count of the train split.
dataset['train']

Dataset({
    features: ['Unnamed: 0', 'input', 'target'],
    num_rows: 6004
})

In [9]:
# Peek at the corrected sentence of the first training example.
dataset['train'][0]['target']

'New technology has been introduced to society .'

In [None]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
pretrained_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)
model = MT5ForConditionalGeneration.from_pretrained(pretrained_checkpoint)

In [None]:
# Register "<s>" as the beginning-of-sequence token; preprocessing prepends it
# to every target sentence.
special_tokens = {"bos_token":"<s>"}
num_added_tokens = tokenizer.add_special_tokens(special_tokens)

# The model needs an embedding row for every id the tokenizer can produce.
# Grow the embedding matrix only when the extended vocabulary no longer fits,
# so a matrix that is already large enough is left untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Display the special tokens now known to the tokenizer.
tokenizer.all_special_tokens

# Model output test

In [8]:
# Sanity check: run generation on a single sentence pair and show the decoded text.
input = """ New and new technology has been introduced to the society ."""
output = """New technology has been introduced to society ."""
inputs = tokenizer(input, text_target=output, return_tensors="pt")

# Generate from the source ids only, then decode the first (and only) sequence.
generated_ids = model.generate(inputs.input_ids, max_length=50)
decoded_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
decoded_texts[0]

'<extra_id_0>'

# Dataset splits

In [11]:
# Fixed seed so the train/validation/test partition is identical on every run.
# Without it a fresh kernel draws a different partition, so a checkpoint trained
# in one session could later be evaluated on sentences it was trained on.
SPLIT_SEED = 42

# Hold out 400 examples from the training data ...
datasets_train_test = dataset["train"].train_test_split(test_size=400, seed=SPLIT_SEED)
# ... and cut the held-out part into two halves of 200: its "train" half is used
# as the validation set and its "test" half as the final test set.
datasets_train_validation = (
    datasets_train_test["test"]
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(test_size=200, seed=SPLIT_SEED)
)

In [12]:
# Inspect the first split: remaining training examples vs. the 400 held-out ones.
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'input', 'target'],
        num_rows: 5604
    })
    test: Dataset({
        features: ['Unnamed: 0', 'input', 'target'],
        num_rows: 400
    })
})

In [13]:
# Inspect how the held-out examples were divided into two halves.
datasets_train_validation

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'input', 'target'],
        num_rows: 200
    })
    test: Dataset({
        features: ['Unnamed: 0', 'input', 'target'],
        num_rows: 200
    })
})

In [14]:
# Publish the final partitions: train and validation are stored in `dataset`,
# while the test examples are kept in their own variable.
held_out_halves = datasets_train_validation
dataset["train"] = datasets_train_test["train"]
dataset["validation"], test_dataset = held_out_halves["train"], held_out_halves["test"]

In [15]:
# Show the three final partitions side by side to verify their sizes.
dataset["train"], dataset["validation"],test_dataset

(Dataset({
     features: ['Unnamed: 0', 'input', 'target'],
     num_rows: 5604
 }),
 Dataset({
     features: ['Unnamed: 0', 'input', 'target'],
     num_rows: 200
 }),
 Dataset({
     features: ['Unnamed: 0', 'input', 'target'],
     num_rows: 200
 }))

In [16]:
max_input_length = 512
max_target_length = 512

def preprocess_data(data):
    """Tokenize a batch of (input, target) sentence pairs for seq2seq training.

    Args:
        data: batch mapping with an "input" and a "target" column, each an
            iterable of strings.

    Returns:
        The tokenizer output for the "input" sentences, extended with a
        "labels" entry that holds the token ids of the "<s>"-prefixed targets.

    Both sides are tokenized with truncation to their maximum length and with
    padding enabled.
    """
    shared_options = {"padding": True, "truncation": True}

    source_sentences = data["input"]

    # Every target is prefixed with the "<s>" marker before tokenization.
    prefixed_targets = []
    for sentence in data["target"]:
        prefixed_targets.append("<s>" + sentence)

    source_encoding = tokenizer(source_sentences,
                                max_length=max_input_length,
                                **shared_options)

    target_encoding = tokenizer(prefixed_targets,
                                max_length=max_target_length,
                                **shared_options)

    source_encoding["labels"] = target_encoding["input_ids"]
    return source_encoding

In [None]:
# Apply preprocess_data to every split stored in `dataset`, passing batches of
# examples (batched=True) rather than one example at a time.
tokenized_dataset = dataset.map(preprocess_data,
                                batched=True)

In [16]:
# List the splits that were tokenized.
tokenized_dataset.keys()

dict_keys(['train', 'validation'])

In [18]:
# Drop the raw-text and index columns; only the columns added during
# tokenization are kept.
columns_to_drop = ['input', 'target', 'Unnamed: 0']
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop)

In [19]:
# Pad label sequences with the tokenizer's pad id instead of the collator's
# default of -100. The training loss is created with
# ignore_index=tokenizer.pad_token_id, so label padding added by the collator
# must use that same id to be excluded from the loss; a -100 label would be an
# invalid class index for that criterion.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
)

# Fine tuning

In [20]:
# parameters
BATCH_SIZE = 8
LR = 0.001
WEIGHT_DECAY = 1e-5
BETAS = (0.9,0.999)
EPSILON = 1e-8
EPOCHS = 20
EARLY_STOP_PATIENCE = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# Dataloaders
dataloader_train = torch.utils.data.DataLoader(dataset=tokenized_dataset['train'], collate_fn=data_collator, batch_size=BATCH_SIZE)
dataloader_val = torch.utils.data.DataLoader(dataset=tokenized_dataset['validation'], collate_fn=data_collator, batch_size=BATCH_SIZE)

In [None]:
# Set up tensorboard writer
writer = SummaryWriter()

# Define AdamW optimizer and ReduceLROnPlateau scheduler
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY, betas=BETAS, eps=EPSILON)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)
# Positions whose label equals the pad token id do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Initialize early stopping variables
best_val_loss = float('inf')
early_stopping_counter = 0
model.to(DEVICE)

# Number of optimizer steps taken so far, across all epochs. Used as the
# x-axis for the per-batch training loss so points from the same epoch do
# not all land on the same step.
global_step = 0

# try/finally guarantees the writer is flushed and closed even when the loop
# is interrupted or raises.
try:
    # Training loop
    for epoch in range(EPOCHS):
        print("Train")
        model.train()  # the mode only changes between phases, so set it once per phase
        total_loss = 0

        for train_batch in tqdm(dataloader_train):
            train_batch_inputs = {k: v.to(DEVICE) for k, v in train_batch.items()}

            optimizer.zero_grad()
            logits = model(**train_batch_inputs).logits
            # Flatten (batch, sequence) so every token position is one classification sample.
            loss = criterion(logits.view(-1, logits.shape[-1]), train_batch_inputs["labels"].view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            writer.add_scalar('Train Loss', loss.item(), global_step)
            global_step += 1

        average_loss = total_loss / len(dataloader_train)
        print(f'Epoch {epoch + 1}/{EPOCHS}, Train Loss: {average_loss}')

        print("Val")
        model.eval()
        total_val_loss = 0

        # No gradients are needed for validation; skipping autograd saves memory and time.
        with torch.no_grad():
            for val_batch in tqdm(dataloader_val):
                val_batch_inputs = {k: v.to(DEVICE) for k, v in val_batch.items()}

                val_logits = model(**val_batch_inputs).logits
                val_loss = criterion(val_logits.view(-1, val_logits.shape[-1]), val_batch_inputs["labels"].view(-1))

                total_val_loss += val_loss.item()

        average_val_loss = total_val_loss / len(dataloader_val)
        # Log the validation loss itself (previously the last *training* batch loss
        # was written under this tag), once per epoch.
        writer.add_scalar('Validation Loss', average_val_loss, epoch)
        print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {average_val_loss}')

        # Update learning rate using the scheduler
        lr_scheduler.step(average_val_loss)

        # Check for early stopping
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= EARLY_STOP_PATIENCE:
                print(f'Early stopping after {epoch + 1} epochs without improvement.')
                # NOTE(review): this is the only place the model is saved; a run that
                # finishes all EPOCHS without triggering early stopping saves nothing.
                model.save_pretrained(f'/content/drive/MyDrive/gujju-gpt/Models/mT5-gujarati-summarization_ep{epoch+1}')
                break
finally:
    # Close tensorboard writer
    writer.close()

In [24]:
# Correct a sentence that contains several spelling and grammar mistakes.
input = """becase if study in this way of cours , I will gante so much , I also will have alot of information the book I study ."""
# Tokenize the source sentence only: generation below is given nothing but
# input_ids, and passing text_target=output would make this cell depend on
# whatever value `output` was left with by a previously executed cell.
inputs = tokenizer(input, return_tensors="pt")
output = tokenizer.batch_decode(model.to('cpu').generate(inputs.input_ids,max_length=50),skip_special_tokens=True)[0]
output

'Because if studying occurs in this way, I will learn so much ; I also will have a lot of information from the book I study.'

In [26]:
# Correct a sentence with a misused homophone.
input = """I think they're going to love they're new apartment."""
# Tokenize the source sentence only: generation below is given nothing but
# input_ids, and passing text_target=output would make this cell depend on
# whatever value `output` was left with by a previously executed cell.
inputs = tokenizer(input, return_tensors="pt")
output = tokenizer.batch_decode(model.to('cpu').generate(inputs.input_ids,max_length=50),skip_special_tokens=True)[0]
output

"I think they're going to love the new buildings."

# Test results

In [26]:
# Reload model weights from a local checkpoint directory.
model = MT5ForConditionalGeneration.from_pretrained(r"mT5-\mT5-ep19-20240216T113959Z-001")
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
# The training tokenizer was extended with "<s>" as bos token, which is prepended
# to every target sentence. Register it again so this freshly loaded tokenizer
# has the same vocabulary as the one used for fine-tuning.
num_added_tokens = tokenizer.add_special_tokens({"bos_token": "<s>"})



In [25]:
# Correct a single sentence and display the decoded result.
input = """I think they're going to love they're new apartment."""
inputs = tokenizer(input, return_tensors="pt")

# Generate on CPU from the source ids, then decode the first (and only) sequence.
cpu_model = model.to('cpu')
generated_ids = cpu_model.generate(inputs.input_ids, max_length=50)
output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
output

"I think they're going to love the new buildings."

In [28]:
# Accumulators filled by the evaluation loop and read by the ROUGE cell.
# NOTE: despite the names, `incorrect` collects the reference target sentences
# and `correct` collects the sentences generated by the model.
incorrect = []
correct = []

In [30]:
output_file = "mT5-results-ep29.txt"

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every sample to the lists shared with the ROUGE cell.
incorrect.clear()
correct.clear()

# Move the model to the CPU once instead of on every iteration, and make sure
# it is in inference mode.
cpu_model = model.to('cpu')
cpu_model.eval()

with open(output_file, 'w', encoding='utf-8') as file:
    for i, item in enumerate(test_dataset):
        inputs = item['input']
        targets = item['target']
        model_input = tokenizer(inputs, return_tensors="pt").to('cpu')
        result = tokenizer.batch_decode(
            cpu_model.generate(model_input.input_ids,max_length=50),
            skip_special_tokens=True)[0]

        # `incorrect` stores the reference sentence, `correct` the generated one.
        incorrect.append(targets)
        correct.append(result)

        # Write to the file in the desired format
        file.write(f'Sample No. : {i}\n')
        file.write(f'Inputs: {inputs}\n')
        file.write(f'Targets: {targets}\n')
        file.write(f'Generated: {result}\n\n')

In [32]:
# notebook workaround if you get error : "A UTF-8 locale is required. Got ANSI_X3.4-1968" while using !pip install
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [27]:
from rouge_score import rouge_scorer

# Create a RougeScorer object
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# `incorrect` holds the reference targets and `correct` the generated sentences
# collected by the evaluation loop. Fail with a clear message instead of a
# ZeroDivisionError (no samples) or a silently truncated zip (unequal lengths).
if len(incorrect) != len(correct):
    raise ValueError(
        f"Mismatched evaluation lists: {len(incorrect)} references vs {len(correct)} predictions.")
if not incorrect:
    raise ValueError("No evaluation samples found; run the evaluation loop before computing ROUGE.")

# Initialize variables to accumulate scores
measures = ("precision", "recall", "fmeasure")
total_scores = {metric: dict.fromkeys(measures, 0) for metric in ("rouge1", "rouge2", "rougeL")}

# Iterate over samples and accumulate scores
for reference, prediction in zip(incorrect, correct):
    scores = scorer.score(reference, prediction)
    for metric, totals in total_scores.items():
        for measure in measures:
            totals[measure] += getattr(scores[metric], measure)

# Calculate average scores
num_samples = len(incorrect)
average_scores = {
    metric: {measure: totals[measure] / num_samples for measure in measures}
    for metric, totals in total_scores.items()
}

# Print the aggregated ROUGE scores
print("Aggregated ROUGE Scores:")
print(average_scores)


NameError: name 'incorrect' is not defined