In [None]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
# The module is imported under an alias because this notebook also binds the name
# `drive` to a path string; the alias keeps the module and the string from colliding.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

In [None]:
!pip install datasets
!pip install pydot --quiet
!pip install gensim --quiet
# !pip install sentencepiece

!pip install tensorflow==2.15.0 --quiet
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-text==2.15.0 --quiet


!pip install transformers==4.17 --quiet

!pip install -q evaluate
!pip install -q rouge_score

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import transformers

from transformers import (
    T5Tokenizer,
    TFT5ForConditionalGeneration,
    PegasusTokenizer,
    TFPegasusForConditionalGeneration,
)
from transformers import logging
logging.set_verbosity_error()

import sklearn as sk
import os

from sklearn.model_selection import train_test_split

from datasets import load_dataset, load_from_disk

import matplotlib.pyplot as plt

from pprint import pprint

import pickle

# Root folder on the mounted Google Drive. No trailing slash: the paths built from it
# in this notebook are all written as f'{drive}/<name>', which would otherwise contain '//'.
drive = '/content/drive/MyDrive'

In [None]:
def print_version(library_name):
    """Print the version of an importable library.

    Args:
        library_name: Import name of the module. Dotted names such as
            ``'package.submodule'`` resolve to the submodule itself.

    Prints ``"<name> version: <version>"`` using the module's ``__version__``
    attribute (or a placeholder when it has none), ``"<name> not installed."``
    when the import fails, and ``"An error occurred: <exc>"`` for any other error.
    """
    # Imported locally so this cell does not rely on imports made in another cell.
    import importlib

    try:
        # import_module returns the named module itself, whereas
        # __import__('a.b') returns the top-level package 'a'.
        lib = importlib.import_module(library_name)
        version = getattr(lib, '__version__', 'Version number not found')
        print(f"{library_name} version: {version}")
    except ImportError:
        print(f"{library_name} not installed.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
# Report the installed versions of the core libraries used below.
for library_name in ('numpy', 'transformers', 'tensorflow', 'keras'):
    print_version(library_name)

In [None]:
# 95th-percentile sequence lengths (see tokenizing_analysis.ipynb).
# Each line holds the (input, target) pair for one tokenizer and one data split.
# Only the TRAIN values are passed as tokenizer max lengths in this notebook.
T5_TRAIN_INPUT_95TH_PERCENTILE, T5_TRAIN_TARGET_95TH_PERCENTILE = 1046, 104
T5_VAL_INPUT_95TH_PERCENTILE, T5_VAL_TARGET_95TH_PERCENTILE = 1044, 103
T5_TEST_INPUT_95TH_PERCENTILE, T5_TEST_TARGET_95TH_PERCENTILE = 1044, 103

PEGASUS_TRAIN_INPUT_95TH_PERCENTILE, PEGASUS_TRAIN_TARGET_95TH_PERCENTILE = 945, 93
PEGASUS_VAL_INPUT_95TH_PERCENTILE, PEGASUS_VAL_TARGET_95TH_PERCENTILE = 943, 93
PEGASUS_TEST_INPUT_95TH_PERCENTILE, PEGASUS_TEST_TARGET_95TH_PERCENTILE = 944, 92

In [None]:
# We downloaded this already. Don't need to load this again.
# tldr_dataset = load_dataset("webis/tldr-17", trust_remote_code=True, split='train')

# tldr_train_dataset, tldr_test_dataset = load_dataset("webis/tldr-17", trust_remote_code=True, split=['train[:70%]', 'train[70%:]'])
# tldr_dataset.save_to_disk(f'/{drive}/tldr_dataset')

In [None]:
# We already pickled our test split. Don't need to load this again.
# tldr_dataset_local = load_from_disk(f'{drive}/tldr_dataset')
# len(tldr_dataset_local)

In [None]:
# We already pickled our test split. Don't need to do this again.
# X = tldr_dataset_local['content']
# y = tldr_dataset_local['summary']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: This is the same size as the test set size.
#   0.25 of the size to split * 0.8 of the original data = 0.2.
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# print(f"Training set size: {len(X_train)}")
# print(f"Validation set size: {len(X_val)}")
# print(f"Test set size: {len(X_test)}")

In [None]:
# We pickled this already. Don't need to do this again.
# with open(f'{drive}/train_test_split.pkl', 'wb') as file:
    # pickle.dump((X_train, X_val, X_test, y_train, y_val, y_test), file)

In [None]:
# Restore the cached train/validation/test split from Drive.
# NOTE: pickle.load can execute code embedded in the file; only load pickles you created.
split_path = f'{drive}/train_test_split.pkl'
with open(split_path, 'rb') as split_file:
    X_train, X_val, X_test, y_train, y_val, y_test = pickle.load(split_file)

In [None]:
# Show the number of inputs and targets in every split (they should match pairwise).
for split_name, features, targets in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name} set size: {len(features)}, {len(targets)}")

In [None]:
# Pretrained checkpoints used in this notebook. Comment out the loads for
# whichever model is not being tested at the moment.
t5_model_name = 't5-base'
pegasus_model_name = 'google/pegasus-xsum'

# T5: tokenizer and TF model from the same checkpoint.
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)

# PEGASUS: tokenizer and TF model from the same checkpoint.
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = TFPegasusForConditionalGeneration.from_pretrained(pegasus_model_name)

In [None]:
# Experimental wrapper models. These were used for experimentation only, and not in the final training run.

def _build_seq2seq_training_wrapper(seq2seq_model, learning_rate, max_length_input, max_length_output):
    """Wrap a seq2seq model in a compiled functional Keras model that outputs its logits."""
    # Shapes are written as 1-tuples: `(n)` is just the integer `n`, not a tuple.
    input_ids = layers.Input(shape=(max_length_input,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(max_length_input,), dtype=tf.int32, name='attention_mask')
    # This input is fed to the model as `decoder_input_ids`, but its Keras name is 'labels'.
    decoder_input_ids = layers.Input(shape=(max_length_output,), dtype=tf.int32, name='labels')

    # Element 0 of the model output is used as the logits.
    logits = seq2seq_model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
    )[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[logits], name="Output")

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model


def build_t5_training_wrapper_model(t5_model, learning_rate=5e-5, max_length_input=T5_TRAIN_INPUT_95TH_PERCENTILE, max_length_output=T5_TRAIN_TARGET_95TH_PERCENTILE):
    """Build the experimental Keras training wrapper around a T5 model.

    Args:
        t5_model: Model called with `input_ids`, `attention_mask` and `decoder_input_ids`.
        learning_rate: Learning rate for the Adam optimizer.
        max_length_input: Fixed length of the `input_ids` / `attention_mask` inputs.
        max_length_output: Fixed length of the decoder input (Keras input name 'labels').

    Returns:
        A compiled `tf.keras.models.Model` with three inputs and the logits as its
        single output, using sparse categorical cross-entropy on logits and an
        'accuracy' metric.
    """
    return _build_seq2seq_training_wrapper(t5_model, learning_rate, max_length_input, max_length_output)


def build_pegasus_training_wrapper_model(pegasus_model, learning_rate=5e-5, max_length_input=PEGASUS_TRAIN_INPUT_95TH_PERCENTILE, max_length_output=PEGASUS_TRAIN_TARGET_95TH_PERCENTILE):
    """Build the experimental Keras training wrapper around a PEGASUS model.

    Args:
        pegasus_model: Model called with `input_ids`, `attention_mask` and `decoder_input_ids`.
        learning_rate: Learning rate for the Adam optimizer.
        max_length_input: Fixed length of the `input_ids` / `attention_mask` inputs.
        max_length_output: Fixed length of the decoder input (Keras input name 'labels').

    Returns:
        A compiled `tf.keras.models.Model` with three inputs and the logits as its
        single output, using sparse categorical cross-entropy on logits and an
        'accuracy' metric.
    """
    return _build_seq2seq_training_wrapper(pegasus_model, learning_rate, max_length_input, max_length_output)

In [None]:
def preprocess_data(text_pairs, tokenizer, model, max_length, max_length_output, is_t5=True):
    """Tokenize (source, target) text pairs into fixed-length model inputs and labels.

    Args:
        text_pairs: Iterable of ``(source_text, target_text)`` pairs. It is
            materialised once, so one-shot iterables such as a bare ``zip`` work.
        tokenizer: Object providing ``batch_encode_plus`` and, for the
            non-T5 path, ``pad_token_id``.
        model: Used only when ``is_t5`` is true, for its ``_shift_right`` method.
        max_length: Length the source texts are padded/truncated to.
        max_length_output: Length the target texts are padded/truncated to.
        is_t5: When true, decoder inputs come from ``model._shift_right``;
            otherwise the PEGASUS path is taken.

    Returns:
        ``([input_ids, attention_mask, decoder_input_ids], label_ids)``.
        ``input_ids`` and ``attention_mask`` are int32 NumPy arrays and
        ``label_ids`` is a NumPy array of the target token ids.

    NOTE(review): ``label_ids`` keeps the tokenizer's padding ids as they are;
    whether padded positions should count toward the loss depends on the loss
    used downstream - confirm.
    """
    # Read the pairs once; iterating `text_pairs` twice would leave the second
    # pass empty for generators and other one-shot iterables.
    pairs = list(text_pairs)
    orig_text = [orig for orig, _ in pairs]
    target_text = [target for _, target in pairs]

    orig_encoded = tokenizer.batch_encode_plus(
        orig_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    orig_input_ids = np.array(orig_encoded["input_ids"], dtype="int32")
    orig_attention_masks = np.array(orig_encoded["attention_mask"], dtype="int32")

    target_encoded = tokenizer.batch_encode_plus(
        target_text,
        max_length=max_length_output,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    label_ids = np.array(target_encoded['input_ids'])

    # Decoder inputs are the labels shifted one position to the right.
    if is_t5:
        decoder_input_ids = model._shift_right(label_ids)
    else:
        # Assume this is PEGASUS: prepend a column of pad ids and drop the last label column.
        start_column = tf.fill([label_ids.shape[0], 1], tokenizer.pad_token_id)
        decoder_input_ids = tf.concat([start_column, label_ids[:, :-1]], axis=-1)

    preprocessed_data = [orig_input_ids, orig_attention_masks, decoder_input_ids]

    return preprocessed_data, label_ids

In [None]:
# Task prefix prepended to every T5 input text.
prefix = "summarize: "

# To experiment on a subset, define subset_size (e.g. 10000) and add
# [:subset_size] to each train, val, test split.
# subset_size = 10000


def _add_prefix(texts):
    """Return a list with `prefix` prepended to each text."""
    return [f"{prefix}{text}" for text in texts]


# Uncomment whichever model we want to test.

# NOTE: We already pickled this because this takes a very long time. See later cells.
t5_train_input_prefixes = _add_prefix(X_train)
t5_val_input_prefixes = _add_prefix(X_val)
t5_test_input_prefixes = _add_prefix(X_test)

# T5 - Train: tokenize the prefixed inputs together with their target summaries.
t5_train_inputs, t5_target_input_ids = preprocess_data(
    text_pairs=list(zip(t5_train_input_prefixes, y_train)),
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=T5_TRAIN_INPUT_95TH_PERCENTILE,
    max_length_output=T5_TRAIN_TARGET_95TH_PERCENTILE,
)

# Pickle for faster use later. We already did this.
# t5_train_inputs_path = "tldr_preprocessed_data/t5_train_inputs.pkl"
# t5_train_target_path = "tldr_preprocessed_data/t5_target_input_ids.pkl"

# with open(t5_train_inputs_path, 'wb') as f:
#     pickle.dump(t5_train_inputs, f)

# with open(t5_train_target_path, 'wb') as f:
#     pickle.dump(t5_target_input_ids, f)

In [None]:
# PEGASUS - Train: tokenize the raw inputs (no task prefix) with their target summaries.

# NOTE: We already pickled this because this takes a very long time. See later cells.
pegasus_train_inputs, pegasus_target_input_ids = preprocess_data(
    text_pairs=list(zip(X_train, y_train)),
    tokenizer=pegasus_tokenizer,
    model=pegasus_model,
    max_length=PEGASUS_TRAIN_INPUT_95TH_PERCENTILE,
    max_length_output=PEGASUS_TRAIN_TARGET_95TH_PERCENTILE,
    is_t5=False,
)

# Pickle for faster use later. We already did this.
# pegasus_train_inputs_path = "tldr_preprocessed_data/pegasus_train_inputs.pkl"
# pegasus_train_target_path = "tldr_preprocessed_data/pegasus_target_input_ids.pkl"

# with open(pegasus_train_inputs_path, 'wb') as f:
#     pickle.dump(pegasus_train_inputs, f)

# with open(pegasus_train_target_path, 'wb') as f:
#     pickle.dump(pegasus_target_input_ids, f)

In [None]:
# T5 - Val: tokenized with the same TRAIN max lengths as the training split,
# so both splits share one input shape.

# NOTE: We already pickled this because this takes a very long time. See later cells.
t5_val_inputs, t5_val_target_input_ids = preprocess_data(
    text_pairs=list(zip(t5_val_input_prefixes, y_val)),
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=T5_TRAIN_INPUT_95TH_PERCENTILE,
    max_length_output=T5_TRAIN_TARGET_95TH_PERCENTILE,
)

# Pickle for faster use later. We already did this.
# t5_val_inputs_path = "tldr_preprocessed_data/t5_val_inputs.pkl"
# t5_val_target_path = "tldr_preprocessed_data/t5_val_target_input_ids.pkl"

# with open(t5_val_inputs_path, 'wb') as f:
#     pickle.dump(t5_val_inputs, f)

# with open(t5_val_target_path, 'wb') as f:
#     pickle.dump(t5_val_target_input_ids, f)


# NOTE: If we want to ignore reprocessing the data, comment them out and uncomment this.
# with open(t5_train_inputs_path, 'rb') as f:
#     t5_train_inputs = pickle.load(f)

# with open(t5_train_target_path, 'rb') as f:
#     t5_target_input_ids = pickle.load(f)

# with open(t5_val_inputs_path, 'rb') as f:
#     t5_val_inputs = pickle.load(f)

# with open(t5_val_target_path, 'rb') as f:
#     t5_val_target_input_ids = pickle.load(f)


# Bundle the preprocessed arrays into the dicts used for training and validation.
# The shifted decoder inputs (index 2 of the preprocessed lists) are not included.
t5_train_dataset = dict(
    input_ids=t5_train_inputs[0],
    attention_mask=t5_train_inputs[1],
    labels=t5_target_input_ids,
)

t5_val_dataset = dict(
    input_ids=t5_val_inputs[0],
    attention_mask=t5_val_inputs[1],
    labels=t5_val_target_input_ids,
)

# Sanity check: number of label rows in each dataset.
print(len(t5_train_dataset['labels']))
print(len(t5_val_dataset['labels']))

In [None]:
# PEGASUS - Val: tokenized with the same TRAIN max lengths as the training split,
# so both splits share one input shape.
pegasus_val_inputs, pegasus_val_target_input_ids = preprocess_data(
    text_pairs=list(zip(X_val, y_val)),
    tokenizer=pegasus_tokenizer,
    model=pegasus_model,
    max_length=PEGASUS_TRAIN_INPUT_95TH_PERCENTILE,
    max_length_output=PEGASUS_TRAIN_TARGET_95TH_PERCENTILE,
    is_t5=False,
)

# Pickle for faster use later. We already did this.
# pegasus_val_inputs_path = "tldr_preprocessed_data/pegasus_val_inputs.pkl"
# pegasus_val_target_path = "tldr_preprocessed_data/pegasus_val_target_input_ids.pkl"


# with open(pegasus_val_inputs_path, 'wb') as f:
#     pickle.dump(pegasus_val_inputs, f)

# with open(pegasus_val_target_path, 'wb') as f:
#     pickle.dump(pegasus_val_target_input_ids, f)


# NOTE: If we want to ignore reprocessing the data, comment them out and uncomment this.
# with open(pegasus_train_inputs_path, 'rb') as f:
#     pegasus_train_inputs = pickle.load(f)

# with open(pegasus_train_target_path, 'rb') as f:
#     pegasus_target_input_ids = pickle.load(f)

# with open(pegasus_val_inputs_path, 'rb') as f:
#     pegasus_val_inputs = pickle.load(f)

# with open(pegasus_val_target_path, 'rb') as f:
#     pegasus_val_target_input_ids = pickle.load(f)


# Create Datasets for training.
# The shifted decoder inputs (index 2 of the preprocessed lists) are left out.
pegasus_train_dataset = {
    'input_ids': pegasus_train_inputs[0],
    'attention_mask': pegasus_train_inputs[1],
    # 'decoder_input_ids': pegasus_train_inputs[2],
    'labels': pegasus_target_input_ids
}

pegasus_val_dataset = {
    'input_ids': pegasus_val_inputs[0],
    'attention_mask': pegasus_val_inputs[1],
    # 'decoder_input_ids': pegasus_val_inputs[2],
    'labels': pegasus_val_target_input_ids
}

# Sanity check: print the number of label rows per dataset (as the T5 cell does)
# instead of dumping the label arrays, and print each dataset once.
print(len(pegasus_train_dataset['labels']))
print(len(pegasus_val_dataset['labels']))

In [None]:
import time

class TimeHistory(tf.keras.callbacks.Callback):
    """Keras callback that reports how long each epoch and the whole run took.

    Attributes set while training:
        epoch_times: List of per-epoch durations in seconds, reset in on_train_begin.
        total_training_time: Duration in seconds of the run, set in on_train_end.
    """

    def on_train_begin(self, logs=None):
        # perf_counter is meant for measuring elapsed time; unlike time.time()
        # it is not affected by adjustments of the system clock.
        self.train_start_time = time.perf_counter()
        self.epoch_times = []

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start_time = time.perf_counter()

    def on_epoch_end(self, epoch, logs=None):
        epoch_time = time.perf_counter() - self.epoch_start_time
        self.epoch_times.append(epoch_time)
        # `epoch` is zero-based, so report it one-based.
        print(f"Epoch {epoch + 1} training time: {epoch_time:.2f} seconds")

    def on_train_end(self, logs=None):
        total_training_time = time.perf_counter() - self.train_start_time
        # Keep the total on the instance so it can be read after fit() returns.
        self.total_training_time = total_training_time
        print(f"Total training time: {total_training_time:.2f} seconds")


In [None]:
def _weights_only_checkpoint(filepath):
    """Return a ModelCheckpoint callback that stores only the model weights at `filepath`."""
    return tf.keras.callbacks.ModelCheckpoint(filepath=filepath, save_weights_only=True)


# Early stopping: watch the validation loss, stop after 3 epochs without
# improvement and restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    mode='min',
    verbose=1,
    patience=3,
    monitor='val_loss',
)

# NOTE(review): the filename templates below need a 'val_accuracy' entry in the
# epoch logs - confirm the compiled metric is reported under that name.

# T5 per-epoch weight checkpoints.
t5_checkpoint_dir = f'{drive}/t5_fine_tuned_model_checkpoints/'
t5_checkpoint_filepath = t5_checkpoint_dir + 't5_reddit_tldr_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
t5_model_checkpoint_callback = _weights_only_checkpoint(t5_checkpoint_filepath)

# PEGASUS per-epoch weight checkpoints.
pegasus_checkpoint_dir = f'{drive}/pegasus_fine_tuned_model_checkpoints/'
pegasus_checkpoint_filepath = pegasus_checkpoint_dir + 'pegasus_reddit_tldr_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
pegasus_model_checkpoint_callback = _weights_only_checkpoint(pegasus_checkpoint_filepath)

In [None]:
# Training hyperparameters.
epochs = 3
batch_size = 8
learning_rate = 5e-5

# Just to make sure we're always starting from the base.
# NOTE(review): the T5 name/tokenizer and the PEGASUS name/tokenizer/model are
# re-created here, but `t5_model` is not reloaded - confirm that is intended.
t5_model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
pegasus_model_name = 'google/pegasus-xsum'
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = TFPegasusForConditionalGeneration.from_pretrained(pegasus_model_name)

# NOTE: Below is the experimental training wrapper model training.

# Build the training wrapper model.
# t5_model_wrapper = build_t5_training_wrapper_model(t5_model)

# Define time callback.
# Timing callback for T5 training runs.
t5_time_callback = TimeHistory()

# optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# t5_model.compile(optimizer=optimizer, loss=loss)

# Define custom ROUGE callback. Metric is only available in keras 3. Hardware was incompatible.
# After numerous attempts at this, we decided to forego this.
# t5_ROUGE_callback = ROUGECallback(
#     model=t5_model,
#     tokenizer=t5_tokenizer,
#     val_data=t5_val_dataset,
#     log_dir='./logs'
# )

# history = t5_model.fit(
#     train_dataset,
#     validation_data=t5_val_dataset,
#     # batch_size=batch_size,
#     epochs=epochs,
#     verbose=1,
#     callbacks=[early_stopping, t5_model_checkpoint_callback, t5_time_callback]
# )

# history = t5_model_wrapper.fit(
#     t5_train_dataset,
#     t5_train_dataset['labels'],
#     # train_dataset,
#     # t5_train_dataset['labels'],
#     # validation_data=val_dataset,
#     validation_data=t5_val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     verbose=1,
#     callbacks=[early_stopping, t5_model_checkpoint_callback, t5_time_callback]
# )

# PEGASUS - Change the name t5 here to generic model wrapper.
# pegasus_model_wrapper = build_pegasus_training_wrapper_model(pegasus_model)
# Timing callback for PEGASUS training runs.
pegasus_time_callback = TimeHistory()

# history = pegasus_model_wrapper.fit(
#     pegasus_train_dataset,
#     pegasus_train_dataset['labels'],
#     validation_data=pegasus_val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     verbose=1,
#     callbacks=[early_stopping, pegasus_model_checkpoint_callback, pegasus_time_callback]
# )

In [None]:
# NOTE: Below is the model training without the wrapper model. Comment/Uncomment the model info that you need.

# Start from a freshly loaded pretrained checkpoint.
model = TFPegasusForConditionalGeneration.from_pretrained(pegasus_model_name)
# model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)

# No `loss` is passed to compile. The optimizer uses the shared `learning_rate`
# hyperparameter instead of repeating the literal value.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy']
)


# The input dict already contains 'labels'; the same array is also passed as the target.
history = model.fit(
    pegasus_train_dataset,
    pegasus_train_dataset['labels'],
    validation_data=pegasus_val_dataset,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[pegasus_model_checkpoint_callback, pegasus_time_callback]
)

# history = model.fit(
#     t5_train_dataset,
#     t5_train_dataset['labels'],
#     validation_data=t5_val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     verbose=1,
#     callbacks=[t5_model_checkpoint_callback, t5_time_callback]
#     # callbacks=[early_stopping, t5_model_checkpoint_callback, t5_time_callback]

In [None]:
print("Training History:", history.history)

# Pickle the per-epoch metrics to Drive.
# history_path = f'{drive}/Training_10Epochs/t5_training_history.pkl'
history_path = f'{drive}/Training_20Epochs/pegasus_training_history.pkl'

# Create the target folder first so a missing directory cannot make the write
# fail after training has already finished.
os.makedirs(os.path.dirname(history_path), exist_ok=True)
with open(history_path, 'wb') as f:
    pickle.dump(history.history, f)

In [None]:
# Persist the fine-tuned model twice: the weights as an .h5 file, and the
# directory written by save_pretrained.
weights_path = f'{drive}/pegasus_fine_tuned_weights.h5'
pretrained_dir = f'{drive}/pegasus_fine_tuned'
model.save_weights(weights_path)
model.save_pretrained(pretrained_dir)