In [1]:
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM
import tensorflow as tf
import pandas as pd
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Load Data

In [13]:
df = pd.read_csv('news summary.csv')
df.head()

Unnamed: 0,Article,Summary,Topic
0,ad sales boost time warner profit quarterly...,timewarner said fourth quarter sales rose 2% t...,business
1,dollar gains on greenspan speech the dollar...,the dollar has hit its highest level against t...,business
2,yukos unit buyer faces loan claim the owner...,yukos' owner menatep group says it will ask ro...,business
3,high fuel prices hit ba's profits british a...,"rod eddington, ba's chief executive, said the ...",business
4,pernod takeover talk lifts domecq shares in...,pernod has reduced the debt it took on to fund...,business


In [14]:
# Tells T5 what task to do
df['Article'] = 'summarize: ' + df['Article']
df['Summary'] = df['Summary'] + 'EOS'
df.head()

Unnamed: 0,Article,Summary,Topic
0,summarize: ad sales boost time warner profit ...,timewarner said fourth quarter sales rose 2% t...,business
1,summarize: dollar gains on greenspan speech ...,the dollar has hit its highest level against t...,business
2,summarize: yukos unit buyer faces loan claim ...,yukos' owner menatep group says it will ask ro...,business
3,summarize: high fuel prices hit ba's profits ...,"rod eddington, ba's chief executive, said the ...",business
4,summarize: pernod takeover talk lifts domecq ...,pernod has reduced the debt it took on to fund...,business


In [15]:
df_subset = df[:100]
X = df_subset['Article'].values
y = df_subset['Summary']

## Split Data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Tokenize

In [17]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
train = tokenizer(list(x_train), max_length=512, truncation=True)
test = tokenizer(list(x_test), max_length=512, truncation=True)

# Don't need to attention maks the targets
y_train = tokenizer(list(y_train), max_length=256, truncation=True)["input_ids"]
y_test = tokenizer(list(y_test), max_length=256, truncation=True)["input_ids"]

# def key_relable(old_dict, key_suffix):
#     new_dict = {key+key_suffix:value for (key, value) in old_dict.items()}
#     del old_dict
#     return new_dict

# # Relable keys for identifiability
# train = key_relable(train, '_article')
# test = key_relable(test, '_article')


train.keys()

dict_keys(['input_ids', 'attention_mask'])

## Create Tensorflow Dataset

In [18]:
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-small')
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors='tf')

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [32]:
tokenizer('hello how are you')

{'input_ids': [21820, 149, 33, 25, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [19]:
# train_article_ds = Dataset.from_dict(x_train)
# train_summary_ds = Dataset.from_dict(y_train)

# test_article_ds = Dataset.from_dict(x_test)
# test_article_ds = Dataset.from_dict(y_test)

train['labels'] = y_train
test['labels'] = y_test

train_ds = Dataset.from_dict(train)
test_ds = Dataset.from_dict(test)

In [20]:
train_dataset = train_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=4
)

test_dataset = train_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=4
)

# Process Dataset

In [21]:
model.summary()

Model: "tft5_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (TFSharedEmbeddings)  multiple                 16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  18881280  
                                                                 
 decoder (TFT5MainLayer)     multiple                  25175808  
                                                                 
Total params: 60,506,624
Trainable params: 60,506,624
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(optimizer="Adam")
model.fit(train_dataset, validation_data=test_dataset, epochs=50)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
 3/20 [===>..........................] - ETA: 53s - loss: 0.0556

KeyboardInterrupt: 

# Generate Text

In [52]:
def generate(input_text, model=model):
    if type(input_text) == str:
        input_text = [input_text]
    input_tensors = tokenizer(input_text, return_tensors='tf', padding=True, truncation=True)['input_ids']

    generated_sequence_ids = model.generate(input_tensors)

    sequences = tokenizer.batch_decode(generated_sequence_ids)
    return sequences

['<pad> <extra_id_0> disability perspectives have become familiar in feminist approaches to some philosophical topics, such as models and standards',
 '<pad> Hilbert’s approach raised fascinating metamathematical questions—from semantic completeness through']

In [46]:
s = model.generate(tokenizer(['Hi i went to the store today'], return_tensors='tf')['input_ids'])

TypeError: batch_decode() missing 1 required positional argument: 'sequences'