# Off the shelf results with T5

In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer,  set_seed

In [3]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-base')
base_tokenizer = T5Tokenizer.from_pretrained('t5-base')

Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 2.02MB/s]
Downloading tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 2.63MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/t

## Abstractive summarization

In [8]:
preprocess_text = """Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco Bay
Charlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematics
at Johns Hopkins University before transitioning to education. He spent several years conducting lectures
on data science at Johns Hopkins University and at the General Assembly before founding his own startup,
Kylie.ai, which uses artificial intelligence to build chatbots from historical transcripts.
After completing a Fellowship at the Y Combinator accelerator, Sinan spent most of his time working on
his fast-growing company, while creating educational material for data science.
""".strip().replace('\n','')

preprocess_text

'Sinan Ozdemir is a data scientist, startup founder, and educator living in the San Francisco BayCharlie; cat, Euclid; and bearded dragon, Fiero. He spent his academic career studying pure mathematicsat Johns Hopkins University before transitioning to education. He spent several years conducting lectureson data science at Johns Hopkins University and at the General Assembly before founding his own startup,Kylie.ai, which uses artificial intelligence to build chatbots from historical transcripts.After completing a Fellowship at the Y Combinator accelerator, Sinan spent most of his time working onhis fast-growing company, while creating educational material for data science.'

In [11]:
set_seed(100)

t5_prepared_text = "summarize: " + preprocess_text

input_ids = base_tokenizer.encode(t5_prepared_text, return_tensors='pt')

# summarize
summary_ids = base_model.generate(
    input_ids,
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=30,
    max_length=50,
    early_stopping=True
)

output = base_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(output)

Sinan Ozdemir is a data scientist, startup founder, and educator. he founded his own startup, Kylie.ai, which uses artificial intelligence to build chatbots.


## English -> German Translation

In [26]:
set_seed(100)
input_ids = base_tokenizer.encode('translate English to Spanish: Where is the chocolate?', return_tensors='pt')
# input_ids = base_tokenizer('translate English to Spanish: Where is the chocolate?', return_tensors='pt').input_ids

translate_ids = base_model.generate(
    input_ids,
    num_beams=4,
    no_repeat_ngram_size=3,
    max_length=20,
    early_stopping=True
)

output = base_tokenizer.decode(translate_ids[0], skip_special_tokens=True)


labels = base_tokenizer('Wo ist die Schokolade?', return_tensors='pt').input_ids

loss = base_model(
    input_ids=input_ids, labels=labels
).loss

print(output)
print(loss)

Wo ist die Schokolade?
tensor(0.0881, grad_fn=<NllLossBackward0>)


## CoLA: The Corpus of Linguistic Acceptability

In [38]:
set_seed(100)
input_ids = base_tokenizer.batch_encode_plus(
    ['cola sentence: Where is the chocolate?', 'cola sentence: Where be a chocolates?'],
     padding=True, truncation=True, return_tensors='pt'
).input_ids

cola_ids_batch = base_model.generate(
    input_ids,
    num_beams=4,
    no_repeat_ngram_size=3,
    max_length=20,
    early_stopping=True
)

batch_generated_texts = [base_tokenizer.decode(cola_ids, skip_special_tokens=True) for cola_ids in cola_ids_batch]


print(batch_generated_texts)

['acceptable', 'unacceptable']


## STSB - Semantic Text Similarity Benchmark
Are two sentences semantically similar? 0-5

In [41]:
set_seed(100)
sentence_1 = 'How to fish'
sentence_2 = 'Fishing Manual for beginners'
sentence_3 = 'Fish is tasty'
sentence_4 = 'The teacher Mr. Fisherman'

batch = [
    f"stsb sentence1: {sentence_1} sentence2: {sentence_1}",
    f"stsb sentence1: {sentence_1} sentence2: {sentence_2}",
    f"stsb sentence1: {sentence_1} sentence2: {sentence_3}",
    f"stsb sentence1: {sentence_1} sentence2: {sentence_4}"
]


input_ids = base_tokenizer.batch_encode_plus(
    batch, padding=True, truncation=True, return_tensors='pt'
).input_ids

stsb_ids_batch = base_model.generate(
    input_ids,
    max_length=5,
    early_stopping=True
)

batch_generated_texts = [base_tokenizer.decode(stsb_ids, skip_special_tokens=True) for stsb_ids in stsb_ids_batch]


print(batch_generated_texts)



['5.0', '3.2', '3.2', '2.8']


## MNLI - Multi-Genre Natural Language Inference
Whether a premise implies ("entailment"), contradicts ("contradiction"), or neither ("neutral") a hypothesis

In [44]:
set_seed(100)
sentence_1 = 'I am active in politics'
sentence_2 = 'I am running for mayor'
sentence_3 = 'I hate politics'
sentence_4 = 'I play as a center back'

batch = [
    f"mnli premise: {sentence_1}. hypothesis: {sentence_2}",
    f"mnli premise: {sentence_1}. hypothesis: {sentence_3}",
    f"mnli premise: {sentence_1}. hypothesis: {sentence_4}"
]


input_ids = base_tokenizer.batch_encode_plus(
    batch, padding=True, truncation=True, return_tensors='pt'
).input_ids

mnli_ids_batch = base_model.generate(
    input_ids,
    max_length=5,
    early_stopping=True
)

batch_generated_texts = [base_tokenizer.decode(mnli_ids, skip_special_tokens=True) for mnli_ids in mnli_ids_batch]


print(batch_generated_texts)

['entailment', 'contradiction', 'neutral']


## Q/A - Question/Answering

In [45]:
input_ids = base_tokenizer.encode(
    'question: Where does Sinan live? context: Sinan lives in California but Matt lives in Boston', 
    return_tensors='pt'
)

# summarize
qa_ids = base_model.generate(
    input_ids,
    early_stopping=True
)

output = base_tokenizer.decode(qa_ids[0], skip_special_tokens=True)

print(output)



California


# Using T5 for abstractive summarization

In [1]:
from transformers import pipeline, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq, T5Tokenizer

import pandas as pd
from datasets import Dataset
import torch
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-base')
base_tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
reviews = pd.read_csv('./data/reviews.csv')

def add_punc(s):
    if str(s)[-1] not in ['.','!','?']:
        s = str(s) + '.'
    return s

reviews['Summary'] = reviews['Summary'].map(add_punc)
reviews

Unnamed: 0,Text,Summary,Score
0,Great taffy at a great price. There was a wid...,Great taffy.,5
1,This taffy is so good. It is very soft and ch...,"Wonderful, tasty taffy.",5
2,Right now I'm mostly just sprouting this so my...,Yay Barley.,5
3,This is a very healthy dog food. Good for thei...,Healthy Dog Food.,5
4,good flavor! these came securely packed... the...,fresh and greasy!,4
...,...,...,...
96507,We need this for a recipe my wife is intereste...,a-ok.,4
96508,This product is great. Gives you so much ener...,Great Cafe Latte.,5
96509,My only complaint is that there's so much of i...,Very large ground spice jars.,5
96510,Great for sesame chicken..this is a good if no...,Will not do without.,5


In [4]:
reviews = reviews[
    (reviews['Summary'].str.len() < 100) & (reviews['Summary'].str.len() >= 30)
].reset_index(drop=True)

reviews

Unnamed: 0,Text,Summary,Score
0,this has to be one of the best teas I have eve...,the best tea ever... freah bright clean.,5
1,Perfect size sea salt for the table or the pic...,Great tasting sea salt WITH iodine.,5
2,"This is an great product. The taste is great, ...",Marley's Mellow Mood Lite - Half Tea Half Lemo...,5
3,"I like these better than the regular Altoids, ...","Nice little mints, but pricey.",4
4,"Fresh,a great way to get a little chocolate in...",OMG best chocolate jelly belly.,5
...,...,...,...
13068,Found these on amazon warehouse sale at a good...,Very tasty - but not my fave starbucks flavor.,5
13069,"Coffee tastes as good as from Starbucks, k-cup...","Awesome coffee,but 38 bucks at Sam's.",5
13070,These little nuggets are so good. I cry becaus...,Quality Street is at it's best.,5
13071,Really enjoyed these and whats great is that t...,The best kipper snacks around....,5


In [5]:
reviews_dataset = Dataset.from_pandas(reviews.sample(100).reset_index(drop=True))
reviews_dataset

Dataset({
    features: ['Text', 'Summary', 'Score'],
    num_rows: 100
})

In [6]:
PREFIX = 'summarize: '

def preprocess_function(examples):
    inputs = [PREFIX + doc for doc in examples['Text']]
    model_inputs = base_tokenizer(inputs, max_length=512, truncation=True)
    labels = base_tokenizer(examples['Summary'], truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_reviews_dataset = reviews_dataset.map(preprocess_function, batched=True)
tokenized_reviews_dataset = tokenized_reviews_dataset.train_test_split(train_size=.9)
tokenized_reviews_dataset

Map: 100%|██████████| 100/100 [00:00<00:00, 3698.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Text', 'Summary', 'Score', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 90
    })
    test: Dataset({
        features: ['Text', 'Summary', 'Score', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

In [7]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=base_tokenizer, model=base_model
)

In [8]:
training_args = TrainingArguments(
    output_dir='./t5_summary_results',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    logging_steps=50,
    load_best_model_at_end=True,
    save_strategy='epoch'
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    # tokenizer=base_tokenizer,
    train_dataset=tokenized_reviews_dataset['train'],
    eval_dataset=tokenized_reviews_dataset['test']
)

# trainer.evaluate()

In [9]:
# Check if CUDA is available
def show_cuda_space_info():
    if torch.cuda.is_available():
        # Get the CUDA device name
        device = torch.device("cuda")
        print("Using device:", torch.cuda.get_device_name(device))

        # Memory allocation and caching are dynamic in PyTorch, but you can get approximate memory usage as follows
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        cached_memory = torch.cuda.memory_reserved(device)
        free_memory = total_memory - (allocated_memory + cached_memory)

        print(f"Total memory: {total_memory / 1e9:.2f} GB")
        print(f"Allocated memory: {allocated_memory / 1e9:.2f} GB")
        print(f"Cached memory: {cached_memory / 1e9:.2f} GB")
        print(f"Free memory: {free_memory / 1e9:.2f} GB")
    else:
        print("CUDA is not available.")
show_cuda_space_info()
print('------------------')
torch.cuda.empty_cache()
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.89 GB
Cached memory: 0.98 GB
Free memory: 4.57 GB
------------------
Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.89 GB
Cached memory: 0.98 GB
Free memory: 4.57 GB


In [10]:
trainer.train()
# we can freeze some layers to avoid memory issue

  7%|▋         | 1/15 [00:02<00:38,  2.77s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 5.03 GiB is allocated by PyTorch, and 294.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF