<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Libraries-and-Settings" data-toc-modified-id="Import-Libraries-and-Settings-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Libraries and Settings</a></span></li><li><span><a href="#Transfer-Learning-Summarization-on-bbc_sports" data-toc-modified-id="Transfer-Learning-Summarization-on-bbc_sports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Transfer Learning Summarization on <code>bbc_sports</code></a></span></li></ul></div>

# Testing T5 Transfer Learning Summarization on BBC Sports

In [None]:
!pip install transformers==4.2.0

In [None]:
!pip install torch

## Import Libraries and Settings

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

In [2]:
# Load the tokenizer that belongs to the pre-trained "t5-base" checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [3]:
# Load the pre-trained "t5-base" checkpoint with its sequence-to-sequence LM head.
# AutoModelForSeq2SeqLM is the task-specific replacement for the deprecated
# AutoModelWithLMHead; for a T5 checkpoint both resolve to the same
# encoder-decoder model class. return_dict=True makes forward passes return
# named output objects instead of plain tuples.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True)



## Transfer Learning Summarization on `bbc_sports`

In [None]:
# Read the BBC Sports articles from CSV into a DataFrame
bbc_sports = pd.read_csv("/data/workspace_files/bbc_sports.csv")
# Print the (rows, columns) shape, then preview the first rows as the cell output
print(bbc_sports.shape)
bbc_sports.head()

In [None]:
# Empty placeholder for the generated T5 summaries; the summarization loop
# further down is what populates `t5_summaries`
t5_summaries = np.array([])

In [None]:
# Generate one T5 summary per article in bbc_sports["contents"].
# The results are collected in a fresh local list and converted to an array
# once at the end, so re-running this cell never appends to leftover state and
# the array is not re-copied on every iteration.
summaries = []
for txt in bbc_sports["contents"]:

    # Tokenize and tensorize the text.
    # T5 selects its task from a text prefix, in our case "summarize: ".
    # The prefix must be part of the SAME string as the article: passing it as
    # a separate positional argument makes the tokenizer treat the article as
    # `text_pair` and encode the two as a sequence pair, separated by an
    # end-of-sequence token.
    # Inputs longer than 512 tokens are truncated.
    inputs = tokenizer.encode("summarize: " + txt, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary with a 2-beam search.
    # min_length / max_length are measured in tokens (not words): 18 <= summary <= 150
    outputs = model.generate(inputs, max_length=150, min_length=18, length_penalty=5, num_beams=2)

    # Convert the generated token IDs back to text, dropping special tokens
    # such as the padding / end-of-sequence markers so only the summary remains
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Collect the result
    summaries.append(summary)

# Publish the results under the name the following cells expect
t5_summaries = np.array(summaries)


# Runtime Total: t5-base: BBC-Sports = 1h40m

In [None]:
# Inspect the array of generated summaries
t5_summaries

In [None]:
# Attach the summaries to the DataFrame as a "summary_t5" column;
# this needs exactly one summary per row of bbc_sports
bbc_sports["summary_t5"] = t5_summaries
# Display the augmented DataFrame as the cell output
bbc_sports

In [None]:
# Export the articles together with their T5 summaries to CSV
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column by default — pass index=False if downstream readers should not see it
bbc_sports.to_csv("/data/workspace_files/bbc_sports_t5_summarized.csv")