See also: https://huggingface.co/docs/transformers/main/model_doc/t5

## Load Libraries

In [1]:
!pip install simplet5

Collecting simplet5
  Downloading simplet5-0.1.4.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from simplet5)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.16.2 (from simplet5)
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.5.10 (from simplet5)
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.4.1 (from pytorch-lightning==1.5.10->simplet5)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from simplet5 import SimpleT5


INFO:pytorch_lightning.utilities.seed:Global seed set to 42


## Load Data

In [3]:
def getSampleData(
  path="https://raw.githubusercontent.com/Shivanandroy/T5-Finetuning-PyTorch/main/data/news_summary.csv",
):
  """Load a news-summary CSV and shape it into SimpleT5's two-column training format.

  Args:
    path: Anything `pd.read_csv` accepts (URL, local file path, or file-like
      object). Defaults to the hosted `news_summary.csv` sample. The CSV must
      contain a `text` column (article body) and a `headlines` column (summary).

  Returns:
    A new DataFrame with exactly two columns, in this order:
    `source_text` ("summarize: " prepended to `text`) and `target_text`
    (the original `headlines`).

  Raises:
    KeyError: if the CSV lacks the `text` or `headlines` column.
  """
  raw = pd.read_csv(path)

  # simpleT5 expects dataframe to have 2 columns: "source_text" and "target_text"
  renamed = raw.rename(columns={"headlines": "target_text", "text": "source_text"})
  # Explicit copy: assigning a column on a column-subset frame can otherwise
  # trigger pandas' SettingWithCopyWarning.
  pairs = renamed[["source_text", "target_text"]].copy()

  # T5 model expects a task related prefix: since it is a summarization task, we will add a prefix "summarize: "
  pairs["source_text"] = "summarize: " + pairs["source_text"]
  return pairs

In [4]:
# Build the two-column (source_text, target_text) frame that the split and
# training cells below consume.
df = getSampleData()


## Split Data

In [5]:
# Hold out 20% of the rows; random_state is fixed so the split is the same on
# every Restart & Run All instead of depending on the global RNG state.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

## Create Model

In [6]:
# Settings for the fine-tuning run, gathered in one place so they are easy to tune.
train_config = dict(
    source_max_token_len=128,
    target_max_token_len=50,
    batch_size=8,
    max_epochs=3,
    use_gpu=True,
)

# Start from the pretrained t5-base weights, then fine-tune on the training
# split while evaluating on the held-out split.
model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")
model.train(train_df=train_df, eval_df=test_df, **train_config)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [7]:
# Load a T5 model from the "outputs/epoch-2" directory with use_gpu=True.
# NOTE(review): presumably this is a checkpoint written by the training cell;
# the ONNX cell further down refers to "outputs/SimpleT5-epoch-2-train-loss-0.9526"
# instead, so confirm this directory actually exists after a fresh run.
model.load_model("t5","outputs/epoch-2", use_gpu=True)

In [10]:
# Sample article to summarize. It starts with the "summarize:" task prefix,
# mirroring the "summarize: " prefix that getSampleData() prepends to the
# training rows.
text_to_summarize="""summarize:
Lawyers for Alexey Navalny said Monday they have lost contact with the jailed Russian opposition leader, who was believed to be imprisoned in a penal colony about 150 miles east of Moscow, and his whereabouts are unknown.

Navalny was sentenced to 19 years in prison in August, after he was found guilty of creating an extremist community, financing extremist activities and numerous other crimes. He was already serving sentences of 11-and-a-half years in a maximum security facility on fraud and other charges he denies.

Supporters of Navalny claim his arrest and incarceration are a politically motivated attempt to stifle his criticism of Russian President Vladimir Putin.
"""
# The prediction is the cell's last expression, so it is displayed as the cell output.
model.predict(text_to_summarize)

['Lawyers lose contact with jailed Russian opposition leader']

## ONNX

In [None]:
# Optional ONNX conversion + inference, intentionally left disabled.
# NOTE(review): model_dir below ("outputs/SimpleT5-epoch-2-train-loss-0.9526")
# differs from the directory loaded earlier ("outputs/epoch-2") — confirm which
# one exists before enabling these lines.
#model.convert_and_load_onnx_model(model_dir="outputs/SimpleT5-epoch-2-train-loss-0.9526")
#model.onnx_predict(text_to_summarize)