In [2]:
from datasets import load_dataset
from transformers import pipeline
import transformers as trf

import re  #preprocessing
import pandas as pd  #data handling
from time import time  #time-to-run
from collections import defaultdict  #word frequency

import spacy  #preprocessing/tokenization

import logging  #gensim logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.feature_extraction import text as sktext

import scipy as sp
import numpy as np

import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import gensim
from gensim.models import Word2Vec, Doc2Vec
from joblib import dump, load

In [3]:
from datasets import load_dataset

# Training split (currently disabled):
# dataset = load_dataset("cnn_dailymail", "3.0.0", split='train')
# dataset = dataset.shard(num_shards=10, index=0)
# dataset = dataset.remove_columns("id")

# Test split: keep the first of ten shards and drop the "id" column.
test_dataset = (
    load_dataset("cnn_dailymail", "3.0.0", split='test')
    .shard(num_shards=10, index=0)
    .remove_columns("id")
)



In [4]:
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

# ROUGE metric object; compute_metrics and the per-sample checks below call rouge.compute().
rouge = evaluate.load("rouge")

In [5]:
# Path handed to from_pretrained() for both the tokenizer and the model
# (presumably a local T5-small checkpoint directory — confirm it exists before running).
MODEL_PATH = "./t5-small/"

In [6]:
# Tokenizer loaded from MODEL_PATH; shared by preprocessing, metric decoding and generate().
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [7]:
# Collator built from the shared tokenizer.
# NOTE(review): `model` receives the checkpoint path string here, not the model object
# that is loaded further down — confirm this is what the collator is meant to receive.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=MODEL_PATH)

Sources (Hugging Face summarization task guide and data collator reference):
- https://huggingface.co/docs/transformers/tasks/summarization
- https://huggingface.co/docs/transformers/v4.27.2/en/main_classes/data_collator

In [69]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles and their reference highlights.

    Args:
        examples: batch mapping with an "article" list and a "highlights" list
            (the shape produced by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed articles, with an added "labels"
        entry holding the token ids of the highlights.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=2048, truncation=True)

    # Do not request tensors here: the highlights are truncated but not padded,
    # so a batch has rows of different lengths and cannot be stacked into one
    # tensor. Plain lists are returned instead; padding is left to the collator.
    labels = tokenizer(text_target=examples["highlights"], max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays
            (presumably supplied by the trainer during evaluation — confirm).

    Returns:
        Dict with every key returned by ``rouge.compute`` plus "gen_len"
        (mean number of non-pad tokens per prediction), all rounded to
        4 decimals.
    """
    predictions, labels = eval_pred
    # Some setups hand back a tuple whose first item holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Swap it for the pad token in predictions as well as labels;
    # this also keeps padded positions out of the gen_len count below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Sources (Hugging Face Trainer tutorial and summarization example notebook):
- https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer
- https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb

In [10]:
# The training split is commented out in the loading cell, so `dataset` does not
# exist on a fresh kernel. Tokenize the test shard that is actually loaded and
# bind it to the name the next cell displays.
# tokenized_dataset = dataset.map(preprocess_function, batched=True)
test_tokenized_dataset = test_dataset.map(preprocess_function, batched=True)



In [61]:
# Display the tokenized dataset summary (column names and row count).
test_tokenized_dataset

Dataset({
    features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1149
})

In [9]:
# Seq2seq model loaded from MODEL_PATH; generate() below uses this global `model`.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

---

In [10]:
# Sanity check: tokenize a short string and display the resulting encoding.
sample_tokens = tokenizer("Hello world")
sample_tokens

{'input_ids': [8774, 296, 1], 'attention_mask': [1, 1, 1]}

In [11]:
# Round-trip the ids back to text; the output shows the end-of-sequence marker the tokenizer appended.
tokenizer.decode(sample_tokens.input_ids)

'Hello world</s>'

---

In [12]:
def pretty(string, line_len = 120):
    """Print `string` broken into consecutive chunks of `line_len` characters.

    The split is purely positional (words may be cut in the middle); the last
    chunk holds whatever remains. An empty string prints a single empty line.
    """
    chunks = []
    for start in range(0, len(string), line_len):
        chunks.append(string[start:start + line_len])
    print("\n".join(chunks))

In [13]:
def generate(str, max_len=120):
    """Summarize one text with the global `model` and `tokenizer`.

    Args:
        str: text to summarize; "summarize: " is prepended before tokenization.
            (The name shadows the builtin but is kept for existing callers.)
        max_len: passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    encoded = tokenizer("summarize: " + str, return_tensors='pt')
    output_ids = model.generate(**encoded, max_new_tokens=max_len)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

In [14]:
# Show test record 1: the article, the generated summary, then the reference highlights.
example = test_dataset[1]
pretty(example['article'])
print()
pretty(generate(example['article']), 120)
print("-")
print(example['highlights'])

London (CNN)A 19-year-old man was charged Wednesday with terror offenses after he was arrested as he returned to Britain
 from Turkey, London's Metropolitan Police said. Yahya Rashid, a UK national from northwest London, was detained at Luto
n airport on Tuesday after he arrived on a flight from Istanbul, police said. He's been charged with engaging in conduct
 in preparation of acts of terrorism, and with engaging in conduct with the intention of assisting others to commit acts
 of terrorism. Both charges relate to the period between November 1 and March 31. Rashid is due to appear in Westminster
 Magistrates' Court on Wednesday, police said. CNN's Lindsay Isaac contributed to this report.

19-year-old arrested at Luton airport on thursday. he was returning from turkey to uk.
-
London's Metropolitan Police say the man was arrested at Luton airport after landing on a flight from Istanbul .
He's been charged with terror offenses allegedly committed since the start of November .


In [26]:
# ROUGE of the generated summary against the reference highlights for test record 1.
example = test_dataset[1]
rouge.compute(
    predictions=[generate(example['article'])],
    references=[example['highlights']],
)

INFO - 23:20:19: Using default tokenizer.


{'rouge1': 0.326530612244898,
 'rouge2': 0.12765957446808512,
 'rougeL': 0.24489795918367344,
 'rougeLsum': 0.2857142857142857}

In [21]:
# Show test record 2: the article, the generated summary, then the reference highlights.
example = test_dataset[2]
pretty(example['article'])
print()
pretty(generate(example['article']), 120)
print("-")
print(example['highlights'])

Norfolk, Virginia (CNN)The second mate of the Houston Express probably couldn't believe what he was seeing. Hundreds of 
miles from land there was a small boat nearby. At first it looked abandoned. It was in bad shape, listing to one side. T
he crew of the 1,000-foot long container ship thought it was a yacht that had wrecked. Incredibly, as they got closer, t
hey saw there was a man on it, signaling for help. "He was moving, walking around, waving to us and in surprisingly good
 condition," Capt. Thomas Grenz told CNN by phone Friday. That man, Louis Jordan, 37, had an amazing story. He'd been dr
ifting on the 35-foot Pearson sailboat for more than two months since leaving Conway, South Carolina, to fish in the oce
an. Just a few days into his trip, a storm capsized his boat and broke his mast. One of his shoulders was broken, too, s
o he couldn't fix the boat right away. Eventually he was able to rig a makeshift mast and sail, but he could make little
 headway against the currents. "

In [27]:
# ROUGE of the generated summary against the reference highlights for test record 2.
example = test_dataset[2]
rouge.compute(
    predictions=[generate(example['article'])],
    references=[example['highlights']],
)

INFO - 23:20:39: Using default tokenizer.


{'rouge1': 0.15873015873015875,
 'rouge2': 0.0,
 'rougeL': 0.09523809523809525,
 'rougeLsum': 0.12698412698412698}

In [28]:
# Show test record 123: the article, the generated summary, then the reference highlights.
example = test_dataset[123]
pretty(example['article'])
print()
pretty(generate(example['article']), 120)
print("-")
print(example['highlights'])

A pro-Ukip businessman who forced Liam Fox to resign as Defence Secretary is behind an extraordinary bid to defeat a ‘go
lden boy’ Tory candidate. Tycoon Harvey Boulter has already given £30,000 to the Ukip candidate fighting Fox in his Some
rset seat in a continued vendetta against him. Boulter is also spending another £30,000 to try to defeat Tom Tugendhat, 
who is standing for the Tories in Tonbridge and West Malling, Kent. Scroll down for video . Pro-Ukip businessman Harvey 
Boulter (above), who forced Liam Fox to resign as Defence Secretary, is behind an extraordinary bid to defeat a ‘golden 
boy’ Tory candidate . Tugendhat’s father, Sir Michael Tugendhat, is the High Court judge who awarded Fox a six-figure su
m in damages after he sued Boulter for libellous remarks made on Sky TV. Fox resigned from the Cabinet in 2011 after Bou
lter revealed details of how Fox’s close friend, businessman Adam Werritty, accompanied him on overseas trips as Defence
 Secretary. Mr Boulter said: ‘Th

In [29]:
# ROUGE of the generated summary against the reference highlights for test record 123.
example = test_dataset[123]
rouge.compute(
    predictions=[generate(example['article'])],
    references=[example['highlights']],
)

INFO - 23:22:27: Using default tokenizer.


{'rouge1': 0.2807017543859649,
 'rouge2': 0.10909090909090909,
 'rougeL': 0.2105263157894737,
 'rougeLsum': 0.2105263157894737}

---