In [1]:
# Install the notebook's dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `!pip` shells out and may resolve
# to a different Python installation).
%pip install transformers datasets evaluate rouge_score sacremoses

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
#Dataset
import torch
from transformers import TransfoXLTokenizer
from datasets import load_dataset
import re

# Fetch the two summarization corpora through Hugging Face `datasets`:
# CNN/Daily Mail (config '3.0.0') for news articles and BookSum for book chapters.
cnn_dataset = load_dataset(path='cnn_dailymail', name='3.0.0')
booksum = load_dataset(path="kmfoda/booksum")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset cnn_dailymail (/Users/adamiao/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00, 46.82it/s]
Found cached dataset csv (/Users/adamiao/.cache/huggingface/datasets/kmfoda___csv/kmfoda--booksum-ae5085e4c62b30f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 274.47it/s]


In [3]:
# Split the CNN/Daily Mail data.
# 20% of the original training split is held out as a validation set. The seed
# is fixed so that a fresh "Restart & Run All" reproduces the same train/val
# partition; without it the shuffle differs on every run.
# NOTE(review): the loaded DatasetDict may already provide its own validation
# split -- confirm that carving one out of 'train' is intended.
cnn_dataset_split = cnn_dataset['train'].train_test_split(test_size=0.2, seed=42)
cnn_dataset_test = cnn_dataset['test']
cnn_dataset_train = cnn_dataset_split['train']
cnn_dataset_val = cnn_dataset_split['test']

In [4]:
# Split the BookSum data: hold out 20% of the original training split as a
# validation set. The seed is fixed so the train/val partition is identical
# across kernel restarts instead of being reshuffled on every run.
# NOTE(review): the loaded DatasetDict may already provide its own validation
# split -- confirm that carving one out of 'train' is intended.
booksum_split = booksum['train'].train_test_split(test_size=0.2, seed=42)
booksum_test = booksum['test']
booksum_train = booksum_split['train']
booksum_val = booksum_split['test']

In [5]:
# Display the first BookSum training record to inspect the fields each example carries.
booksum_train[0]

{'bid': 2500,
 'is_aggregate': False,
 'source': 'pinkmonkey',
 'chapter_path': 'all_chapterized_books/2500-chapters/8.txt',
 'summary_path': 'finished_summaries/pinkmonkey/Siddhartha/section_7_part_0.txt',
 'book_id': 'Siddhartha.part 2.chapter 8',
 'summary_id': 'chapter 8: by the river',
 'content': None,
 'summary': '{"name": "Chapter 8: By the River", "url": "https://web.archive.org/web/20180820034609/http://www.pinkmonkey.com/booknotes/monkeynotes/pmSiddhartha20.asp", "summary": "Siddhartha wanders into the forest far away from the town. He feels disgust for the life he has led. He reaches the long river which the ferryman had helped him to cross when he had returned from Gotama\'s grove. Weakened by fatigue and hunger, he stops on the banks of this river and wishes for an end to his life. Then, from a remote corner of his soul, Siddhartha hears a sound. It one syllable -- the sound of the holy Om. When it reaches his ears, his slumbering soul suddenly awakens. Siddhartha sinks i

In [6]:
# Display the first CNN/Daily Mail training record to inspect the fields each example carries.
cnn_dataset_train[0]

{'article': "Johan Cruyff has reiterated his view that new Manchester United manager Louis van Gaal has a 'militaristic' approach to tactics, while he likes his teams to keep the ball. Cruyff, a footballing icon as both a player and a coach, is considered the founding father of Barcelona's attacking, possession-based, tiki-taka football. The Dutch pair famously do not get on well and in an interview with the Guardian's Donald McRae, Cruyff has once again highlighted the contrast between their philosophies. Johan Cruyff says Louis van Gaal (right) has a militaristic approach to tactics and the pair are nothing alike . The Dutch Johan Cruyff legend introduced 'tiki-taka' football to Barcelona as manager in the 90s . Asked if he had a similar view on the game to Van Gaal, Cruyff said: 'No. Not much. Of course we’re both Dutch and that is always a [shared] basis. But I always think of being in charge of the speed and of the ball. 'Maybe he knows more than me but I always want control of th

In [7]:
# Load the pre-trained Transformer-XL tokenizer and cap its sequence length at
# 1024 tokens. `model_max_length` is the tokenizer option that records this
# limit; the previously used `max_length=` keyword is a per-call truncation
# argument, not a `from_pretrained` option, so it did not configure the limit.
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103', model_max_length=1024)

# Text cleaning function
def text_process(text):
    """Strip URLs and stray symbols from ``text`` and return the cleaned string.

    Cleaning happens in two regex passes:

    1. every ``http://`` / ``https://`` URL (up to the next whitespace) is dropped;
    2. every character that is not a word character, whitespace, or one of the
       four punctuation marks ``. ? , !`` is removed.
    """
    url_free = re.compile(r'https?://\S+').sub('', text)
    allowed_only = re.compile(r'[^\w\s\.\?\,\!]').sub('', url_free)
    return allowed_only

# Data processing functions
def _tokenize_pairs(examples, source_column, target_column,
                    max_source_length=1024, max_target_length=128):
    """Tokenize one batch of (document, summary) pairs.

    Shared implementation behind the dataset-specific wrappers below, which
    previously duplicated this logic and differed only in column names.

    Args:
        examples: Batch mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.
        source_column: Column holding the documents to summarize. Each document
            is cleaned with ``text_process`` before tokenization.
        target_column: Column holding the reference summaries. These are
            tokenized as-is (no cleaning), matching the original behavior.
        max_source_length: Truncation length, in tokens, for the documents.
        max_target_length: Truncation length, in tokens, for the summaries.

    Returns:
        The tokenizer output for the documents, with the summaries' token ids
        stored under the ``"labels"`` key.
    """
    cleaned_sources = [text_process(doc) for doc in examples[source_column]]
    model_inputs = tokenizer(
        cleaned_sources,
        max_length=max_source_length,
        truncation=True,
        return_attention_mask=True,
    )
    # `text_target=` tells the tokenizer these strings are targets, not inputs.
    labels = tokenizer(
        text_target=examples[target_column],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_function_cnn(examples):
    """Tokenize a CNN/Daily Mail batch: ``article`` -> inputs, ``highlights`` -> labels."""
    return _tokenize_pairs(examples, "article", "highlights")


def preprocess_function_book(examples):
    """Tokenize a BookSum batch: ``chapter`` -> inputs, ``summary_text`` -> labels."""
    return _tokenize_pairs(examples, "chapter", "summary_text")

In [8]:
# Tokenize every CNN/Daily Mail split in batches (train, then validation, then test).
tokenized_cnn_train = cnn_dataset_train.map(function=preprocess_function_cnn, batched=True)
tokenized_cnn_val = cnn_dataset_val.map(function=preprocess_function_cnn, batched=True)
tokenized_cnn_test = cnn_dataset_test.map(function=preprocess_function_cnn, batched=True)

                                                                    

In [None]:
# Tokenize every BookSum split in batches (train, then validation, then test).
tokenized_book_train = booksum_train.map(function=preprocess_function_book, batched=True)
tokenized_book_val = booksum_val.map(function=preprocess_function_book, batched=True)
tokenized_book_test = booksum_test.map(function=preprocess_function_book, batched=True)

Map:   0%|          | 0/7680 [00:00<?, ? examples/s]

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/1431 [00:00<?, ? examples/s]

In [9]:
# Persist the tokenized CNN/Daily Mail splits as .pt files in the working directory.
torch.save(obj=tokenized_cnn_train, f='cnn_train_dataset.pt')
torch.save(obj=tokenized_cnn_val, f='cnn_val_dataset.pt')
torch.save(obj=tokenized_cnn_test, f='cnn_test_dataset.pt')

In [None]:
# Persist the tokenized BookSum splits as .pt files in the working directory.
torch.save(obj=tokenized_book_train, f='book_train_dataset.pt')
torch.save(obj=tokenized_book_val, f='book_val_dataset.pt')
torch.save(obj=tokenized_book_test, f='book_test_dataset.pt')