Copyright (c) Microsoft Corporation.  
Licensed under the MIT License.

# Abstractive Summarization using UniLM on CNN/DailyMail

In [None]:
%load_ext autoreload
%autoreload 2
import time
from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg
from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer

In [None]:
# Model identifier passed as model_name to both S2SAbsSumProcessor and
# S2SAbstractiveSummarizer in the cells below.
MODEL_NAME = "unilm-large-cased"
# Value handed to abs_summarizer.fit() as per_gpu_batch_size.
PER_GPU_BATCH_SIZE = 1

## Load the CNN/DailyMail dataset

In [None]:
# Load the train and test splits and report how long the load took.
# top_n=8 is passed to the loader; presumably this caps each split at a few
# examples for a quick run -- confirm against CNNDMSummarizationDatasetOrg.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way a difference of time.time() values can.
start = time.perf_counter()
train_ds, test_ds = CNNDMSummarizationDatasetOrg(top_n=8)
print(f"Dataset loaded in {time.perf_counter() - start:.1f} s")

In [None]:
# Show the number of examples in each split: training first, then test.
for split in (train_ds, test_ds):
    print(len(split))

## Preprocessing

In [None]:
# Create the processor for the chosen model; the next cell uses it to convert
# train_ds and test_ds into train_dataset and test_dataset.
processor = S2SAbsSumProcessor(model_name=MODEL_NAME)

In [None]:
# Convert the loaded splits into the datasets consumed by fit() and predict().
# load_cached_features=False is passed for the training split (presumably this
# forces features to be recomputed instead of read from a cache -- confirm
# against S2SAbsSumProcessor.train_dataset_from_sum_ds).
train_dataset = processor.train_dataset_from_sum_ds(train_ds, load_cached_features=False)
test_dataset = processor.test_dataset_from_sum_ds(test_ds)

## Fine-tune the model

In [None]:
# Location of the model weights file to start from.
# NOTE: this directory is machine-specific; point it at your own copy.
MODEL_DIR = "/home/hlu/notebooks/unilm/"
MODEL_FILE_NAME = "unilmv1-large-cased.bin"

# Alternatives to the configuration above:
#   * pass only model_name=MODEL_NAME and drop load_model_from_dir and
#     model_file_name to build the summarizer without a local weights file;
#   * set MODEL_FILE_NAME = "cnndm_model.bin" to load that file from MODEL_DIR
#     instead (presumably an already fine-tuned checkpoint -- confirm).
abs_summarizer = S2SAbstractiveSummarizer(
    model_name=MODEL_NAME,
    load_model_from_dir=MODEL_DIR,
    model_file_name=MODEL_FILE_NAME,
)

In [None]:
# Fine-tune the summarizer on the processed training set using the batch size
# configured at the top of the notebook.
# save_model=False is passed, so presumably no checkpoint is written after
# training -- confirm against S2SAbstractiveSummarizer.fit.
abs_summarizer.fit(
    train_dataset=train_dataset,
    per_gpu_batch_size=PER_GPU_BATCH_SIZE,
    save_model=False
)

## Generate summaries on the test dataset

In [None]:
# Run the summarizer over the processed test set; `res` is iterated and printed
# item by item in the next cell.
res = abs_summarizer.predict(test_dataset=test_dataset)

In [None]:
# Print every item returned by predict(), one per line.
for summary in res:
    print(summary)