Copyright (c) Microsoft Corporation.  
Licensed under the MIT License.

# Abstractive Summarization using UniLM on CNN/DailyMail

In [None]:
%load_ext autoreload
%autoreload 2
import time
from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg
from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer

In [None]:
# Set to True for a short smoke-test run: the parameter cell below then uses
# far fewer warmup and training steps (5/50 instead of 500/5000).
QUICK_RUN = False

In [None]:
# File that receives the generated test-set summaries, one per line.
OUTPUT_FILE = "./nlp_cnndm_finetuning_results_test.txt"

# --- Model parameters ---
MODEL_NAME = "unilm-large-cased"
MAX_SOURCE_SEQ_LENGTH = 640
MAX_TARGET_SEQ_LENGTH = 128
MAX_SEQ_LEN = 768

# --- Fine-tuning parameters ---
TRAIN_PER_GPU_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 3e-5
# QUICK_RUN selects a much shorter warmup/training schedule.
WARMUP_STEPS, MAX_STEPS = (5, 50) if QUICK_RUN else (500, 5000)

# --- Inference parameters ---
TEST_PER_GPU_BATCH_SIZE = 12
BEAM_SIZE = 5
FORBID_IGNORE_WORD = "."

# --- Mixed precision setting (passed to both fit and predict) ---
FP16 = False

## Load the CNN/DailyMail dataset

In [None]:
# Load the train/test splits (limited via top_n=100) and report how long the
# load took, in seconds.
load_started = time.time()
train_ds, test_ds = CNNDMSummarizationDatasetOrg(top_n=100)
load_seconds = time.time() - load_started
print(load_seconds)

In [None]:
# Show the number of examples in each split: train first, then test.
for split in (train_ds, test_ds):
    print(len(split))

## Preprocessing

In [None]:
# Processor for the selected model; the next cell uses it to turn the raw
# train/test splits into the datasets consumed by fit() and predict().
processor = S2SAbsSumProcessor(model_name=MODEL_NAME)

In [None]:
# Training features.
# NOTE(review): load_cached_features=True presumably reuses features cached by
# an earlier run -- confirm the cache matches the current top_n / model.
train_dataset = processor.train_dataset_from_sum_ds(
    train_ds,
    load_cached_features=True,
)

# Test features used for summary generation.
test_dataset = processor.test_dataset_from_sum_ds(test_ds)

## Fine-tune the model

In [None]:
# Settings shared by both ways of constructing the summarizer.
summarizer_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_len=MAX_SEQ_LEN,
    max_source_seq_length=MAX_SOURCE_SEQ_LENGTH,
    max_target_seq_length=MAX_TARGET_SEQ_LENGTH,
)

# Default: start from the pretrained model and fine-tune it in the next cell.
abs_summarizer = S2SAbstractiveSummarizer(**summarizer_kwargs)

# To load a fine-tuned model for inference only, use this instead:
# abs_summarizer = S2SAbstractiveSummarizer(
#     load_model_from_dir="./",
#     model_file_name="model.5000.bin",
#     **summarizer_kwargs,
# )


In [None]:
# Fine-tune the summarizer on the training features using the settings from
# the parameter cell.
abs_summarizer.fit(
    train_dataset=train_dataset,
    # optimisation schedule
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    max_steps=MAX_STEPS,
    # batching
    per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # precision
    fp16=FP16,
)

## Generate summaries on the test dataset

In [None]:
# Generate summaries for the test features and report the elapsed wall-clock
# time in seconds.
predict_started = time.time()
res = abs_summarizer.predict(
    test_dataset=test_dataset,
    per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,
    beam_size=BEAM_SIZE,
    forbid_ignore_word=FORBID_IGNORE_WORD,
    fp16=FP16,
)
predict_seconds = time.time() - predict_started
print(predict_seconds)

In [None]:
# Preview the first five generated summaries.
preview = res[:5]
for summary in preview:
    print(summary)

In [None]:
# Write one generated summary per line. The encoding is pinned to UTF-8 so
# non-ASCII characters in the summaries are written the same way on every
# platform, instead of depending on the locale's default encoding (which can
# raise UnicodeEncodeError for characters it cannot represent).
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in res)