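# Automatic Abstractive Summarization

This notebook generates abstractive summaries for the articles in fold 01 (train, dev and test splits) of the filtered IndoSum dataset using the pretrained `cahya/bert2bert-indonesian-summarization` encoder-decoder model, and saves the results as a TSV file.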


# Installing Libraries

In [None]:
! pip install transformers # transformer libraries from huggingface.

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 34.2MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

# Importing Libraries

In [None]:
import os
import datetime
import random
import json
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, EncoderDecoderModel

# Seed every random number generator used here (Python, NumPy, PyTorch CPU and CUDA)
# with the same value so that repeated runs draw the same random numbers.
SEED = 42
for seedFn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
  seedFn(SEED)

# cuDNN configuration: request deterministic algorithms, turn off the benchmark
# auto-tuner, and disable the cuDNN backend altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

# Select the device: use the GPU when CUDA is available, otherwise the CPU.
# `device` stays a plain string; it is what later cells pass to `.to(...)`.
device = "cpu"
if torch.cuda.is_available():
  device = "cuda"
torch.device(device)  # Last expression of the cell, so the chosen device is displayed.

device(type='cuda')

# Downloading Datasets

In [None]:
# Download filtered_indosum.tar.gz.
os.system("gdown https://drive.google.com/uc?id=1RbsRMjXplaGTLMdk_vfa5b47gnZHg2y3")
# Extracting filtered_indosum.tar.gz.
! tar -xvf filtered_indosum.tar.gz
! rm filtered_indosum.tar.gz

filtered_indosum/
filtered_indosum/test.05.jsonl
filtered_indosum/test.02.jsonl
filtered_indosum/train.02.jsonl
filtered_indosum/train.04.jsonl
filtered_indosum/train.05.jsonl
filtered_indosum/test.03.jsonl
filtered_indosum/train.01.jsonl
filtered_indosum/dev.04.jsonl
filtered_indosum/test.01.jsonl
filtered_indosum/dev.05.jsonl
filtered_indosum/dev.01.jsonl
filtered_indosum/dev.02.jsonl
filtered_indosum/dev.03.jsonl
filtered_indosum/test.04.jsonl
filtered_indosum/train.03.jsonl


# Bert2Bert Summarization

In [None]:
# Checkpoint on the Hugging Face Hub that provides both the tokenizer and the model.
modelName = "cahya/bert2bert-indonesian-summarization"

# Loading the tokenizer. Its CLS and SEP tokens are reused as the
# beginning-of-sequence and end-of-sequence tokens respectively.
tokenizer = BertTokenizer.from_pretrained(modelName)
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

# Loading the summarization model and moving it onto the selected device
# (the GPU if one is available).
model = EncoderDecoderModel.from_pretrained(modelName).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229513.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=62.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4261.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1079069947.0, style=ProgressStyle(descr…




In [None]:
# Summarize every article of IndoSum fold 01 (train, dev and test splits) and store
# the generated summaries, keyed by article id, in a TSV file.
outputPath = "drive/MyDrive/summary-bert2bert.tsv"
saveEvery = 1000  # Checkpoint interval, in articles.


def saveSummaries(records, path):
  """Write the summaries collected so far to `path` as a tab-separated file.

  Args:
    records: list of dicts, each with an "id" and a "summary" entry.
    path: destination file; it is overwritten on every call.

  Returns:
    The pandas DataFrame that was written.
  """
  frame = pd.DataFrame(records)
  frame.to_csv(path, sep="\t", index=False)
  return frame


# Fail fast: without this check a missing output directory is only discovered at the
# first checkpoint, i.e. after `saveEvery` summaries have already been generated.
outputDir = os.path.dirname(outputPath)
if not os.path.isdir(outputDir):
  raise FileNotFoundError(
      f"Output directory '{outputDir}' does not exist. "
      "Mount Google Drive (or create the directory) before starting the run."
  )

startTime = datetime.datetime.now()
print("Started at", startTime.strftime("%Y-%m-%d %H:%M:%S"))

count = 0
summaries = []
for split in ["train", "dev", "test"]: # Loop over the dataset splits (avoids shadowing the builtin `type`).
  # Read the split explicitly as UTF-8 so decoding does not depend on the platform default.
  with open(f"filtered_indosum/{split}.01.jsonl", encoding="utf-8") as lines:
    for line in lines: # Loop over the articles. One line, one article.
      if not line.strip(): # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
      jsonObj = json.loads(line) # Parse the JSON line into a Python object.

      # Flatten the article (paragraphs -> sentences -> tokens) into one list of tokens.
      article = [token
                 for paragraph in jsonObj["paragraphs"]
                 for sentence in paragraph
                 for token in sentence]

      # Encoding the data. The input is already tokenized into words and is
      # truncated to at most 512 tokens.
      inputIds = tokenizer.encode(article, is_split_into_words=True, return_tensors='pt', max_length=512, truncation=True)
      inputIds = inputIds.to(device)

      # Generating a summary with beam search (10 beams) combined with sampling.
      summaryIds = model.generate(inputIds,
                                  min_length=60,
                                  max_length=120,
                                  num_beams=10,
                                  early_stopping=True,
                                  no_repeat_ngram_size=2,
                                  do_sample=True,
                                  temperature=0.8,
                                  top_k=50,
                                  top_p=0.95,
                                  repetition_penalty=2.5,
                                  length_penalty=1.0,
                                  use_cache=True,
                                  )

      # Decoding the data. One article is encoded per call, so take the first
      # (and only) generated sequence rather than squeezing every dimension.
      summary = tokenizer.decode(summaryIds[0], skip_special_tokens=True)
      summaryDict = {"id":jsonObj["id"], "summary":summary}
      summaries.append(summaryDict)

      count += 1
      if count % saveEvery == 0: # The full run takes many hours, so checkpoint the results regularly.
        summaryDf = saveSummaries(summaries, outputPath)
        print(f"{count} articles")

# Save the result when all is done.
summaryDf = saveSummaries(summaries, outputPath)
print(f"{count} articles")

endTime = datetime.datetime.now()
print("Finished at", endTime.strftime("%Y-%m-%d %H:%M:%S"))
print("Executed in", endTime-startTime)

Started at 2021-05-26 18:52:31
1000 articles
2000 articles
3000 articles
4000 articles
5000 articles
6000 articles
7000 articles
8000 articles
9000 articles
10000 articles
11000 articles
12000 articles
13000 articles
14000 articles
15000 articles
16000 articles
17000 articles
18000 articles
18774 articles
Finished at 2021-05-27 14:05:52
Executed in 19:13:21.251455
