### Importing Libraries

In [1]:
%%time
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

from summarizer import Summarizer,TransformerSummarizer
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

CPU times: total: 3.08 s
Wall time: 6.7 s


### Data 

In [2]:
# Window of articles to work with: `count` rows starting at position `from_i`.
from_i = 10
count = 5

# Read only the two columns that are used, then cut the window once so the
# headlines and the texts are guaranteed to come from the same rows.
news_raw = pd.read_csv('news_summary.csv', encoding='latin-1', usecols=['headlines', 'text'])
news_window = news_raw.iloc[from_i:from_i + count]

# One headline per selected article, as a plain list.
headlines = news_window['headlines'].to_list()

# Single-column frame of article texts, re-indexed from 0.
df = news_window[['text']].reset_index(drop=True)
df

Unnamed: 0,text
0,India's food regulator Food Safety and Standar...
1,"The mother of Harshit Sharma, the class 12 Cha..."
2,Municipal Corporation of Gurugram on Wednesday...
3,"Scientists, for the first time, successfully f..."
4,A Union Minister of State for Home Affairs inf...


In [3]:
# Print the row count, column dtype and non-null count of the sliced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5 non-null      object
dtypes: object(1)
memory usage: 168.0+ bytes


In [4]:
# Preview the article texts (head(10) returns every row when there are fewer than 10).
df['text'].head(10)

0    India's food regulator Food Safety and Standar...
1    The mother of Harshit Sharma, the class 12 Cha...
2    Municipal Corporation of Gurugram on Wednesday...
3    Scientists, for the first time, successfully f...
4    A Union Minister of State for Home Affairs inf...
Name: text, dtype: object

In [5]:
# Length, in characters, of the longest article text.
df['text'].str.len().max()

388

In [6]:
# Merge consecutive articles into longer text blocks for the summarizers.
max_length = 400  # a block is closed once it holds at least this many characters
texts = df['text'].tolist()
bodies = []
i = 0
while i < len(texts):
    # Every block starts with the next unused article.
    parts = [texts[i]]
    i += 1
    # Keep appending articles (space-separated) until the block is long enough
    # or the articles run out; the final block may therefore stay shorter.
    while i < len(texts) and len(" ".join(parts)) < max_length:
        parts.append(texts[i])
        i += 1
    body = " ".join(parts)
    bodies.append(body)
    print("Length of blocks =", len(body))
print(f"\nNumber of text blocks = {len(bodies)}\n")
print("Text blocks:\n", bodies)

Length of blocks = 704
Length of blocks = 743
Length of blocks = 388

Number of text blocks = 3

Text blocks:
 ['India\'s food regulator Food Safety and Standards Authority of India (FSSAI) is planning to create a network to collect leftover food and provide it to the needy. It is looking to connect with organisations which can collect, store and distribute leftover food from weddings and large parties. It further added that all food must meet the safety and hygiene standards. The mother of Harshit Sharma, the class 12 Chandigarh boy who got a hoax job offer call from Google, said that the incident "devastated" his life. He got a call, after which he shared the information with the school principal, who sent out a press release. Harshit is hospitalised since Google denied giving him a job, his mother added.', 'Municipal Corporation of Gurugram on Wednesday said that 19 out of 45 commercial building owners have decided to pay property tax instead of providing free parking to the public.

### Text Summarization

In [7]:
# Passed as `min_length` to every summarizer call in the cells below.
# NOTE(review): presumably the minimum sentence length (in characters) the
# summarizer will consider — confirm against the summarizer documentation.
min_length_text = 40

#### BERT Summarizer

In [8]:
%%time
# Build the BERT summarizer once and reuse it for every block. Constructing it
# inside the loop repeated the expensive model load for each block.
bert_model = Summarizer()

# One summary string per text block, in the same order as `bodies`.
bert_summary = []
for body in bodies:
    # ''.join(...) leaves a str unchanged and flattens a list of strings.
    bert_summary.append(''.join(bert_model(body, min_length=min_length_text)))

100%|█████████████████████████████████████████████████████████████████████████████████████████| 434/434 [00:00<?, ?B/s]
100%|█████████████████████████████████████████████████████████████| 1344997306/1344997306 [02:12<00:00, 10114614.14B/s]
100%|███████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 356748.27B/s]


CPU times: total: 45 s
Wall time: 2min 39s


#### GPT-2 Summarizing

In [9]:
%%time
# Build the GPT-2 summarizer once and reuse it for every block. Constructing it
# inside the loop repeated the expensive model load for each block.
GPT2_model = TransformerSummarizer(transformer_type="GPT2", transformer_model_key="gpt2-medium")

# One summary string per text block, in the same order as `bodies`.
gpt_summary = []
for body in bodies:
    # ''.join(...) leaves a str unchanged and flattens a list of strings.
    gpt_summary.append(''.join(GPT2_model(body, min_length=min_length_text)))

100%|█████████████████████████████████████████████████████████████████████████████| 718/718 [00:00<00:00, 680875.03B/s]
100%|██████████████████████████████████████████████████████████████| 1520013706/1520013706 [02:59<00:00, 8462228.97B/s]
100%|█████████████████████████████████████████████████████████████████████| 1042301/1042301 [00:01<00:00, 997625.87B/s]
100%|███████████████████████████████████████████████████████████████████████| 456318/456318 [00:00<00:00, 557732.12B/s]


CPU times: total: 1min 27s
Wall time: 3min 35s


#### XLNet Summarizing

In [10]:
%%time
# Build the XLNet summarizer once and reuse it for every block. Constructing it
# inside the loop repeated the expensive model load for each block.
model = TransformerSummarizer(transformer_type="XLNet", transformer_model_key="xlnet-base-cased")

# One summary string per text block, in the same order as `bodies`.
xlnet_summary = []
for body in bodies:
    # ''.join(...) leaves a str unchanged and flattens a list of strings.
    xlnet_summary.append(''.join(model(body, min_length=min_length_text)))

100%|█████████████████████████████████████████████████████████████████████████████████████████| 760/760 [00:00<?, ?B/s]
100%|███████████████████████████████████████████████████████████████| 467042463/467042463 [00:44<00:00, 10390176.69B/s]
100%|███████████████████████████████████████████████████████████████████████| 798011/798011 [00:01<00:00, 782457.38B/s]


CPU times: total: 23.9 s
Wall time: 1min 1s


### Result

In [11]:
%%time
# Side-by-side report: for each text block, show the source text, the three
# model summaries and the human-written headline(s) for comparison.
print("All Summarizing Results:\n")
for i in range(len(bodies)):
    print("ORIGINAL TEXT:")
    print(bodies[i])
    print("\nBERT Summarizing Result:")
    print(bert_summary[i])
    print("\nGPT-2 Summarizing Result:")
    print(gpt_summary[i])
    print("\nXLNet Summarizing Result:")
    print(xlnet_summary[i])
    print("\nOriginal headline:")
    # `headlines` has one entry per article, while `bodies` has one entry per
    # merged block, so headlines[i] does not line up with bodies[i]. Print the
    # headline of every article whose text is part of this block instead.
    for headline, text in zip(headlines, df['text']):
        if text in bodies[i]:
            print(headline)
    print("\n\n")

All Summarizing Results:

ORIGINAL TEXT:
India's food regulator Food Safety and Standards Authority of India (FSSAI) is planning to create a network to collect leftover food and provide it to the needy. It is looking to connect with organisations which can collect, store and distribute leftover food from weddings and large parties. It further added that all food must meet the safety and hygiene standards. The mother of Harshit Sharma, the class 12 Chandigarh boy who got a hoax job offer call from Google, said that the incident "devastated" his life. He got a call, after which he shared the information with the school principal, who sent out a press release. Harshit is hospitalised since Google denied giving him a job, his mother added.

BERT Summarizing Result:
India's food regulator Food Safety and Standards Authority of India (FSSAI) is planning to create a network to collect leftover food and provide it to the needy.

GPT-2 Summarizing Result:
India's food regulator Food Safety and 