## Installations

In [1]:
import zipfile
import os
import pandas as pd
import glob

In [2]:
# Specify the file path
file_path = r'C:\Users\Admin\Document-Summarization\BBCNews_archive.zip'

# Extract the ZIP file
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall()

# Specify the parent folder path
parent_folder = 'BBC News Summary'

# Specify the news articles and summaries folder paths
articles_folder = os.path.join(parent_folder, 'News Articles')
summaries_folder = os.path.join(parent_folder, 'Summaries')

# Print the list of files in the articles folder
print('News Articles:')
for file_name in os.listdir(articles_folder):
    file_path = os.path.join(articles_folder, file_name)
    if os.path.isfile(file_path):  # Check if it's a file and not a subdirectory
        print(file_name)
        
# Print the list of files in the summaries folder
print('\nSummaries:')
for file_name in os.listdir(summaries_folder):
    file_path = os.path.join(summaries_folder, file_name)
    if os.path.isfile(file_path):  # Check if it's a file and not a subdirectory
        print(file_name)

# Get the file paths for articles and summaries
articles_path = glob.glob(os.path.join(articles_folder, "*/*.txt"))
summaries_path = glob.glob(os.path.join(summaries_folder, "*/*.txt"))

# Loading the data into arrays
articles = []
summaries = []
categories = []

for article_path, summary_path in zip(articles_path, summaries_path):
    with open(article_path, "r", encoding="ISO-8859-1") as article_file:
        article_text = article_file.read()
        articles.append(article_text)
        
    with open(summary_path, "r", encoding="ISO-8859-1") as summary_file:
        summary_text = summary_file.read()
        summaries.append(summary_text)
        category = os.path.basename(os.path.dirname(article_path))
        categories.append(category)



News Articles:

Summaries:


In [3]:
# Creating pandas DataFrame
df = pd.DataFrame({
    'Articles': articles,
    'Summaries': summaries,
    'Categories': categories
})

# Print DataFramedf
df

Unnamed: 0,Articles,Summaries,Categories
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business
...,...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,BT is introducing two initiatives to help beat...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,A third of them read unsolicited junk e-mail a...,tech
2222,Be careful how you code\n\nA new European dire...,This goes to the heart of the European project...,tech
2223,US cyber security chief resigns\n\nThe man mak...,Amit Yoran was director of the National Cyber ...,tech


In [4]:
# !pip install pytorch-lightning
# !pip install transformers
# !pip install rouge
# !pip install sentencepiece

## BBC News

In [5]:
df['Articles'][3]

'High fuel prices hit BA\'s profits\n\nBritish Airways has blamed high fuel prices for a 40% drop in profits.\n\nReporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of Â£75m ($141m) compared with Â£125m a year earlier. Rod Eddington, BA\'s chief executive, said the results were "respectable" in a third quarter when fuel costs rose by Â£106m or 47.3%. BA\'s profits were still better than market expectation of Â£59m, and it expects a rise in full-year revenues.\n\nTo help offset the increased price of aviation fuel, BA last year introduced a fuel surcharge for passengers.\n\nIn October, it increased this from Â£6 to Â£10 one-way for all long-haul flights, while the short-haul surcharge was raised from Â£2.50 to Â£4 a leg. Yet aviation analyst Mike Powell of Dresdner Kleinwort Wasserstein says BA\'s estimated annual surcharge revenues - Â£160m - will still be way short of its additional fuel costs - a predicted extra Â£250m. Turnover for the q

In [6]:
#characters length of article
len(df['Articles'][3])

2412

In [7]:
# Getting article and summary word length 
df['Article Length'] = df["Articles"].apply(lambda x: len(x.split()))
df['Summary Length'] = df["Summaries"].apply(lambda x: len(x.split()))

df.head()

Unnamed: 0,Articles,Summaries,Categories,Article Length,Summary Length
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business,421,134
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business,384,158
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business,264,121
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business,406,197
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business,265,106


## Remove Punctuation and stopwords for Analysis

In [8]:
import string
# let's remove punctuation 
def rem_punct(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

df['Clean Article'] = df["Articles"].apply(rem_punct)
df['Clean Summaries'] = df['Summaries'].apply(rem_punct)
df.head()

Unnamed: 0,Articles,Summaries,Categories,Article Length,Summary Length,Clean Article,Clean Summaries
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business,421,134,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2 to...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business,384,158,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business,264,121,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos owner Menatep Group says it will ask Ros...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business,406,197,High fuel prices hit BAs profits\n\nBritish Ai...,Rod Eddington BAs chief executive said the res...
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business,265,106,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [9]:

# let's remove stopwords 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

nltk.download('stopwords')
nltk.download('punkt')
  
# remove stopwords function 
def rem_stopwords(text): 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return filtered_text 

df['Clean Article'] = df["Clean Article"].apply(rem_stopwords)
df['Clean Summaries'] = df['Clean Summaries'].apply(rem_stopwords)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Articles,Summaries,Categories,Article Length,Summary Length,Clean Article,Clean Summaries
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business,421,134,"[Ad, sales, boost, Time, Warner, profit, Quart...","[TimeWarner, said, fourth, quarter, sales, ros..."
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business,384,158,"[Dollar, gains, Greenspan, speech, The, dollar...","[The, dollar, hit, highest, level, euro, almos..."
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business,264,121,"[Yukos, unit, buyer, faces, loan, claim, The, ...","[Yukos, owner, Menatep, Group, says, ask, Rosn..."
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business,406,197,"[High, fuel, prices, hit, BAs, profits, Britis...","[Rod, Eddington, BAs, chief, executive, said, ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business,265,106,"[Pernod, takeover, talk, lifts, Domecq, Shares...","[Pernod, reduced, debt, took, fund, Seagram, p..."


## Creating DataFrame and spliting into train test dataset

In [10]:
df.isnull().sum()

Articles           0
Summaries          0
Categories         0
Article Length     0
Summary Length     0
Clean Article      0
Clean Summaries    0
dtype: int64

In [11]:
df=df[['Articles','Summaries']]

In [12]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df[['Articles','Summaries']], test_size = 0.1)

## Pegasus

In [13]:
import torch
from transformers import AutoTokenizer, PegasusForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def pegasus(input_text):
  model_name = 'google/pegasus-cnn_dailymail'
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
  batch = tokenizer(input_text, truncation=True, padding='longest', return_tensors="pt").to(device)
  translated = model.generate(**batch)
  summary = tokenizer.batch_decode(translated, skip_special_tokens=True)

  return summary


In [15]:
def generate_summaries(dataframe,article_column='Articles'):
    dataframe['generated_summary'] = dataframe[article_column].apply(lambda x: pegasus(x)[0])
    return dataframe

In [16]:
import random
# Select random rows
df_samples1 = df.sample(n=5).reset_index(drop=True)
df_samples1

Unnamed: 0,Articles,Summaries
0,Man Utd to open books to Glazer\n\nManchester ...,Many of Manchester United's supporters own sha...
1,US woman sues over cartridges\n\nA US woman is...,"A US woman is suing Hewlett Packard (HP), sayi..."
2,Capriati out of Australian Open\n\nJennifer Ca...,Jennifer Capriati has become the third leading...
3,UK youth 'interested' in politics\n\nThe major...,"""Mock elections can play an extremely valuable..."
4,Iraqi voters turn to economic issues\n\nBeyond...,Blackwatch's Baghdad-based affiliate Falcon Gr...


In [17]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'


In [18]:
generate_summaries(df_samples1)



Unnamed: 0,Articles,Summaries,generated_summary
0,Man Utd to open books to Glazer\n\nManchester ...,Many of Manchester United's supporters own sha...,Manchester United's board has agreed to give U...
1,US woman sues over cartridges\n\nA US woman is...,"A US woman is suing Hewlett Packard (HP), sayi...","US woman sues Hewlett Packard, saying its prin..."
2,Capriati out of Australian Open\n\nJennifer Ca...,Jennifer Capriati has become the third leading...,Jennifer Capriati has become the third leading...
3,UK youth 'interested' in politics\n\nThe major...,"""Mock elections can play an extremely valuable...",81% of 16 to 20-year-olds feel strongly about ...
4,Iraqi voters turn to economic issues\n\nBeyond...,Blackwatch's Baghdad-based affiliate Falcon Gr...,The economy has become the second-most dominan...


## Evaluation Peagasus

In [19]:

from rouge import Rouge

# Assuming your DataFrame is called 'df_samples' and the summary and predicted summary columns are 'Summaries' and 'generated_summary'
summary_list = df_samples1['Summaries'].tolist()
predicted_summary_list = df_samples1['generated_summary'].tolist()

# Convert the summaries from lists to strings
summary_list = [' '.join(summary) for summary in summary_list]
predicted_summary_list = [' '.join(summary) for summary in predicted_summary_list]

rouge = Rouge()
scores = rouge.get_scores(predicted_summary_list, summary_list, avg=True)

# Create a DataFrame to store the scores
scores_df_Peagasus = pd.DataFrame(scores)

# Print the ROUGE scores
print('scores_df_Peagasus')
print(scores_df_Peagasus)


scores_df_Peagasus
    rouge-1   rouge-2   rouge-l
r  0.709712  0.524881  0.653429
p  0.881152  0.791148  0.817445
f  0.779656  0.614950  0.720130


## BERT (Hugging-Face)

In [20]:
# Summary using BER with Hugging-Face
from transformers import pipeline 

def huggingFace(dataFrame):

  # Load the text summarization pipeline
  summarizer = pipeline("summarization")
  summary = summarizer(dataFrame, max_length=250, min_length=50, do_sample=False)[0]["summary_text"]

  # Print the generated summary
  return summary


In [21]:
def generate_summaries(dataframe,article_column='Articles'):
    dataframe['generated_summary'] = dataframe[article_column].apply(lambda x: huggingFace(x))
    return dataframe

In [22]:
import random
# Select random rows
df_samples2 = df.sample(n=5).reset_index(drop=True)
df_samples2

Unnamed: 0,Articles,Summaries
0,Newcastle to join Morientes race\n\nNewcastle ...,"On Tuesday, Morientes had said: ""I like Liverp..."
1,Chinese exports rise 25% in 2004\n\nExports fr...,The US's overall trade deficit with China was ...
2,Old Firm pair handed suspensions\n\nCeltic's H...,Camara will miss one additional game.Both play...
3,Qantas considers offshore option\n\nAustralian...,If Qantas were to follow the lead of other air...
4,New York rockers top talent poll\n\nNew York e...,New York electro-rock group The Bravery have c...


In [23]:
generate_summaries(df_samples2)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleife

Unnamed: 0,Articles,Summaries,generated_summary
0,Newcastle to join Morientes race\n\nNewcastle ...,"On Tuesday, Morientes had said: ""I like Liverp...",Newcastle join race to sign Fernando Moriente...
1,Chinese exports rise 25% in 2004\n\nExports fr...,The US's overall trade deficit with China was ...,Chinese exports rise 25% in 2004 over the pre...
2,Old Firm pair handed suspensions\n\nCeltic's H...,Camara will miss one additional game.Both play...,Both players have been handed two-match bans ...
3,Qantas considers offshore option\n\nAustralian...,If Qantas were to follow the lead of other air...,"Qantas could transfer 7,000 jobs out of its h..."
4,New York rockers top talent poll\n\nNew York e...,New York electro-rock group The Bravery have c...,New York electro-rock group The Bravery top S...


## Evaluation BERT HuggingFace

In [24]:

from rouge import Rouge

# Assuming your DataFrame is called 'df_samples' and the summary and predicted summary columns are 'Summaries' and 'generated_summary'
summary_list = df_samples2['Summaries'].tolist()
predicted_summary_list = df_samples2['generated_summary'].tolist()

# Convert the summaries from lists to strings
summary_list = [' '.join(summary) for summary in summary_list]
predicted_summary_list = [' '.join(summary) for summary in predicted_summary_list]

rouge = Rouge()
scores = rouge.get_scores(predicted_summary_list, summary_list, avg=True)

# Create a DataFrame to store the scores
scores_df_BERT = pd.DataFrame(scores)

# Print the ROUGE scores
print('scores_df_BERT_HiggingFace')
print(scores_df_BERT)


scores_df_BERT_HiggingFace
    rouge-1   rouge-2   rouge-l
r  0.755389  0.465973  0.700133
p  0.952465  0.851392  0.888215
f  0.838591  0.599670  0.779371


### Pickling of BERT MOdel

In [29]:
import pickle

def huggingFace(dataFrame):
    # Load the text summarization pipeline
    summarizer = pipeline("summarization")
    summary = summarizer(dataFrame, max_length=250, min_length=50, do_sample=False)[0]["summary_text"]

    # Print the generated summary
    return summary

# Pickle the huggingFace function
with open("huggingface.pickle", "wb") as f:
    pickle.dump(huggingFace, f)

In [None]:
import pickle

# Save the model as a pickle file
with open('bert_final.pkl', 'wb') as file:
    pickle.dump(huggingFace, file)

In [None]:
model =huggingFace

In [None]:

# Load the text summarization pipeline
summarizer = pipeline("summarization")
summary = summarizer(train_df['Articles'], max_length=250, min_length=50, do_sample=False)[0]["summary_text"]

In [25]:
import pickle

In [27]:
df_samples2['Articles'][0]

'Newcastle to join Morientes race\n\nNewcastle have joined the race to sign Real Madrid striker Fernando Morientes and scupper Liverpool\'s bid to snap up the player, according to reports.\n\nLiverpool were reported to have bid Â£3.5m for the 28-year-old Spanish international this week. But the Liverpool Echo newspaper has said Anfield boss Rafa Benitez will avoid a bidding war and instead turn his attentions to Nicolas Anelka. Real are believed to still want Â£7m before selling Morientes. Monaco are also in the race for the player they had on loan last season. Reports suggest Liverpool will lift their offer to Â£5m - the highest they are willing to go before bowing out of any deal.\n\nOn Tuesday, Morientes had said: "I like Liverpool and I am pleased that a club of their stature want to buy me. I have told Madrid that I want it to happen. "Madrid know my situation and they know they must do something about me. They must sort out the situation by being sensible. "I am in a position whe

In [28]:
model = huggingFace(input_text)  # Assign the function to the model variable
with open("bert_model.pkl", "wb") as file:
    pickle.dump(model, file)

# with open("bert_model.pkl", "rb") as file:
#         model = pickle.load(file)
#     return model

NameError: name 'input_text' is not defined

In [39]:
save_model()  # Save the BERT model
loaded_model = load_model()  # Load the BERT model


In [40]:
# Generate summaries for each row of the DataFrame
df_samples2['generated_summary'] = df_samples2['Articles'].apply(loaded_model)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 250, but your input_length is only 245. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=122)
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted

In [44]:
df_samples2

Unnamed: 0,Articles,Summaries,generated_summary
0,Housewives lift Channel 4 ratings\n\nThe debut...,BBC Two's share of the audience fell from 11.2...,Channel 4's January audience share has risen ...
1,Stallone evicted from Big Brother\n\nJackie St...,"""That alone was worth $1m,"" she said.Jackie St...","Jackie Stallone, mother of actor Sylvester, b..."
2,"Global digital divide 'narrowing'\n\nThe ""digi...","""The digital divide is rapidly closing,"" the W...",The World Bank questioned a United Nation's c...
3,Boothroyd calls for Lords speaker\n\nBetty Boo...,The lord chancellor currently has the role of ...,"House of Lords needs its own Speaker, says La..."
4,Moore questions captaincy\n\nBrian Moore belie...,Brian Moore believes the England captain shoul...,Brian Moore says full-backs are too far away ...


### Max Length

In [30]:
from transformers import pipeline

def huggingFace(text):
    # Load the text summarization pipeline
    summarizer = pipeline("summarization")

    # Split the text into chunks of maximum 600 words
    max_words = 600
    chunks = [text[i:i+max_words] for i in range(0, len(text), max_words)]

    summaries = []
    # Process each chunk and generate a summary
    for chunk in chunks:
        summary = summarizer(chunk, max_length=250, min_length=50, do_sample=False)[0]["summary_text"]
        summaries.append(summary)

    # Combine the summaries into a single string
    combined_summary = " ".join(summaries)

    # Return the combined summary
    return combined_summary

In [31]:
text = """ This report sets out the rationale and preliminary findings of the data sharing exercise between
G-7 and the World Bank, which has been extended to all Paris Club creditors on a voluntary basis.
The exercise was initiated by Japan as an important contribution to furthering the agenda for
greater debt transparency particularly regarding the world’s poorest countries. The current
exercise is the first step in the process of reconciling information reported by debtorsto the World
Bank Debtor Reporting System (DRS) with the comparable creditor data. As a second step, this
exercise could be institutionalized and extended to other creditors (in particular G20 creditors) in
order to improve data reporting in a permanent way.
I. The importance of better debt data transparency and the World Bank’s role
Comprehensive, accurate and transparent public debt data are fundamental to the
management of public liabilities, and the foundation of informed and sustainable borrowing
decisions. Knowing what is already owed is essential for policymakers to make informed
borrowing choices, creditors to appropriately price for sovereign risk, and citizens to hold their
governments accountable. Critically, the availability of high-quality data is a pre-requisite for the
ability of national governments and the international community to make informed debt
sustainability analyses, to minimize the risk of debt crisis, and to take timely remedial action when
they occur.
The World Bank has long played a leading role in promoting and delivering on greater debt data
transparency. It collects and compiles the single verifiable source of long-series, cross-country
comparable data on the external debt of low- and middle-income countries through the World
Bank Debtor Reporting System (DRS). The DRS requires regular (annual and quarterly) and
detailed (loan-by-loan) information on external public and publicly external debt from all World
Bank borrowers. The Bank regards the external debt data it collects through the DRS as a global
public good and has disseminated comprehensive information, updated on an annual basis, since
1973.
The DRS helps ensure debt data conform to international definitions and standards. The
reporting standards of the DRS accord with the methodology and definitions of other
international systems to which it links, i.e., the IMF Balance of Payments (BOP) and International
Investment Position (IIP), and the System of National Accounts (SNA). Thus, data drawn from the
DRS are cross-country comparable and the DRS also provides the yardstick to measure borrowers’
adherence to, or deviations from international standards and definitions. This is key to ensuring
that greater debt transparency brings more clarity and not confusion from a proliferation of
data compiled in accordance with national standards that cannot be readily compared."""

In [None]:
gereated  =""" ' This report sets out the rationale and preliminary findings of the data sharing exercise between G-7 and the World Bank . 
It has been extended to all Paris Club creditors on a voluntary basis . The exercise was initiated by Japan as an important contribution to 
furthering the agenda for greater debt transparency .  The importance of better debt data transparency and the World Bank’s role . 
Comprehensive, accurate and transparent public debt data are fundamental to the management of public liabilities .
 Knowing what is already owed is essential for policymakers to make informed and sustainable borrowing decisions .  
 The World Bank has long played a leading role in promoting and delivering on greater debt data . 
 The availability of high-quality data is a pre-requisite for the ability of national governments 
 and the international community to make informed debt-sustainability analyses, 
 to minimize the risk of debt crisis .  The World Bank regards the external debt data it collects through the DRS as a global good and has disseminated 
 comprehensive information, updated on an annual basis, since 1973 . The DRS helps ensure debt data conform to international definitions and standards .  
The DRS provides the yardstick to measure borrowers’ adherence to, or deviations from international standards and definitions . 
This is key to ensuring that greater debt transparency brings more clarity and not confusion . Data drawn from the DRS are cross-country comparable .'"""

In [32]:
huggingFace(text)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 250, but your input_length is only 124. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 250, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 250, but your input_length is only 128. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_le

' This report sets out the rationale and preliminary findings of the data sharing exercise between G-7 and the World Bank . It has been extended to all Paris Club creditors on a voluntary basis . The exercise was initiated by Japan as an important contribution to furthering the agenda for greater debt transparency .  The importance of better debt data transparency and the World Bank’s role . Comprehensive, accurate and transparent public debt data are fundamental to the management of public liabilities . Knowing what is already owed is essential for policymakers to make informed and sustainable borrowing decisions .  The World Bank has long played a leading role in promoting and delivering on greater debt data . The availability of high-quality data is a pre-requisite for the ability of national governments and the international community to make informed debt-sustainability analyses, to minimize the risk of debt crisis .  The World Bank regards the external debt data it collects throu

## T5_base (Hugging-Face)

In [38]:

from torch.utils.data import Dataset,DataLoader


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
)


class NewsSummaryDataset(Dataset):
  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      text_max_token_len: int = 512,
      summary_max_token_len: int = 128
      ):
    self.tokenizer = tokenizer
    self.data = data
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    text = data_row['Articles']
    
    text_encoding = tokenizer(
        text,
        max_length = self.text_max_token_len,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = 'pt'
    )
    
    summary_encoding = tokenizer(
        data_row['Summaries'],
        max_length= self.summary_max_token_len,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = 'pt'
    )

    labels = summary_encoding['input_ids']
    labels[labels == 0] = -100

    return dict(
        text=text,
        summary=data_row['Summaries'],
        text_input_ids = text_encoding['input_ids'].flatten(),
        text_attention_mask = text_encoding['attention_mask'].flatten(),
        labels = labels.flatten(),
        labels_attention_mask = summary_encoding['attention_mask'].flatten()
        )

* ***'pl.LightningDataModule'* simplifies and standardizes the data loading process, making it easier to maintain and scale your codebase**

In [39]:

import pytorch_lightning as pl



class NewsSummaryDataModule(pl.LightningDataModule):
  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      text_max_token_len: int = 512,
      summary_max_token_len: int = 128
      ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
  
  def setup(self, stage=None):
    self.train_dataset = NewsSummaryDataset(
        self.train_df,
        self.tokenizer,
        self.text_max_token_len,
    )

    self.test_dataset = NewsSummaryDataset(
        self.test_df,
        self.tokenizer,
        self.text_max_token_len,
    )

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers = 2
    )

In [40]:
# T5 tokenizer from the Hugging Face Transformers library using the 't5-base' model
#T5 model, which is a transformer-based model architecture for text generation tasks

MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 34.4MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 22.8MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 1.21MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Analysis for T5 

In [41]:

text_token_counts, summary_token_counts = [], []

for _, row in train_df.iterrows():
  text_token_count = len(tokenizer.encode(row['Articles']))
  text_token_counts.append(text_token_count)

  summary_token_count = len(tokenizer.encode(row['Summaries']))
  summary_token_counts.append(summary_token_count)

Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors


In [42]:
import matplotlib.pyplot as plt
import seaborn as sns
fig, (ax1, ax2) = plt.subplots(1, 2)
sns.histplot(text_token_counts, ax=ax1)
ax1.set_title('full text token counts')
sns.histplot(summary_token_counts, ax=ax2)

ModuleNotFoundError: No module named 'matplotlib'

## T5 continues...

In [43]:
N_EPOCHS = 2
BATCH_SIZE = 8

data_module = NewsSummaryDataModule(train_df, test_df, tokenizer)

In [44]:
class NewsSummaryModel(pl.LightningModule):
  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    output = self.model(
        input_ids,
        attention_mask = attention_mask,
        labels = labels,
        decoder_attention_mask = decoder_attention_mask
    )

    return output.loss, output.logits

  def training_step(self, batch, batch_size):
    input_ids = batch['text_input_ids']
    attention_mask = batch['text_attention_mask']
    labels = batch['labels']
    labels_attention_mask = batch['labels_attention_mask']

    loss, outputs = self(
        input_ids = input_ids,
        attention_mask = attention_mask,
        decoder_attention_mask = labels_attention_mask,
        labels = labels
    )

    self.log("test_loss", loss, prog_bar = True, logger = True)
    return loss

  def configure_optimizers(self):
    return AdamW(self.parameters(), lr = 0.0001)

In [45]:
model = NewsSummaryModel()

Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:23<00:00, 37.6MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 142kB/s]


In [47]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'best-checkpoint',
    save_top_k = 1,
    verbose = True,
    monitor = 'test_loss',
    mode = 'min'
)

logger = TensorBoardLogger("lightning_logs", name='news-summary')

trainer = pl.Trainer(
    logger = logger,
    callbacks = checkpoint_callback,
    max_epochs = N_EPOCHS
 
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [48]:
import torch
torch.cuda.empty_cache()

In [49]:
import gc
gc.collect()

1608

In [50]:
trainer.fit(model, data_module)

Missing logger folder: lightning_logs\news-summary

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


In [None]:

trained_model = NewsSummaryModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)
trained_model.freeze()

In [None]:
test_df.iloc[0]

Articles     'Fido' to be taken off vote lists\n\nThe risk ...
Summaries    The risk of pets and children being given vote...
Name: 733, dtype: object

In [None]:
import torch
def summarizeText(text, tokenizer, trained_model):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    trained_model = trained_model.to(device)

    text_encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    input_ids = text_encoding['input_ids'].to(device)
    attention_mask = text_encoding['attention_mask'].to(device)

    generated_ids = trained_model.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=2,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)



In [None]:
#Df with module generated summary for T5_base model

def generate_summaries(dataframe, article_column, tokenizer=tokenizer, trained_model=trained_model):
    dataframe['generated_summary'] = dataframe[article_column].apply(lambda x: summarizeText(x, tokenizer, trained_model))
    return dataframe

In [None]:
import random
# Select random rows
df_samples = df.sample(n=5).reset_index(drop=True)
df_samples

Unnamed: 0,Articles,Summaries
0,Gardener battles to narrow win\n\nJason Garden...,also set a new championship mark in the men's ...
1,Label withdraws McFadden's video\n\nThe new vi...,The head of Christian Brothers' school St Fint...
2,No jail for singer Courtney Love\n\nSinger Cou...,In a separate case relating to the same incide...
3,Lee to create new film superhero\n\nComic book...,A third Spider-Man film is scheduled for relea...
4,Lost Doors frontman movie found\n\nHistorians ...,"""We know he was at Florida State University fo..."


In [None]:
generate_summaries(df_samples,'Articles')

Unnamed: 0,Articles,Summaries,generated_summary
0,Gardener battles to narrow win\n\nJason Garden...,also set a new championship mark in the men's ...,"""It was a close race,"" admitted Gardener.The 2..."
1,Label withdraws McFadden's video\n\nThe new vi...,The head of Christian Brothers' school St Fint...,The new video of former Westlife singer Brian ...
2,No jail for singer Courtney Love\n\nSinger Cou...,In a separate case relating to the same incide...,"The assault charge stemmed from April 2004, wh..."
3,Lee to create new film superhero\n\nComic book...,A third Spider-Man film is scheduled for relea...,Comic book veteran Stan Lee is to team up with...
4,Lost Doors frontman movie found\n\nHistorians ...,"""We know he was at Florida State University fo...",Historians in Florida have discovered a 40-yea...


## Evaluation T5_base (Hugging-Face)

In [None]:
# Evaluation Using Rouge


from rouge import Rouge
# Assuming your DataFrame is called 'df' and the summary and predicted summary columns are 'summary' and 'predicted_summary'
summary_list = df_samples['Summaries'].tolist()
predicted_summary_list = df_samples['generated_summary'].tolist()

rouge = Rouge()
scores = rouge.get_scores(predicted_summary_list, summary_list, avg=True)

# Create a DataFrame to store the scores
scores_df = pd.DataFrame(scores)

# Print the ROUGE scores
print('T5_base (Hugging-Face) Score')
print(scores_df)


    rouge-1   rouge-2   rouge-l
r  0.571894  0.485545  0.571894
p  0.808078  0.734195  0.808078
f  0.658695  0.569680  0.658695


## Rough work like Rouge and Bleu code check with some model summary

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge


# Load the reference summaries and generated summaries for evaluation
reference_summaries = ['''Yukos owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets.State-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part settle a $27.5bn tax claim against Yukos.The sale went ahead in December and Yugansk was sold to a little-known shell company which in turn was bought by Rosneft.But the company has said it intends to take action against Menatep to recover some of the tax claims and debts owed by Yugansk."The pledged assets are with Rosneft, so it will have to pay real money to the creditors to avoid seizure of Yugansk assets," said Moscow-based US lawyer Jamie Firestone, who is not connected to the case.'''
, '''The researchers conducted a study to analyze the effects of sleep deprivation on cognitive performance. They gathered a group of participants and divided them into two groups: one group was allowed a full night's sleep, while the other group was deprived of sleep for 24 hours. The participants were then given a series of cognitive tests to assess their performance. The results showed that the sleep-deprived group performed significantly worse on the tests compared to the well-rested group. This indicates that sleep deprivation can have a detrimental effect on cognitive abilities.''']
generated_summaries = ['''State-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part settle a $27.5bn tax claim against Yukos.Yukos owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets.But the company has said it intends to take action against Menatep to recover some of the tax claims and debts owed by Yugansk.Yukos owner Menatep Group says it will ask Rosneft to repay # List of generated summaries''',
'''Researchers conducted a study to analyze the effects of sleep deprivation on cognitive performance . One group was allowed a full night's sleep, while the other group was deprived of sleep for 24 hours . The results showed that the sleep-deprived group performed significantly worse on the tests compared to the well-rested group .''']

# Calculate BLEU score
references = [[summary.split()] for summary in reference_summaries]
hypotheses = [summary.split() for summary in generated_summaries]
bleu_score = corpus_bleu(references, hypotheses)

# Calculate ROUGE score
rouge = Rouge()
scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
rouge_1_score = scores["rouge-1"]["f"]
rouge_2_score = scores["rouge-2"]["f"]
rouge_l_score = scores["rouge-l"]["f"]

# Print the evaluation scores
print("BLEU Score:", bleu_score)
print("ROUGE-1 Score:", rouge_1_score)
print("ROUGE-2 Score:", rouge_2_score)
print("ROUGE-L Score:", rouge_l_score)

In [None]:

## ROUGE and BLEU code for evalation



# from nltk.translate.bleu_score import corpus_bleu
# from rouge import Rouge
# # summary_list = []
# for index, row in df_samples.iterrows():
#     # Access individual columns of the row using column names
#     # summary_list.append({'reference_summary':row['Summaries'], 'generated_summary':row['generated_summary']})
#     reference_summaries = row['Summaries']
#     generated_summaries = row['generated_summary']

#     # Calculate BLEU score
#     references = [[summary.split()] for summary in reference_summaries]
#     hypotheses = [summary.split() for summary in generated_summaries]
#     bleu_score = corpus_bleu(references, hypotheses)

#     # Calculate ROUGE score
#     rouge = Rouge()
#     scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
#     rouge_1_score = scores["rouge-1"]["f"]
#     rouge_2_score = scores["rouge-2"]["f"]
#     rouge_l_score = scores["rouge-l"]["f"]

#     # Print the evaluation scores
#     print("BLEU Score:", bleu_score)
#     print("ROUGE-1 Score:", rouge_1_score)
#     print("ROUGE-2 Score:", rouge_2_score)
#     print("ROUGE-L Score:", rouge_l_score)



In [None]:
df['Articles'][1]

'Houllier praises Benitez regime\n\nFormer Liverpool manager Gerard Houllier has praised the work of his Anfield successor Rafael Benitez.\n\nHoullier was angry at reports that he has been critical of Benitez since the Spaniard took over at Liverpool. But Houllier told BBC Sport: "In private and in public, I have stressed I believe Rafa is doing a good job. He is the right man at the right place. "Rafa is a good coach and a good man. I\'ve spoken to him since he has been at Liverpool and never criticised him." Houllier also revealed he is now ready to return to the game after leaving Liverpool in May following six years at Anfield.\n\nThe former France boss has been linked with a host of jobs and pulled out of the race to succeed Mark Hughes as Wales national coach. He has been working for Uefa, covering the Premiership for French television and also coaching in Brazil with national coach Carlos Alberto Perreira. Houllier said: "If a good club comes up at the right time then yes, I am 

In [None]:
sample_row = test_df.iloc[1]
text = sample_row['Articles']
model_summary = summarizeText(text, tokenizer, trained_model)
text

'Jack Cunningham to stand down\n\nVeteran Labour MP and former Cabinet minister Jack Cunningham has said he will stand down at the next election.\n\nOne of the few Blair-era ministers to serve under Jim Callaghan, he was given the agriculture portfolio when Labour regained power in 1997. Mr Cunningham went on to become Tony Blair\'s "cabinet enforcer". He has represented the constituency now known as Copeland since 1970. Mr Blair said he was a "huge figure" in Labour and a "valued, personal friend".\n\nDuring Labour\'s long period in opposition, Mr Cunningham held a number of shadow roles including foreign affairs, the environment and as trade spokesman. As agriculture minister he caused controversy when he decided to ban beef on the bone in the wake of fears over BSE. He quit the government in 1999 and in recent years has served as the chairman of the all-party committee on Lords reform and has been a loyal supporter of the government from the backbenches.\n'

In [None]:
sample_row['Summaries']

In [None]:
model_summary

In [None]:
sample_row = test_df.iloc[2]
text = sample_row['Articles']
model_summary = summarizeText(text, tokenizer, trained_model)
text

## WORD BANK text summarization for check

In [None]:
!pip install PyPDF2
!pip install kaggle


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import numpy as np
import nltk
from PyPDF2 import PdfReader
import string

In [None]:
with open('/content/G7-and-Paris-Club-Data-Reconciliation-Exercise-April-2023.pdf', 'rb') as file:
    pdf_reader = PdfReader(file)
    num_pages = len(pdf_reader.pages)
    
    # Specify the number of lines to skip from the heading
    lines_to_skip = 3
    
    for page_number in range(num_pages):
        page = pdf_reader.pages[page_number]
        text = page.extract_text()
        
        # Split the text into lines
        lines = text.split('\n')
        
        # Extract the lines after skipping the specified number of lines
        lines_after_heading = lines[lines_to_skip:]
        
        # Join the lines back into a single string
        updated_text = '\n'.join(lines_after_heading)
        
        print(updated_text)


FileNotFoundError: ignored