In [None]:
import torch
print(torch.cuda.is_available())

True


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
base_path = '/content/drive/MyDrive/datasets/NLP/text summarization/BBC News Summary'
articles_path = os.path.join(base_path, 'News Articles')
summaries_path = os.path.join(base_path, 'Summaries')

categories = ['business', 'entertainment', 'politics', 'sport', 'tech']
data = []

for category in categories:
  article_dir = os.path.join(articles_path, category)
  summary_dir = os.path.join(summaries_path, category)

for filename in os.listdir(article_dir):
  article_file = os.path.join(article_dir, filename)
  summary_file = os.path.join(summary_dir, filename)

  if os.path.exists(article_file) and os.path.exists(summary_file):
    with open(article_file, 'r') as file:
      article = file.read().strip()
    with open(summary_file, 'r') as file:
      summary = file.read().strip()

    data.append(
        {
            'category' : category,
            'filename' : filename,
            'article' : article,
            'summary' : summary
        }
    )

df = pd.DataFrame(data)
print(df.head())

  category filename                                            article  \
0     tech  262.txt  Broadband steams ahead in the US\n\nMore and m...   
1     tech  142.txt  Text messages aid disaster recovery\n\nText me...   
2     tech  347.txt  Cebit opens to mobile music tune\n\nCebit, the...   
3     tech  048.txt  'No re-draft' for EU patent law\n\nA proposed ...   
4     tech  179.txt  Seamen sail into biometric future\n\nThe luxur...   

                                             summary  
0  Broadband over the phone line makes up 11.4 mi...  
1  Right now, the Alert Retrieval Cache can only ...  
2  "The digital home will be a hyped theme at the...  
3  A proposed European law on software patents wi...  
4  She said French, Jordanian and Nigerian nation...  


In [None]:
def clean_text(text):
    return text.replace('\n', ' ').replace('\r', ' ').strip()

# Apply to both article and summary columns
df['article_clean'] = df['article'].apply(clean_text)
df['summary_clean'] = df['summary'].apply(clean_text)

# Preview cleaned text
print(df[['article_clean', 'summary_clean']].head())

                                       article_clean  \
0  Broadband steams ahead in the US  More and mor...   
1  Text messages aid disaster recovery  Text mess...   
2  Cebit opens to mobile music tune  Cebit, the w...   
3  'No re-draft' for EU patent law  A proposed Eu...   
4  Seamen sail into biometric future  The luxury ...   

                                       summary_clean  
0  Broadband over the phone line makes up 11.4 mi...  
1  Right now, the Alert Retrieval Cache can only ...  
2  "The digital home will be a hyped theme at the...  
3  A proposed European law on software patents wi...  
4  She said French, Jordanian and Nigerian nation...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

nltk.download('punkt')
text = df['article'].iloc[0]

sentences = nltk.sent_tokenize(text) #sentences la list chua cac cau cua van ban

vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform(sentences)

similarity_matrix = cosine_similarity(x)
sentence_scores = similarity_matrix.sum(axis=1)

top_n = 3
top_sentence_indices = sentence_scores.argsort()[-top_n:][::-1]
top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

# Print results
print("Original Article:\n", text[:500], "\n...")
print("\nTraditional Extractive Summary:")
for sent in top_sentences:
    print("-", sent)


Original Article:
 Broadband steams ahead in the US

More and more Americans are joining the internet's fast lane, according to official figures.

The number of people and business connected to broadband jumped by 38% in a year, said the US Federal Communications Commission (FCC). In a report, it said there were more than 32 million broadband connections by the end of June 2004. But the US is still behind compared to other nations, ranked 13th in the world by a UN telecoms body.

During his 2004 re-election campai 
...

Traditional Extractive Summary:
- In a report, it said there were more than 32 million broadband connections by the end of June 2004.
- The total number of people and businesses on broadband rose by to 32.5 million in the year ending June 2004, compared to 23.5 million in June 2003.
- Broadband over the phone line makes up 11.4 million connections, according to the FCC figures.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [None]:
from sumy.parsers.plaintext import PlaintextParser #chuyen van ban thuan thanh doi tuong ma sumy co the xu ly
from sumy.nlp.tokenizers import Tokenizer #tach van ban thanh cau/tu
from sumy.summarizers.text_rank import TextRankSummarizer #trien khai thuat toan textrank de tom tat

# Use the same article
text = df['article_clean'].iloc[0]

# Set up parser and summarizer
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer()

# Generate summary with N sentences
summary = summarizer(parser.document, sentences_count=3) #chi giu lai 3 cau quan trong

# Print result
print("TextRank Summary:")
for sentence in summary:
    print("-", sentence)

TextRank Summary:
- The number of people and business connected to broadband jumped by 38% in a year, said the US Federal Communications Commission (FCC).
- According to the report by the FCC, broadband is becoming increasingly popular, with people using it for research and shopping, as well as downloading music and watching video.
- The total number of people and businesses on broadband rose by to 32.5 million in the year ending June 2004, compared to 23.5 million in June 2003.


In [None]:
!pip install transformers sentencepiece



In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Select your article text
text = df['article_clean'].iloc[0]

# Tokenize and encode the input text
inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary (you can tweak length and decoding strategy)
summary_ids = model.generate(
    inputs,
    max_length=130,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print('Article: \n', df['article_clean'].iloc[0])
print("BART Summary:\n", summary)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Article: 
 Broadband steams ahead in the US  More and more Americans are joining the internet's fast lane, according to official figures.  The number of people and business connected to broadband jumped by 38% in a year, said the US Federal Communications Commission (FCC). In a report, it said there were more than 32 million broadband connections by the end of June 2004. But the US is still behind compared to other nations, ranked 13th in the world by a UN telecoms body.  During his 2004 re-election campaign, President George W Bush pledge to ensure that affordable high-speed net access would be available to all Americans by 2007.  According to the report by the FCC, broadband is becoming increasingly popular, with people using it for research and shopping, as well as downloading music and watching video. The total number of people and businesses on broadband rose by to 32.5 million in the year ending June 2004, compared to 23.5 million in June 2003. Whereas in the UK, most people hook

In [None]:
# from transformers import BartForConditionalGeneration, BartTokenizer
# import nltk
# nltk.download('punkt')

# model_name = "facebook/bart-large-cnn"
# tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartForConditionalGeneration.from_pretrained(model_name)

# text = " ".join(df['article_clean'].iloc[0].split())

# inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)

# summary_ids = model.generate(
#     inputs,
#     max_length=150,
#     min_length=50,
#     length_penalty=1.0,
#     num_beams=6,
#     repetition_penalty=2.5,
#     no_repeat_ngram_size=3,
#     early_stopping=True
# )

# summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# # Chia câu và lọc
# sentences = nltk.sent_tokenize(summary)
# summary = " ".join(sentences[:3])  # lấy 3 câu đầu

# print("BART Summary:\n", summary)
