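# b. ABSTRACTIVE TEXT SUMMARIZATION WITH TRANSFORMERS

**Note:** the cells in this section rank and select existing sentences using Sentence-BERT embeddings, so they perform *extractive* summarization (the same approach as section c). No text-generating (abstractive) model is used here.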

In [None]:
import spacy
from sentence_transformers import SentenceTransformer, util


In [None]:
# Load the spaCy "en_core_web_sm" pipeline. `nlp` is used further down to turn
# the raw text into a Doc whose `.sents` provides the sentence boundaries.
# NOTE(review): presumably the model package must already be installed in the
# environment for this call to succeed — confirm for fresh runtimes.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Sentence-BERT encoder used to embed each sentence before computing pairwise
# cosine similarities. The first call downloads the checkpoint files.
# NOTE(review): the Hugging Face model card for this checkpoint reportedly marks
# it as deprecated (low-quality embeddings) — confirm and consider a newer model.
model = SentenceTransformer('bert-base-nli-mean-tokens')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:

# Sample text
text = """Machine learning is a branch of artificial intelligence that focuses on building
systems that learn from data.
It has become essential in applications like recommendation systems, fraud detection, and
autonomous driving.
Despite its success, challenges such as overfitting and data quality issues remain
important considerations."""

# How many sentences the summary should contain (was a hard-coded 2)
NUM_SUMMARY_SENTENCES = 2

# Split the text into sentences with spaCy
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]

# Encode sentences using Sentence-BERT
embeddings = model.encode(sentences, convert_to_tensor=True)

# Pairwise cosine similarity between all sentence embeddings
cosine_scores = util.cos_sim(embeddings, embeddings)

# Score each sentence by its mean similarity to every sentence. The mean
# includes the sentence's similarity with itself, which adds the same amount to
# every score and therefore does not affect the ranking.
avg_scores = cosine_scores.mean(dim=1)

# Pick the highest-scoring sentences, then sort the chosen indices so the
# summary presents them in their original document order. Joining them in
# score order can put a sentence before the one it refers back to.
# `.tolist()` converts the tensor indices to plain Python ints.
top_indices = sorted(
    avg_scores.argsort(descending=True)[:NUM_SUMMARY_SENTENCES].tolist()
)

# Create the summary from the selected sentences
summary = ' '.join(sentences[i] for i in top_indices)

# Output the summary
print("Extractive Summary:\n", summary)


Extractive Summary:
 It has become essential in applications like recommendation systems, fraud detection, and
autonomous driving. Despite its success, challenges such as overfitting and data quality issues remain
important considerations.


# a. BASIC TEXT SUMMARIZATION USING TF-IDF AND COSINE SIMILARITY

In [None]:
import nltk
# Fetch the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably this is the resource nltk.sent_tokenize needs in the
# installed NLTK version — confirm. The call's return value is shown as the
# cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:

import re
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Sample text for the TF-IDF summarizer: five short sentences, one per line.
# This rebinds the name `text` that the Sentence-BERT section also assigns.
text = """Artificial Intelligence is rapidly transforming the world.
It has applications in healthcare, finance, education, and more.
AI systems can process large amounts of data efficiently.
However, there are challenges such as bias and ethical concerns.
Ongoing research aims to make AI safer and more transparent."""

# Function to clean text
def clean_text(text):
    """Normalize raw text before it is split into sentences.

    Bracketed numeric citation markers such as ``[1]`` (an empty ``[]`` also
    matches) are replaced by a space, every run of whitespace is collapsed
    into a single space, and the result is lowercased. Leading and trailing
    whitespace is collapsed but not stripped.

    Args:
        text: The input string.

    Returns:
        The cleaned, lowercased string.
    """
    without_citations = re.sub(r'\[[0-9]*\]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_citations)
    return single_spaced.lower()

# How many sentences the summary should contain (was a hard-coded 2)
NUM_SUMMARY_SENTENCES = 2

# Clean the input text
cleaned_text = clean_text(text)

# Split the cleaned text into sentences.
# NOTE(review): the sentences come from the cleaned (lowercased) text, so the
# printed summary is lowercased as well.
sentences = nltk.sent_tokenize(cleaned_text)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform sentences into a TF-IDF matrix (one row per sentence)
X = vectorizer.fit_transform(sentences)

# Compute the sentence-by-sentence cosine similarity matrix
similarity_matrix = cosine_similarity(X, X)

# Build a weighted graph whose nodes are sentence indices and whose edge
# weights are the similarities
nx_graph = nx.from_numpy_array(similarity_matrix)

# Apply PageRank; `scores` maps sentence index -> importance score
scores = nx.pagerank(nx_graph)

# Rank sentence indices by PageRank score, highest first. Ties fall back to the
# sentence text, matching the ordering of sorting (score, sentence) tuples.
ranked_indices = sorted(
    range(len(sentences)),
    key=lambda i: (scores[i], sentences[i]),
    reverse=True,
)
ranked_sentences = [(scores[i], sentences[i]) for i in ranked_indices]

# Keep the top-ranked sentences. Slicing (instead of indexing range(2)) avoids
# an IndexError when the text has fewer sentences than requested. The chosen
# indices are then sorted so the summary follows the original document order;
# otherwise a sentence starting with "it ..." can end up after an unrelated
# sentence and lose its referent.
top_indices = sorted(ranked_indices[:NUM_SUMMARY_SENTENCES])
summary_sentences = [sentences[i] for i in top_indices]

# Combine the selected sentences to form the summary
summary = ' '.join(summary_sentences)

# Output the summary
print("Summary:\n", summary)


Summary:
 ongoing research aims to make ai safer and more transparent. it has applications in healthcare, finance, education, and more.


# c. EXTRACTIVE SUMMARIZATION USING BERT AND SPACY

In [None]:
import spacy
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the spaCy pipeline (used for sentence splitting) and the Sentence-BERT
# encoder (used for sentence embeddings). These are the same two objects that
# the earlier cells already bind to `nlp` and `model`; loading them again here
# lets this section run on its own.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
text = """Machine learning is a branch of artificial intelligence that focuses on building
systems that learn from data.
It has become essential in applications like recommendation systems, fraud detection, and
autonomous driving.
Despite its success, challenges such as overfitting and data quality issues remain
important considerations."""
# Number of sentences to keep in the summary (was a hard-coded 2)
NUM_SUMMARY_SENTENCES = 2
# Tokenize sentences with spaCy
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]
# Encode sentences with Sentence-BERT
embeddings = model.encode(sentences, convert_to_tensor=True)
# Compute the pairwise cosine similarity matrix
cosine_scores = util.cos_sim(embeddings, embeddings)
# Rank sentences by their average similarity to all sentences
avg_scores = cosine_scores.mean(dim=1)
# Take the best-scoring indices as plain ints, then sort them so the selected
# sentences are joined in document order rather than in score order.
top_indices = sorted(
    avg_scores.argsort(descending=True)[:NUM_SUMMARY_SENTENCES].tolist()
)
summary = ' '.join(sentences[i] for i in top_indices)
print("Extractive Summary:\n", summary)

Extractive Summary:
 It has become essential in applications like recommendation systems, fraud detection, and
autonomous driving. Despite its success, challenges such as overfitting and data quality issues remain
important considerations.
