<a href="https://colab.research.google.com/github/om123-collab/playground-f4tod5ym/blob/master/8_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import nltk
# Fetch the NLTK data packages used below. This runs once at module level,
# so summarize() itself never triggers a download.
nltk.download('punkt_tab') # tokenizer data (for sent_tokenize / word_tokenize)
nltk.download('stopwords') # stop-word lists read via stopwords.words('english')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def summarize(text: str, num_sentences: int = 3) -> str:
  """
  Build an extractive summary from the highest-scoring sentences of `text`.

  Each sentence is lower-cased, stripped of English stop words and turned
  into a TF-IDF vector. A sentence's score is the sum of its cosine
  similarities to every sentence (itself included). This is a one-shot
  centrality measure; it does not run the iterative PageRank step of full
  TextRank.

  Args:
    text: The input text to summarize.
    num_sentences: The maximum number of sentences to include in the summary.

  Returns:
    The selected sentences joined by single spaces, highest score first
    (score ties are broken by reverse lexicographic order of the sentence
    text). Returns an empty string when `text` is empty or whitespace-only,
    or when `num_sentences` is not positive. If no sentence keeps any
    usable term after preprocessing, the leading sentences are returned
    unranked.
  """
  # Guard first: a negative count would otherwise slice "all but the last k"
  # ranked sentences, and empty text would make TfidfVectorizer raise.
  if num_sentences <= 0 or not text or not text.strip():
    return ""

  sentences = sent_tokenize(text)
  if not sentences:
    return ""

  stop_words = set(stopwords.words('english'))

  # Preprocess sentences: lower-case, tokenize, drop stop words.
  clean_sentences = [
    " ".join(
      word for word in word_tokenize(sentence.lower())
      if word not in stop_words
    )
    for sentence in sentences
  ]

  # Create TF-IDF vectors
  try:
    tfidf_matrix = TfidfVectorizer().fit_transform(clean_sentences)
  except ValueError:
    # scikit-learn raises ValueError when the vocabulary is empty (e.g. the
    # text holds only stop words or punctuation). Nothing can be ranked, so
    # fall back to the leading sentences instead of crashing.
    return " ".join(sentences[:num_sentences])

  # Pairwise sentence-to-sentence similarity (square, one row per sentence).
  similarity_matrix = cosine_similarity(tfidf_matrix)

  # Score each sentence by the sum of its similarity row.
  scores = [sum(row) for row in similarity_matrix]

  # Rank (score, sentence) pairs from highest to lowest and keep the top ones.
  ranked_sentences = sorted(zip(scores, sentences), reverse=True)
  return " ".join(sentence for _, sentence in ranked_sentences[:num_sentences])

# Example usage: summarize a short passage of four one-line sentences and
# print the result. num_sentences is left at summarize()'s default.
text = """This is an example text.
It contains multiple sentences.
We want to summarize this text.
The summary should be concise and informative."""

summary = summarize(text)
print(summary)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


We want to summarize this text. This is an example text. The summary should be concise and informative.
