# Text Summarization

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Swapnil
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

In [3]:
STOPWORDS = set(stopwords.words('english') + list(punctuation))
MIN_WORD_PROP, MAX_WORD_PROP = 0.05, 0.85

In [4]:
def compute_word_frequencies(word_sentences):
    words = [word for sentence in word_sentences
                    for word in sentence
                        if word not in STOPWORDS]
    counter = Counter(words)
    limit = float(max(counter.values()))
    word_frequencies = {word: freq/limit
                                   for word, freq in counter.items()}
    # Drop words if too common or too uncommon
    word_frequencies = {word:freq
                           for word,freq in word_frequencies.items()
                               if freq > MIN_WORD_PROP
                               and freq < MAX_WORD_PROP}
    return word_frequencies

def sentence_score(word_sentence, word_frequencies):
    return sum([word_frequencies.get(word,0)
                    for word in word_sentence])


def summarize(text:str, num_sentences = 3):
    '''
    Summarize the text, return the most relevant sentences
    :text the text to summarize
    :num_sentences the number of sentences to return
    '''
    text = text.lower() # Make the text lowercase
    sentences = sent_tokenize(text) # Breaks texts into sentences
    
    # Break sentences into words
    word_sentences = [word_tokenize(sentence) for sentence in sentences]
    
    # COmpute word frequencies
    word_frequencies = compute_word_frequencies(word_sentences)
    
    # Calculate the scores for each of the sentences
    scores = [sentence_score(word_sentence, word_frequencies) for word_sentence in word_sentences]
    sentence_scores = list(zip(sentences, scores))
    
    # Rank the sentences
    top_sentence_scores = nlargest(num_sentences, sentence_scores, key = lambda t: t[1])
    
    # Return the top sentences
    return[t[0] for t in top_sentence_scores]
    
with open("C:/Users/Swapnil Mishra/Desktop/DS/Text Mining/Text Summarization/Lordoftherings.txt",'r',encoding='utf-8') as file:
    lor = file.read()
    
# Cleaned text
lor = lor.replace('\n', ' ').strip()

# Summarize
print(f"Total Sentences: {len(sent_tokenize(lor))}")
print("Summary (3 sentences):", summarize(lor))
print("Summary (1 sentence):", summarize(lor, num_sentences=1))

Total Sentences: 21
Summary (3 sentences): ['‘you had better come and live here, frodo my lad,’ said bilbo one day; ‘and then we can celebrate our birthday-parties comfortably together.’ at that time frodo was still in his tweens, as the hobbits called the irresponsible twenties between childhood and coming of age at thirty-three.', 'bilbo was going to be eleventy-one, 111, a rather curious number and a very respectable age for a hobbit (the old took himself had only reached 130); and frodo was going to be thirty-three, 33) an important number: the date of his ‘coming of age’.', 'when mr. bilbo baggins of bag end announced that he would shortly be celebrating his eleventy-first birthday with a party of special magnificence, there was much talk and excitement in hobbiton.']
Summary (1 sentence): ['‘you had better come and live here, frodo my lad,’ said bilbo one day; ‘and then we can celebrate our birthday-parties comfortably together.’ at that time frodo was still in his tweens, as the

In [5]:
def count_words(text):
    """Count the number of words in the text."""
    words = word_tokenize(text)
    return len(words)

# Count words before summarization
original_word_count = count_words(lor)

# Get the summary
summary = summarize(lor, num_sentences=3)

# Count words after summarization
summary_word_count = count_words(' '.join(summary))

# Print results
print(f"Word Count Before Summarization: {original_word_count}")
print(f"Word Count After Summarization: {summary_word_count}")


Word Count Before Summarization: 575
Word Count After Summarization: 145
