<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Text Summarization</H1></u></center>

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation

In [3]:
# Tokens ignored when counting word frequencies: English stop words plus punctuation.
# The corpus is reached through `nltk.corpus` rather than the imported name
# `stopwords`, because this assignment rebinds `stopwords` to a set; reading the
# imported name here would fail with AttributeError when the cell is re-run.
stopwords = set(nltk.corpus.stopwords.words('english') + list(punctuation))
# Normalized-frequency cut-offs: words at or below cut_min, or at or above
# cut_max, are discarded by compute_frequencies.
cut_min = 0.2
cut_max = 0.8

In [4]:
def compute_frequencies(word_sent, min_cut=None, max_cut=None, stop_words=None):
    """Compute normalized frequencies of the non-stop-word tokens in a text.

    Parameters
    ----------
    word_sent : iterable of iterable of str
        Tokenized sentences, one sequence of tokens per sentence.
    min_cut, max_cut : float, optional
        Words whose normalized frequency is <= min_cut or >= max_cut are
        dropped. When omitted, the notebook-level ``cut_min`` / ``cut_max``
        are used.
    stop_words : container of str, optional
        Tokens to ignore while counting. When omitted, the notebook-level
        ``stopwords`` set is used.

    Returns
    -------
    defaultdict
        Maps each kept word to ``count / count_of_most_frequent_word``.
        Empty when the input contains no countable token.
    """
    if min_cut is None:
        min_cut = cut_min
    if max_cut is None:
        max_cut = cut_max
    if stop_words is None:
        stop_words = stopwords

    freq = defaultdict(int)
    for s in word_sent:
        for word in s:
            if word not in stop_words:
                freq[word] += 1

    # Nothing to normalize; max() on an empty sequence would raise ValueError.
    if not freq:
        return freq

    n = float(max(freq.values()))
    # Iterate over a snapshot of the keys: deleting entries while iterating the
    # live view raises "dictionary changed size during iteration" on Python 3.
    for w in list(freq):
        freq[w] = freq[w] / n
        if freq[w] >= max_cut or freq[w] <= min_cut:
            del freq[w]
    return freq

In [5]:
def rank(ranking, n):
    """Return the first ``n`` keys of ``ranking`` ordered by descending value.

    ``ranking`` is a mapping from key to score; keys with equal scores keep
    the order in which the mapping yields them.
    """
    ordered_keys = list(ranking)
    ordered_keys.sort(key=lambda k: ranking.get(k), reverse=True)
    return ordered_keys[:n]

In [6]:
def summarize(text, n):
    """Return up to ``n`` sentences of ``text``, highest-scoring first.

    The text is split with ``sent_tokenize``; each sentence is lower-cased and
    tokenized, and its score is the sum of the normalized frequencies (from
    ``compute_frequencies``) of the tokens it contains. Sentences with no
    scored token are never selected, so fewer than ``n`` may be returned.

    Raises AssertionError when ``n`` exceeds the number of sentences.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)

    tokenized = [word_tokenize(sentence.lower()) for sentence in sents]
    freq = compute_frequencies(tokenized)

    # Sentence index -> accumulated score, only for sentences with a scored token.
    scores = {}
    for idx, tokens in enumerate(tokenized):
        weights = [freq[token] for token in tokens if token in freq]
        if weights:
            scores[idx] = sum(weights)

    return [sents[idx] for idx in rank(scores, n)]

In [7]:
# Read the sample passage; the `with` block closes the file handle once the
# contents are in memory (the bare open() left it open for the kernel's lifetime).
with open('../data/carl_sagan_quote.txt') as f:
    text = f.read()
text

"What an astonishing thing a book is. It's a flat object made from a tree with flexible parts on which are imprinted lots of funny dark squiggles. But one glance at it and you're inside the mind of another person, maybe somebody dead for thousands of years. Across the millennia, an author is speaking clearly and silently inside your head, directly to you. Writing is perhaps the greatest of human inventions, binding together people who never knew each other, citizens of distant epochs. Books break the shackles of time. A book is proof that humans are capable of working magic."

In [8]:
# Pick the two highest-scoring sentences of the passage.
summarize(text, n=2)

['Writing is perhaps the greatest of human inventions, binding together people who never knew each other, citizens of distant epochs.',
 "It's a flat object made from a tree with flexible parts on which are imprinted lots of funny dark squiggles."]

## Another approach

In [9]:
# Collects one (sentence index, score, sentence) tuple per sentence.
results = list()

In [10]:
# Score every sentence by its density of nouns and named entities:
#     score = (named entities + NN/NNP tokens) / number of tokens
# `results` is rebuilt here so that re-running this cell does not append
# duplicate rows to a list created in another cell.
results = []
for number, sentence in enumerate(nltk.sent_tokenize(text)):
    # Tokenize and POS-tag once per sentence and reuse the result below.
    tokens = nltk.word_tokenize(sentence)
    number_tokens = len(tokens)
    if number_tokens == 0:
        # Nothing to score; also avoids a division by zero.
        continue
    tagged = nltk.pos_tag(tokens)
    number_nouns = len([word for word, pos in tagged if pos in ["NN", "NNP"]])
    # Use NER to tag the named entities. In the chunk tree, named entities are
    # subtrees (they expose `label()`), while ordinary tokens stay plain
    # (word, tag) tuples. The old `node` attribute is no longer the NLTK 3 API.
    ners = nltk.ne_chunk(tagged, binary=False)
    number_ners = len([chunk for chunk in ners if hasattr(chunk, 'label')])
    score = (number_ners + number_nouns) / float(number_tokens)
    results.append((number, score, sentence))

In [11]:
# Show the raw (sentence index, score, sentence) tuples in document order.
print(results)

[(0, 0.25, 'What an astonishing thing a book is.'), (1, 0.13636363636363635, "It's a flat object made from a tree with flexible parts on which are imprinted lots of funny dark squiggles."), (2, 0.17391304347826086, "But one glance at it and you're inside the mind of another person, maybe somebody dead for thousands of years."), (3, 0.15789473684210525, 'Across the millennia, an author is speaking clearly and silently inside your head, directly to you.'), (4, 0.043478260869565216, 'Writing is perhaps the greatest of human inventions, binding together people who never knew each other, citizens of distant epochs.'), (5, 0.2857142857142857, 'Books break the shackles of time.'), (6, 0.16666666666666666, 'A book is proof that humans are capable of working magic.')]


In [12]:
# Print the sentences from highest to lowest score (element 1 of each tuple).
by_score = sorted(results, key=lambda entry: entry[1], reverse=True)
for entry in by_score:
    print(entry[2])

Books break the shackles of time.
What an astonishing thing a book is.
But one glance at it and you're inside the mind of another person, maybe somebody dead for thousands of years.
A book is proof that humans are capable of working magic.
Across the millennia, an author is speaking clearly and silently inside your head, directly to you.
It's a flat object made from a tree with flexible parts on which are imprinted lots of funny dark squiggles.
Writing is perhaps the greatest of human inventions, binding together people who never knew each other, citizens of distant epochs.


## References

http://www.nltk.org/api/nltk.tokenize.html

http://www.nltk.org/book/ch02.html

http://www.nltk.org/book/ch05.html

http://www.nltk.org/book/ch07.html