<a href="https://colab.research.google.com/github/mowillia/phantom_pen/blob/master/text_summarizer_1_word_ct_cos_sim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Text Summarizer (#1) - Word Count and Cosine Similarity
**(June 17, 2019)**

Extractive Text Summarizer described in https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70

In [1]:
#!/usr/bin/env python
# coding: utf-8

import nltk
import textwrap

import nltk.data # natural language tool kit

# for tokenizing sentences according by the words
from nltk.tokenize import WhitespaceTokenizer

from nltk.tokenize import sent_tokenize, word_tokenize # $ pip install nltk
nltk.download('punkt')

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
filename  = '/content/sample_essay.txt'

In [0]:
## Function that outputs paragraphs from text file
def text_to_para(filename):
    
    para_list = open(filename).read().splitlines()
    
    para_list[:] = (value for value in para_list if value != '')
    
    return para_list

## function that outputs the sentences in a paragraph
def sents(para): 
    
    return sent_tokenize(para)

### function takes in a file and outputs a sentence length trajectory

## vector of sentences in a piece 
def raw_sents(filename):
    
    sent = []
    
    paragraphs = text_to_para(filename)[:]
    
    for paragraph in paragraphs:
        sent += sents(paragraph)
        
    return sent

In [4]:
raw_sents(filename)[55]

'"If you listen to Yunior on where you should put your eye on the text, you will miss the whole book.'

In [0]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  raw_sents(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_matrix(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
        summarize_text.append("".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n",textwrap.fill(" ".join(summarize_text), 50))

In [6]:
generate_summary(filename, 5)

Summarize Text: 
 And although my appreciation of her writing
tempered as I grew older, unlike much of the
culture which now categorically vilifies Rand, I
still saw a considerable potency and relevance in
what she had written. Most learned truths about
the world are confused and complicated, bearing
Bohr’s hallmark of a deep truth in which even
their seemingly antithetical statements are also
somehow true of the world. In explaining his work,
Díaz said the real story of A Brief and Wondrous
Life of Oscar Wao could not be gleaned by
following precisely where Yunior led or to where
the intellectual insecurity Yunior deliberately
creates in the reader may push you [2]: And so
even after I grew out (or, thought I grew out) of
the self-absorption which initially attached me to
her fiction, I still remained intrigued by the
effect it had on people. It rather has more to do
with how people understand themselves and the need
for that understanding to be expressed, if not
necessarily according