In [1]:
#import necessary libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse.linalg import svds
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
import unicodedata
import numpy as np
import networkx

In [2]:
def build_feature_matrix(documents, feature_type='frequency'):
    
    """function to build document term matrix based on bag of words features"""

    feature_type = feature_type.lower().strip()  
    
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, 
                                     ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    
    return vectorizer, feature_matrix


In [3]:
def low_rank_svd(matrix, singular_count=2):
    
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt

In [4]:
stopword_list = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()

In [5]:
def tokenize_text(text): #to tokenizewords
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

In [6]:
from pattern.en import tag
from nltk.corpus import wordnet as wn


def pos_tag_text(text): # add POS tags
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

In [7]:
def lemmatize_text(text):
    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word                     
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [8]:
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [9]:
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [10]:
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    
    """ function to normalize text"""
    
    normalized_corpus = []  
    for text in corpus:
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus

In [11]:
def parse_document(document):
    document = re.sub('\n', ' ', document)
    if isinstance(document, str):
        document = document
    elif isinstance(document, unicode):
        return unicodedata.normalize('NFKD', document).encode('ascii', 'ignore')
    else:
        raise ValueError('Document is not string or unicode!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

In [12]:
a = """
Elephants are large mammals of the family Elephantidae
and the order Proboscidea. Two species are traditionally recognised,
the African elephant and the Asian elephant. Elephants are scattered
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male
African elephants are the largest extant terrestrial animals. All
elephants have a long trunk used for many purposes,
particularly breathing, lifting water and grasping objects. Their
incisors grow into tusks, which can serve as weapons and as tools
for moving objects and digging. Elephants' large ear flaps help
to control their body temperature. Their pillar-like legs can
carry their great weight. African elephants have larger ears
and concave backs while Asian elephants have smaller ears
and convex or level backs.
"""

In [13]:
sentences = parse_document(a)

In [14]:
norm_sentences = normalize_corpus(sentences,lemmatize=True)

In [15]:
vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency') # build document term matrix based on bag of words features

In [16]:
total_sentences = len(norm_sentences)

In [17]:
total_sentences

9

In [18]:
td_matrix = dt_matrix.transpose() # convert to term document matrix
td_matrix = td_matrix.multiply(td_matrix > 0)

In [19]:
num_topics = 2
num_sentences = 3

In [20]:
# get low rank SVD components
u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

In [21]:
# remove singular values below threshold
sv_threshold = 0.7
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0

In [22]:
# compute salience scores for all sentences in document
salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))

In [23]:
# print salience score for each sentence
print (np.round(salience_scores, 2))

[2.02 2.07 1.53 1.4  2.1  4.48 0.7  1.2  4.65]


In [24]:
# rank sentences based on their salience scores
top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()

In [25]:
print (top_sentence_indices)

[4 5 8]


**TextRank**

In [26]:
def textrank_text_summarizer(documents, num_top_sentences=2, feature_type='frequency'):
    
    vec, dt_matrix = build_feature_matrix(norm_sentences,
    feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)
    ranked_sentences = sorted(((score, index) for index, score in scores.items()), reverse=True)
    top_sentence_indices = [ranked_sentences[index][1] for index in range(num_top_sentences)]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        print (sentences[index])

In [27]:
textrank_text_summarizer(norm_sentences, num_top_sentences=3,feature_type='frequency')

Two species are traditionally recognised, the African elephant and the Asian elephant.
Male African elephants are the largest extant terrestrial animals.
African elephants have larger ears and concave backs while Asian elephants have smaller ears and convex or level backs.


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
