In [1]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
# English stop words, kept as a set because they are only ever used for
# membership tests (`word in stop_words`) inside per-word loops.
# Requires the NLTK "stopwords" corpus to be installed locally
# (e.g. via `nltk.download('stopwords')`).
stop_words = set(stopwords.words('english'))

In [3]:
def get_sentences(text):
    """Split ``text`` into sentences and tokenize each sentence on whitespace.

    Sentences are delimited by a period followed by a space. Punctuation
    inside a sentence (commas, hyphens, ...) is kept on the tokens so that the
    original wording can be rebuilt by joining the tokens with spaces.

    Args:
        text: The document to split, as a single string.

    Returns:
        A list of sentences, each one a list of word strings. Sentences that
        contain no tokens (empty input, trailing ". ") are omitted.
    """
    sentences = []
    for raw_sentence in text.split('. '):
        # Only the final fragment still carries the document's closing period;
        # drop it so that re-joining sentences with ". " never produces "..".
        # (The previous `str.replace("[^a-zA-Z]", " ")` call was removed: it
        # treated the pattern as a literal substring, so it never cleaned
        # anything.)
        words = raw_sentence.strip().rstrip('.').split()
        if words:
            sentences.append(words)
    return sentences

In [4]:
def sentence_similarity(sentence_1, sentence_2, ignored_words=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words found in ``ignored_words`` are not
    counted.

    Args:
        sentence_1: First sentence, as an iterable of word strings.
        sentence_2: Second sentence, as an iterable of word strings.
        ignored_words: Optional collection of lower-case words to skip.
            Defaults to the notebook-level ``stop_words``.

    Returns:
        A float: the cosine of the angle between the two count vectors, or
        ``0.0`` when either sentence has no counted words (the cosine is
        undefined there and would otherwise come out as NaN).
    """
    if ignored_words is None:
        ignored_words = stop_words
    if not isinstance(ignored_words, (set, frozenset)):
        # Constant-time membership tests inside the counting loops.
        ignored_words = set(ignored_words)

    words_1 = [w.lower() for w in sentence_1]
    words_2 = [w.lower() for w in sentence_2]

    # Map each distinct word to a vector position once, instead of scanning a
    # list with `.index()` for every occurrence.
    position = {}
    for word in words_1 + words_2:
        position.setdefault(word, len(position))

    vector_1 = np.zeros(len(position))
    vector_2 = np.zeros(len(position))

    for word in words_1:
        if word not in ignored_words:
            vector_1[position[word]] += 1

    for word in words_2:
        if word not in ignored_words:
            vector_2[position[word]] += 1

    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    if norm_product == 0:
        # At least one sentence is empty or consists only of ignored words.
        return 0.0
    return float(np.dot(vector_1, vector_2) / norm_product)

In [5]:
def build_similarity_matrix(sentences, similarity=None):
    """Build the pairwise similarity matrix for a list of tokenized sentences.

    Args:
        sentences: Sequence of sentences, each a list of word strings.
        similarity: Optional callable ``(sentence_a, sentence_b) -> number``
            used to score a pair. Defaults to ``sentence_similarity``. It is
            assumed to be symmetric in its arguments.

    Returns:
        A square ``numpy`` array of shape ``(len(sentences), len(sentences))``
        whose ``[i, j]`` entry is the similarity of sentences ``i`` and ``j``.
        The diagonal is left at zero.
    """
    if similarity is None:
        similarity = sentence_similarity

    count = len(sentences)
    similarity_matrix = np.zeros((count, count))
    for idx1 in range(count):
        # Score each unordered pair once and mirror it: this halves the work
        # and guarantees the matrix is exactly symmetric.
        for idx2 in range(idx1 + 1, count):
            score = similarity(sentences[idx1], sentences[idx2])
            similarity_matrix[idx1, idx2] = score
            similarity_matrix[idx2, idx1] = score
    return similarity_matrix

In [6]:
def generate_summary(text, ratio=0.5):
    """Produce an extractive summary of ``text``.

    The text is split into sentences, a sentence-similarity graph is built,
    and the sentences are ranked with PageRank. The highest-ranked sentences
    are returned, ordered from highest to lowest score.

    Args:
        text: The document to summarize.
        ratio: Fraction of the sentences to keep (rounded down). At least one
            sentence is always kept when the text contains any.

    Returns:
        The selected sentences joined with ". " and terminated by ".", or an
        empty string when ``text`` contains no sentences.
    """
    # Ignore sentences that contain no real token (e.g. from empty input).
    sentences = [s for s in get_sentences(text) if any(s)]
    if not sentences:
        return ""

    # Keep at least one sentence so short inputs do not collapse to ".".
    top_n = min(len(sentences), max(1, int(len(sentences) * ratio)))

    # A NaN similarity (undefined cosine) would stop PageRank from
    # converging, so treat it as "no similarity".
    sentence_similarity_matrix = np.nan_to_num(build_similarity_matrix(sentences))
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)

    scores = nx.pagerank(sentence_similarity_graph)
    # Rank on the score alone; ties keep their original document order.
    ranked_indices = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)

    # Strip a sentence's own trailing period so the join below never yields "..".
    summarize_text = [" ".join(sentences[i]).rstrip(".") for i in ranked_indices[:top_n]]

    return ". ".join(summarize_text) + "."

In [7]:
# Demo input: a sample seven-sentence paragraph to run the summarizer on.
sample_text = """An approach to semi-supervised learning is proposed that is based on a Gaussian random field model. Labeled and unlabeled data are represented as vertices in a weighted graph, with edge weights encoding the similarity between instances. The learning problem is then formulated in terms of a Gaussian random field on this graph, where the mean of the field is characterized in terms of harmonic functions, and is efficiently obtained using matrix methods or belief propagation. The resulting learning algorithms have intimate connections with random walks, electric networks, and spectral graph theory. We discuss methods to incorporate class priors and the predictions of classifiers obtained by supervised learning. We also propose a method of parameter learning by entropy minimization, and show the algorithm\u2019s ability to perform feature selection. Promising experimental results are presented for synthetic data, digit classification, ands text classification tasks."""

# Print a heading, a 50-character rule, and then the generated summary.
print('Summary')
print('-'*50)
print(generate_summary(sample_text))

Summary
--------------------------------------------------
The learning problem is then formulated in terms of a Gaussian random field on this graph, where the mean of the field is characterized in terms of harmonic functions, and is efficiently obtained using matrix methods or belief propagation. An approach to semi-supervised learning is proposed that is based on a Gaussian random field model. The resulting learning algorithms have intimate connections with random walks, electric networks, and spectral graph theory.
