In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
# Similarity between two sentences
def sentence_similarity(sent1, sent2):
    """Return the cosine similarity of two tokenized sentences.

    Each sentence is lower-cased and turned into a word-count vector over
    the combined vocabulary of both sentences; the result is the cosine of
    the angle between the two vectors.

    Args:
        sent1: Iterable of word strings for the first sentence.
        sent2: Iterable of word strings for the second sentence.

    Returns:
        float: 1.0 when the word counts point in the same direction, 0.0
        when the sentences share no words. Also 0.0 when either sentence
        has no words, where the cosine is undefined (a NaN here would
        propagate into any matrix built from these scores).
    """
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Map each distinct word to its vector slot once, so lookups are O(1)
    # instead of a linear list.index() scan per token.
    slot_of = {word: slot for slot, word in enumerate(set(sent1 + sent2))}

    vector1 = np.zeros(len(slot_of))
    vector2 = np.zeros(len(slot_of))

    for w in sent1:
        vector1[slot_of[w]] += 1

    for w in sent2:
        vector2[slot_of[w]] += 1

    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))
    if norm_product == 0:
        # At least one sentence is empty: avoid dividing by zero.
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)

In [3]:
def build_similarity_matrix(sentences):
    """Build the pairwise similarity matrix for a list of tokenized sentences.

    Args:
        sentences: Sequence of sentences, each a list of word strings.

    Returns:
        A square numpy array with one row and column per sentence. Entry
        ``[i][j]`` is ``sentence_similarity(sentences[i], sentences[j])``;
        the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is not compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right)

    return matrix

In [36]:
def generate_summary(file_name, top_n=5):
    """Print an extractive summary of a text file.

    The text is split into sentences on ". ", pairwise sentence similarities
    are computed with ``build_similarity_matrix``, and PageRank over the
    resulting weighted graph selects the highest-scoring sentences.

    Args:
        file_name: Path of the text file to summarize.
        top_n: Maximum number of sentences to include. When the text has
            fewer sentences, all of them are used.

    Returns:
        None. The graph edges and the summary are printed.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(file_name, "r") as text_file:
        data = text_file.read().replace('\n', ' ')

    # Split into sentences, then into words. Blank fragments (for example the
    # empty tail left when the text ends with ". ") are not sentences and
    # would otherwise be ranked and could show up in the summary.
    sentences = [sen.split(" ") for sen in data.split('. ') if sen.strip()]

    if not sentences:
        # Nothing to rank; emit an empty summary instead of building a graph.
        print("Summarize: \n", "")
        return

    # Turn sentences into vectors and compute their pairwise similarity
    sentence_similarity_matrix = build_similarity_matrix(sentences)

    # Build the graph from the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    print(sentence_similarity_graph.edges(data=True))

    # Compute the PageRank score of every node in the graph
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Assemble the final result. Slicing caps the selection at the number of
    # available sentences, so a large top_n no longer raises IndexError;
    # max() keeps a negative top_n selecting nothing, as range() did.
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]

    print("Summarize: \n", ". ".join(summarize_text))
    

In [37]:
# Summarize the sample text file, using the default number of sentences.
generate_summary("./data.txt")

[(0, 1, {'weight': 0.26650089544451305}), (0, 2, {'weight': 0.31980107453341566}), (0, 3, {'weight': 0.21821789023599236}), (0, 4, {'weight': 0.3127716210856122}), (0, 5, {'weight': 0.1147078669352809}), (0, 6, {'weight': 0.22360679774997894}), (0, 7, {'weight': 0.3592106040535499}), (0, 8, {'weight': 0.22360679774997894}), (0, 9, {'weight': 0.24333213169614387}), (0, 10, {'weight': 0.06933752452815367}), (1, 2, {'weight': 0.2727272727272727}), (1, 3, {'weight': 0.23262105259961774}), (1, 4, {'weight': 0.22227711223719349}), (1, 5, {'weight': 0.14673479641335552}), (1, 6, {'weight': 0.19069251784911845}), (1, 7, {'weight': 0.30633583242699347}), (1, 8, {'weight': 0.1430193883868388}), (1, 9, {'weight': 0.27668578554642986}), (1, 10, {'weight': 0.059131239598908314}), (2, 3, {'weight': 0.37219368415938836}), (2, 4, {'weight': 0.3111879571320709}), (2, 5, {'weight': 0.19564639521780736}), (2, 6, {'weight': 0.09534625892455917}), (2, 7, {'weight': 0.1914598952668709}), (2, 8, {'weight': 0