In [9]:
import nltk
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## 两个句子的相似度

In [2]:
def sentence_similarity(sent1, sent2, stopword):
    """Return the cosine similarity of two tokenised sentences.

    Each sentence is turned into a bag of lower-cased words with stopwords
    removed, and the cosine of the angle between the two word-count vectors
    is returned (the same quantity as ``1 - cosine_distance(v1, v2)``).

    Args:
        sent1: Iterable of word strings for the first sentence.
        sent2: Iterable of word strings for the second sentence.
        stopword: Container of words to ignore. A word is ignored when
            either its original or its lower-cased form is in the container.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned when either sentence has no
        words left after stopword removal.
    """
    def count_words(tokens):
        counts = {}
        for token in tokens:
            lowered = token.lower()
            # Test the lower-cased form as well: stopword lists are usually
            # lower-case, so a capitalised "The" would otherwise slip through.
            if token in stopword or lowered in stopword:
                continue
            counts[lowered] = counts.get(lowered, 0) + 1
        return counts

    counts1 = count_words(sent1)
    counts2 = count_words(sent2)

    # A sentence without any remaining word has a zero-length vector; the
    # cosine would be 0/0, so define the similarity as 0 instead of NaN.
    if not counts1 or not counts2:
        return 0.0

    # Only words present in both sentences contribute to the dot product.
    dot_product = sum(count * counts2.get(word, 0) for word, count in counts1.items())
    norm1 = math.sqrt(sum(count * count for count in counts1.values()))
    norm2 = math.sqrt(sum(count * count for count in counts2.values()))

    return dot_product / (norm1 * norm2)

## 计算句子-句子相似矩阵

In [3]:
def build_similarity_matrix(sentences, stopword):
    """Build the square sentence-to-sentence similarity matrix.

    Entry ``[a][b]`` is ``sentence_similarity(sentences[a], sentences[b],
    stopword)`` for ``a != b``; the diagonal is left at zero.
    """
    size = len(sentences)
    matrix = np.zeros((size, size))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stopword)

    return matrix

## 手动计算每个句子的 PageRank 值

In [4]:
def sentences_pagerank(similarity_matrix):
    """Iterate PageRank scores over the similarity matrix until they settle.

    Every node starts with a score of 0.5. On each pass the current scores
    are saved, then each node's score is recomputed in index order with
    ``update_score`` (which reads the list being updated). Iteration stops
    when ``isUpdate`` reports no remaining change, and the final list of
    scores is returned.
    """
    node_count = len(similarity_matrix)
    current = [0.5] * node_count
    previous = [0.0] * node_count

    while isUpdate(current, previous):
        # Snapshot the scores before this pass so the next check can compare.
        previous[:] = current
        for node in range(node_count):
            current[node] = update_score(similarity_matrix, current, node)

    return current

In [5]:
def isUpdate(new_scores, old_scores):
    """Report whether the PageRank scores are still changing.

    The scores are treated as stable once every entry differs from its
    previous value by less than 0.0001; in that case False is returned.
    True is returned as soon as one entry differs by 0.0001 or more.
    """
    return any(
        math.fabs(score - old_scores[idx]) >= 0.0001
        for idx, score in enumerate(new_scores)
    )

In [6]:
def update_score(weight_graph, scores, i, d=0.85):
    """Compute the weighted PageRank score of node ``i``.

    Each node ``j`` contributes ``weight_graph[j][i] * scores[j]`` divided by
    the sum of row ``j`` of ``weight_graph``; the contributions are summed and
    combined as ``(1 - d) + d * total``.

    Args:
        weight_graph: Square matrix (sequence of rows) of edge weights.
        scores: Current score of every node, indexed like ``weight_graph``.
        i: Index of the node whose score is recomputed.
        d: Damping factor; defaults to 0.85.

    Returns:
        The new score of node ``i``.
    """
    added_score = 0.0

    for j in range(len(weight_graph)):
        out_weight = sum(weight_graph[j])
        # A node with no outgoing weight has nothing to distribute; skipping
        # it avoids a 0/0 division that would turn every score into NaN.
        if out_weight == 0:
            continue
        added_score += weight_graph[j][i] * scores[j] / out_weight

    return (1 - d) + d * added_score

In [10]:
def generate_summary(file_name, top_n=5):
    """Print an extractive summary of the text file at ``file_name``.

    The text is split into sentences on '. ', each sentence is split into
    words, the sentences are scored with ``sentences_pagerank`` over the
    matrix returned by ``build_similarity_matrix``, and the best-scoring
    sentences are printed, highest score first.

    Args:
        file_name: Path of the text file to summarise.
        top_n: Maximum number of sentences in the summary. When the text has
            fewer sentences, all of them are used.

    Returns:
        None. The summary is written to standard output.
    """
    stopword = set(stopwords.words('english') + list(punctuation))

    # Use a context manager so the file handle is closed deterministically.
    with open(file_name, "r") as source:
        data = source.read().replace('\n', ' ')

    # Split into sentences, then into words. split() without an argument
    # drops the empty "words" that runs of spaces would otherwise produce;
    # sentences without any word are skipped.
    sentences = []
    for sen in data.split('. '):
        words = sen.split()
        if words:
            sentences.append(words)

    # Turn the sentences into vectors and compute their pairwise similarity.
    sentence_similarity_matrix = build_similarity_matrix(sentences, stopword)

    # 1. Compute the PageRank value of every sentence by hand.
    scores = sentences_pagerank(sentence_similarity_matrix)

    # 2. Alternative: let networkx compute the PageRank value of every node
    #    of the similarity graph instead:
    #
    #    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    #    scores = nx.pagerank(sentence_similarity_graph)

    # Sort the sentences by PageRank value, best first.
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Keep at most top_n sentences; slicing cannot run past the end of the
    # list, and max() keeps a negative top_n from selecting anything.
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]

    print("Summarize: \n", ". ".join(summarize_text))

In [11]:
# Summarise the article stored in ./data.txt using the default top_n.
generate_summary("./data.txt")

Summarize: 
 You can read more about installing beta software on iOS and Apple TV devices on Apple's support page. The new 12.3 betas include the new Apple TV app which is scheduled to officially launch in May. Apple has released the first beta of iOS 12.3 and tvOS 12.3 to developers. The developments comes just a few days since it released iOS 12.2 which came with support for Apple News+ and new Animoji. For those of you with Apple's third-generation set-top box, the firm has released a beta version of Apple TV Software 7.3 which can be downloaded and installed via iTunes
