# 链接分析

## 导入文件

In [1]:
import re
import time
import os
import csv
import json
import pickle
from typing import List, Dict, Tuple

In [2]:
with open("page_info.json", "r", encoding='utf-8') as f:
    url_info : Dict = json.load(f)

In [3]:
with open("link_graph.pk", "rb") as f:
    link_graph : Dict[int, List[int]] = pickle.load(f)

In [4]:
with open("num2link.pk", "rb") as f:
    num2link : Dict[int, str] = pickle.load(f)

## 计算链接

先为每个链接分配编号，将图中的顶点转换为编号

In [5]:
num2id = {}
i = 0
for num in num2link:
    num2id[num] = i
    i += 1

In [6]:
link_num = i

In [7]:
id_graph = {}
for key, value in link_graph.items():
    temp = []
    for to_num in value:
        temp.append(num2id[to_num])
    id_graph[num2id[key]] = temp

维护两个数组，用于计算当前状态和下一状态的`PageRank`。

In [14]:
iter = 100
p = 0.85
curr_pageRank = [1 / link_num] * link_num
next_pageRank = [0] * link_num
random_jump = (1 - p) / link_num

In [15]:
for i in range(iter):
    for key, value in id_graph.items():
        if len(value) == 0:
            continue
        step = curr_pageRank[key] / len(value)
        for i in value:
            next_pageRank[i] += step

    curr_pageRank = next_pageRank
    next_pageRank = [0] * link_num
    for i, n in enumerate(curr_pageRank):
        # if n == 0:
        #     curr_pageRank[i] = 1 / link_num
        curr_pageRank[i] = curr_pageRank[i] * p + random_jump

In [12]:
curr_pageRank[:10]

[0.0008203634487337186,
 0.0005754530347213719,
 6.495309414156298e-05,
 0.0011902521225991888,
 9.838135739914219e-05,
 0.00013628247244336223,
 0.00010884854144308217,
 0.00013479594759141714,
 7.953003503214826e-05,
 7.155839626656156e-05]

In [22]:
# with open("test.json", "w", encoding="utf-8") as f:
#     json.dump(PageRank, f, ensure_ascii=False, indent=4)

将`PageRank`存入页面信息中：

In [23]:
for num, id in num2id.items():
    link = num2link[num]
    url_info[link]["page_rank"] = curr_pageRank[id]

In [24]:
with open("page_info.json", "w", encoding="utf-8") as f:
    json.dump(url_info, f, ensure_ascii=False, indent=4)