In [1]:
from file_storage import FileStorage
from collections import defaultdict
import tqdm

In [2]:
storage = FileStorage('../storage')

In [3]:
filtered_storage = FileStorage('../filtered_storage')

Отлифльтруем сторадж, полученный в прошлом задании

In [4]:
beginning = 'https://simple.wikipedia.org/wiki/'
ban_patterns = [
    'Help', 'Help_talk', 'Wikipedia', 'Special', 'File', 'Template', 'Talk',
    'Template_talk', 'User_talk', 'User', 'Meta', 'user', 'MediaWiki', 'MediaWiki_talk',
    'Wikipedia_talk', 'Category_talk', 'Module', 'Media', 'Category', 
]

def filter_url(url):
    url_end = url[len(beginning):]
    return any(url_end.startswith(ban_pattern + ':') for ban_pattern in ban_patterns)

In [5]:
for url, page in storage.items():
    if not filter_url(url):
        filtered_storage.write(url, page)

Сторим соседей для каждого урла: извлекаем другие урлы из html, применяем к ним urldefrag и проверяем, что url находится в storage, то есть принадлежит к нашему графу.

In [6]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urldefrag

def extract_links_from_html(url, html):
    parser = BeautifulSoup(html)
    return [
        urldefrag(urljoin(url, link.get('href'))).url
        for link in parser.findAll('a')
    ]

In [8]:
def build_neighbors(storage):
    result = {}
    for url, page in tqdm.tqdm(storage.items()):
        result[url] = [
            link for link in extract_links_from_html(url, page)
            if link in storage
        ]
    return result

Вычислим pagerank

In [19]:
def normilize_pagerank(pagerank):
    max_rank = max(pagerank.values())
    return {url: rank / max_rank for url, rank in pagerank.items()}


def calc_pagerank(storage, iterations=20, delta=0.1, prev_pagerank=None, neighbors=None):
    if prev_pagerank is None:
        prev_pagerank = defaultdict(lambda: 1 / len(storage))
    if neighbors is None:
        neighbors = build_neighbors(storage)

    for ind in range(iterations):
        print('{} iteration, India rank: {}'.format(
            ind, prev_pagerank['https://simple.wikipedia.org/wiki/India']
        ))
        pagerank = defaultdict(lambda: delta)
        for url in storage.keys():
            for neighbor_url in neighbors[url]:
                pagerank[neighbor_url] += prev_pagerank[url] / len(neighbors) * (1 - delta)
        prev_pagerank = normilize_pagerank(pagerank)
    return prev_pagerank

In [10]:
neighbors = build_neighbors(filtered_storage)

151127it [1:00:03, 41.94it/s]


In [13]:
import pickle
with open('neighbors.pkl', 'wb') as f_out:
    pickle.dump(neighbors, f_out)

In [20]:
pagerank = calc_pagerank(filtered_storage, neighbors=neighbors)

0 iteration, India rank: 6.61695130585534e-06
1 iteration, India rank: 0.9998837565569197
2 iteration, India rank: 0.07561958666173003
3 iteration, India rank: 0.5244410349795183
4 iteration, India rank: 0.11969777718198288
5 iteration, India rank: 0.37737983824209537
6 iteration, India rank: 0.1541345572981528
7 iteration, India rank: 0.311416860969089
8 iteration, India rank: 0.17921224047126483
9 iteration, India rank: 0.2770313844149672
10 iteration, India rank: 0.19655436199770773
11 iteration, India rank: 0.2577005784515863
12 iteration, India rank: 0.20812264917378712
13 iteration, India rank: 0.24636982801133658
14 iteration, India rank: 0.21565508353137788
15 iteration, India rank: 0.23956519885937994
16 iteration, India rank: 0.2204827325576716
17 iteration, India rank: 0.2354190248469218
18 iteration, India rank: 0.22354556369840317
19 iteration, India rank: 0.2328703364303573


In [22]:
pagerank = calc_pagerank(filtered_storage, neighbors=neighbors, prev_pagerank=pagerank)

0 iteration, India rank: 0.22547622228433256
1 iteration, India rank: 0.23129514282086075
2 iteration, India rank: 0.22668826437635
3 iteration, India rank: 0.2303183537684309
4 iteration, India rank: 0.22744722242311402
5 iteration, India rank: 0.2297113854935061
6 iteration, India rank: 0.22792170590075722
7 iteration, India rank: 0.22933373572829416
8 iteration, India rank: 0.22821804482450017
9 iteration, India rank: 0.22909857783823864
10 iteration, India rank: 0.22840300751052484
11 iteration, India rank: 0.228952075013358
12 iteration, India rank: 0.22851840852999267
13 iteration, India rank: 0.2288607757935349
14 iteration, India rank: 0.22859039139919576
15 iteration, India rank: 0.22880386796403082
16 iteration, India rank: 0.22863528480277076
17 iteration, India rank: 0.2287683924034312
18 iteration, India rank: 0.2286632807176408
19 iteration, India rank: 0.2287462757655117


Отсортируем и сохраним результат

In [23]:
sorted_pagerank = sorted(pagerank.items(), key=lambda x: -x[1])

In [24]:
for ind, (url, rank) in enumerate(sorted_pagerank):
    print("{:>3} {:>5.4} {}".format(ind, rank, url))
    if ind == 19:
        break

  0   1.0 https://simple.wikipedia.org/wiki/Main_Page
  1 0.3121 https://simple.wikipedia.org/wiki/United_States
  2 0.3093 https://simple.wikipedia.org/wiki/International_Standard_Book_Number
  3 0.2784 https://simple.wikipedia.org/wiki/France
  4 0.2545 https://simple.wikipedia.org/wiki/Germany
  5 0.2526 https://simple.wikipedia.org/wiki/List_of_Wikipedias
  6 0.2502 https://simple.wikipedia.org/wiki/United_Kingdom
  7 0.2454 https://simple.wikipedia.org/wiki/Japan
  8 0.2437 https://simple.wikipedia.org/wiki/Canada
  9 0.2429 https://simple.wikipedia.org/wiki/Italy
 10 0.2406 https://simple.wikipedia.org/wiki/Geographic_coordinate_system
 11 0.2393 https://simple.wikipedia.org/wiki/England
 12 0.2366 https://simple.wikipedia.org/wiki/Americans
 13 0.236 https://simple.wikipedia.org/wiki/Spain
 14 0.2359 https://simple.wikipedia.org/wiki/Departments_of_France
 15 0.233 https://simple.wikipedia.org/wiki/Digital_object_identifier
 16 0.2325 https://simple.wikipedia.org/wiki/Communes_o

In [25]:
with open('pagerank_results.txt', 'w') as f_out:
    for ind, (url, rank) in enumerate(sorted_pagerank):
        f_out.write("{:>6} {:>7.6} {}\n".format(ind + 1, rank, url))