In [1]:
from file_storage import FileStorage

In [2]:
storage = FileStorage('../storage')

In [3]:
filtered_storage = FileStorage('filtered_storage')

In [4]:
beginning = 'https://simple.wikipedia.org/wiki/'
ban_patterns = [
    'Help', 'Help_talk', 'Wikipedia', 'Special', 'File', 'Template', 'Talk',
    'Template_talk', 'User_talk', 'User', 'Meta', 'user', 'MediaWiki', 'MediaWiki_talk',
    'Wikipedia_talk', 'Category_talk', 'Module', 'Media', 'Category', 
]

def filter_url(url):
    url_end = url[len(beginning):]
    return any(url_end.startswith(ban_pattern + ':') for ban_pattern in ban_patterns)

In [5]:
for ind, (url, page) in enumerate(storage.items()):
    if not filter_url(url):
        filtered_storage.write(url, page)
    if ind > 2000:
        break

In [6]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links_from_html(url, html):
    parser = BeautifulSoup(html)
    return [
        urljoin(url, link.get('href'))
        for link in parser.findAll('a')
    ]

In [7]:
from collections import defaultdict

In [22]:
def calc_pagerank(storage, iterations=20, delta=0.1, prev_pagerank=None):
    if prev_pagerank is None:
        prev_pagerank = defaultdict(lambda: 1 / len(storage))
    neighbors = {
        url: [link for link in extract_links_from_html(url, page) if link in storage]
        for url, page in storage.items()
    }
    for _ in range(iterations):
        pagerank = defaultdict(lambda: delta)
        for url in storage.keys():
            for neighbor_url in neighbors[url]:
                pagerank[neighbor_url] += prev_pagerank[url] / len(neighbors) * (1 - delta)
        prev_pagerank = pagerank
    max_rank = max(pagerank.values())
    return {url: rank / max_rank for url, rank in pagerank.items()}

In [23]:
pagerank = calc_pagerank(filtered_storage)

In [26]:
sorted_pagerank = sorted(pagerank.items(), key=lambda x: -x[1])

In [27]:
for ind, (url, rank) in enumerate(sorted_pagerank):
    print("{:3<} {:.5} {}".format(ind, rank, url))
    if ind == 19:
        break

0 1.0 https://simple.wikipedia.org/wiki/Main_Page
1 0.48721 https://simple.wikipedia.org/wiki/International_Standard_Book_Number
2 0.38554 https://simple.wikipedia.org/wiki/Netherlands
3 0.3835 https://simple.wikipedia.org/wiki/India
4 0.38255 https://simple.wikipedia.org/wiki/World_War_II
5 0.38156 https://simple.wikipedia.org/wiki/Democratic_Party_(United_States)
6 0.37829 https://simple.wikipedia.org/wiki/1960
7 0.37791 https://simple.wikipedia.org/wiki/June
8 0.37717 https://simple.wikipedia.org/wiki/April
9 0.37714 https://simple.wikipedia.org/wiki/1979
10 0.37657 https://simple.wikipedia.org/wiki/Turkey
11 0.37641 https://simple.wikipedia.org/wiki/1928
12 0.37619 https://simple.wikipedia.org/wiki/Greece
13 0.37612 https://simple.wikipedia.org/wiki/Singer-songwriter
14 0.37586 https://simple.wikipedia.org/wiki/1940
15 0.37546 https://simple.wikipedia.org/wiki/1994
16 0.37469 https://simple.wikipedia.org/wiki/Belgium
17 0.37437 https://simple.wikipedia.org/wiki/1968
18 0.37229 http