In [20]:
from urllib.parse import urlparse
from urllib.parse import urldefrag
import time

In [14]:
from urllib.request import urlopen

def download_from_the_internet(url):
    try:
        return urlopen(url).read().decode('utf-8')
    except KeyboardInterrupt:
        raise
    except:
        return None

start_url = 'https://simple.wikipedia.org/wiki/Main_Page'

In [3]:
from file_storage import FileStorage

file_storage = FileStorage('storage')

In [4]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links_from_html(url, html):
    parser = BeautifulSoup(html)
    return [urljoin(url, link.get('href')) for link in parser.findAll('a')]

In [5]:
import pickle

In [17]:
ban_patterns = ['Help', 'Wikipedia', 'Special', 'File', 'Template']
ban_patterns = [pattern + ':' for pattern in ban_patterns]
urls_to_visit = {start_url}
iteration = 0
end = 0
beginning = 'https://simple.wikipedia.org/wiki/'

def filter_url(url):
    if not url.startswith(beginning):
        return True
    url_end = url[len(beginning):]
    return any(url_end.startswith(ban_pattern) for ban_pattern in ban_patterns)

def download_wiki(iteration, urls_to_visit, need_log=False):
    while urls_to_visit:
        begin = time.time()
        new_url = urls_to_visit.pop()
        new_html = download_from_the_internet(new_url)
        file_storage.write(new_url, new_html)
        urls = extract_links_from_html(new_url, new_html)    
        fresh_urls = [
            urldefrag(url).url
            for url in urls
            if not file_storage.contains(url) and not filter_url(urldefrag(url).url)
        ]
        urls_to_visit.update(fresh_urls)
        end = time.time()
        if need_log:
            print(iteration, new_url, len(fresh_urls), end - begin)
        iteration += 1
        if iteration % 10 == 0:
            with open('state', 'wb') as state_file:
                pickle.dump((iteration, urls_to_visit), state_file)
                print('state saved')
        

In [None]:
def load_state():
    with open('state', 'rb') as state_file:
        return pickle.load(state_file)

In [None]:
download_wiki(iteration, urls_to_visit)

In [18]:
iteration, urls_to_visit = load_state()
download_wiki(iteration, urls_to_visit, need_log=True)

0 https://simple.wikipedia.org/wiki/Main_Page 123
1 https://simple.wikipedia.org/wiki/Theology 19
2 https://simple.wikipedia.org/wiki/Physics 262
3 https://simple.wikipedia.org/wiki/Education 52
4 https://simple.wikipedia.org/wiki/History_of_the_United_States 1543
5 https://simple.wikipedia.org/wiki/Heat 61
6 https://simple.wikipedia.org/wiki/History_of_North_America 148
7 https://simple.wikipedia.org/wiki/Whig_Party_(United_States) 45
8 https://simple.wikipedia.org/wiki/Alien_and_Sedition_Acts 17
9 https://simple.wikipedia.org/wiki/Project_Gutenberg 21
state saved
10 https://simple.wikipedia.org/wiki/Fur_Trade 37
11 https://simple.wikipedia.org/wiki/File:At_close_grips2.jpg 8
12 https://simple.wikipedia.org/wiki/Max_Planck 150
13 https://simple.wikipedia.org/wiki/Wrigley_Field 288
14 https://simple.wikipedia.org/wiki/Artist 105
15 https://simple.wikipedia.org/wiki/Guam 157
16 https://simple.wikipedia.org/wiki/Algebra 88
17 https://simple.wikipedia.org/wiki/John_Cabot 67
18 https://sim

KeyboardInterrupt: 