In [1]:
import urllib
import requests

In [2]:
from urllib.parse import urlparse
from urllib.parse import urldefrag
import time

In [3]:
from urllib.request import urlopen

def download_from_the_internet(url, timeout=30):
    """Fetch `url` and return its body decoded as UTF-8 text.

    Parameters
    ----------
    url : str
        Address to download.
    timeout : float, optional
        Seconds to wait on the connection before giving up. Without a
        timeout a stalled server would block the calling worker forever.

    Returns
    -------
    str or None
        The decoded page, or ``None`` when the download or the decoding
        failed (the error is printed, not raised).
    """
    try:
        # The context manager closes the underlying connection even when
        # read()/decode() raise, so workers do not leak sockets.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates to the caller untouched.
        print(e)
        return None

# Seed page: the initial crawl frontier is built from this single URL.
start_url = 'https://simple.wikipedia.org/wiki/Main_Page'

In [4]:
from file_storage import FileStorage

# Shared page store used by the crawler: pages are saved with
# `file_storage.write(url, html)` and looked up with `url in file_storage`.
# NOTE(review): 'storage' is presumably the backing directory of the
# project-local FileStorage class — confirm against file_storage.py.
file_storage = FileStorage('storage')

In [5]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links_from_html(url, html):
    """Return the absolute target of every ``<a href=...>`` found in `html`.

    Parameters
    ----------
    url : str
        Address the document was downloaded from; relative links are
        resolved against it.
    html : str
        Markup of the page.

    Returns
    -------
    list of str
        One absolute URL per anchor that carries an ``href`` attribute, in
        document order (duplicates are kept).
    """
    # Name the parser explicitly: letting BeautifulSoup guess emits a warning
    # and can give different trees on different machines. 'lxml' is already a
    # dependency of this notebook (see the lxml.html import).
    parser = BeautifulSoup(html, 'lxml')
    # href=True skips anchors without an href; urljoin(url, None) would
    # otherwise hand back the page's own URL as a bogus link.
    return [urljoin(url, link['href']) for link in parser.find_all('a', href=True)]

In [6]:
import pickle
state_file_path = 'state.pickle'

def dump_state(iteration, urls_to_visit):
    """Checkpoint the crawl progress to `state_file_path`.

    The state is first pickled to a temporary sibling file and then moved over
    the real checkpoint with ``os.replace``. Opening the checkpoint itself in
    ``'wb'`` mode would truncate it immediately, so an interrupt or a pickling
    error in the middle of the write would destroy the previous checkpoint.

    Parameters
    ----------
    iteration : int
        Number of pages processed so far.
    urls_to_visit : set
        URLs still waiting to be downloaded.
    """
    import os  # local import: this cell does not import os itself

    tmp_path = state_file_path + '.tmp'
    try:
        with open(tmp_path, 'wb') as state_file:
            pickle.dump((iteration, urls_to_visit), state_file)
        # Swap in the finished file; the old checkpoint survives until here.
        os.replace(tmp_path, state_file_path)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, keep the
        # previous checkpoint, and let the caller see the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    print('state saved, iteration {}'.format(iteration))

def load_state():
    """Read the checkpoint stored at `state_file_path` and return its content.

    Returns whatever object was pickled into the file; `dump_state` stores an
    ``(iteration, urls_to_visit)`` tuple there. Raises if the file is missing.
    """
    # NOTE: unpickling can execute arbitrary code — only load checkpoints that
    # this notebook wrote itself.
    state_file = open(state_file_path, 'rb')
    try:
        restored = pickle.load(state_file)
    finally:
        state_file.close()
    return restored

In [7]:
# Fresh-crawl defaults: a frontier holding only the seed URL and a zeroed
# counter of processed pages. The run cell further down overwrites both with
# the values returned by load_state() when resuming from a checkpoint.
urls_to_visit = {start_url}
iteration = 0

In [8]:
# Only URLs under this prefix are considered article pages.
beginning = 'https://simple.wikipedia.org/wiki/'

def filter_url(url):
    """Tell whether `url` must be skipped by the crawler.

    Returns ``True`` (skip) when the URL does not start with `beginning`, or
    when the part after that prefix starts with one of `ban_patterns`;
    returns ``False`` (keep) otherwise.
    """
    if url.startswith(beginning):
        page_title = url[len(beginning):]
        for banned_prefix in ban_patterns:
            if page_title.startswith(banned_prefix):
                return True
        return False
    return True

In [9]:
from multiprocessing.pool import Pool

In [10]:
import os
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 1 so that `range(cpu_count)` and `cpu_count - 1` stay valid.
cpu_count = os.cpu_count() or 1

In [11]:
import requests
from lxml.html import fromstring
from itertools import cycle

def get_proxy_pools():
    """Scrape free-proxy-list.net and split the proxies into per-worker pools.

    Returns
    -------
    dict
        Maps each index in ``range(cpu_count)`` to an ``itertools.cycle`` over
        that worker's share of ``"ip:port"`` strings. When fewer proxies than
        workers are found, some cycles are empty and ``next()`` on them raises
        ``StopIteration``.

    Raises
    ------
    requests.exceptions.RequestException
        If the listing cannot be downloaded (including a timeout).
    """
    url = 'https://free-proxy-list.net/'
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr'):
        # Keep only rows whose 7th cell contains "yes".
        # NOTE(review): presumably the "Https" column of the site — confirm.
        if row.xpath('.//td[7][contains(text(),"yes")]'):
            # Grabbing IP and corresponding PORT
            ip_address = row.xpath('.//td[1]/text()')[0]
            port = row.xpath('.//td[2]/text()')[0]
            proxies.add(":".join([ip_address, port]))
    # Sort so the split into pools does not depend on set iteration order.
    proxies = sorted(proxies)
    proxy_pools = {
        i: cycle(proxies[i::cpu_count])
        for i in range(cpu_count)
    }
    return proxy_pools

In [12]:
# Title prefixes ("<name>:") of the pages that filter_url() rejects.
ban_patterns = [
    namespace + ':'
    for namespace in (
        'Help', 'Wikipedia', 'Special', 'File',
        'Template', 'Talk', 'Template_talk', 'User_talk',
    )
]

def parse_url(url):
    """Download `url` (at most two tries) and extract the links it contains.

    Sleeps 0.2 s after every download attempt, successful or not.

    Returns
    -------
    tuple or None
        ``(url, html, urls)`` where `urls` is the result of
        `extract_links_from_html`, or ``None`` if both attempts returned no
        page.
    """
    html = None
    attempts_left = 2
    while attempts_left and html is None:
        html = download_from_the_internet(url)
        time.sleep(0.2)
        attempts_left -= 1

    if html is None:
        return None

    return url, html, extract_links_from_html(url, html)

def download_wiki(iteration, urls_to_visit, need_log=False):
    """Crawl every page reachable from `urls_to_visit` using a process pool.

    Each URL is handed to `parse_url` in a worker process. Finished pages are
    written to `file_storage`, and their outgoing links are added to the
    frontier unless they are filtered out or already stored. Progress is
    checkpointed with `dump_state` every 10 processed pages.

    Parameters
    ----------
    iteration : int
        Number of pages already processed (continues counting from here).
    urls_to_visit : set
        Frontier of URLs to download; mutated in place.
    need_log : bool, optional
        Accepted for backward compatibility; currently unused.
    """
    results = []
    with Pool() as pool:
        while urls_to_visit or results:
            # The loop keeps running with an empty frontier in order to drain
            # pending downloads, so only pop when there is something to pop
            # (an unconditional pop raises KeyError on an empty set).
            if urls_to_visit:
                new_url = urls_to_visit.pop()
                results.append(pool.apply_async(parse_url, (new_url,)))

            # Prefer a download that has already finished.
            for ind, result in enumerate(results):
                if result.ready():
                    completed_result = result.get()
                    results.pop(ind)
                    break
            else:
                # Nothing is ready yet: keep feeding the pool while it has
                # spare capacity AND URLs remain; otherwise wait for the
                # oldest pending download.
                if urls_to_visit and len(results) < cpu_count - 1:
                    continue
                completed_result = results[0].get()
                results.pop(0)

            if completed_result is None:
                # Download failed after all retries; nothing to store.
                continue
            new_url, new_html, new_urls = completed_result
            file_storage.write(new_url, new_html)

            # Pages are queued and stored under their fragment-less URL, so
            # the "already stored" check must use that same form; checking
            # the raw link would re-queue "Page#section" for a saved "Page".
            fresh_urls = set()
            for url in new_urls:
                clean_url = urldefrag(url).url
                if not filter_url(clean_url) and clean_url not in file_storage:
                    fresh_urls.add(clean_url)
            urls_to_visit.update(fresh_urls)

            iteration += 1
            if iteration % 10 == 0:
                dump_state(iteration, urls_to_visit)

In [14]:
# Resume the crawl from the checkpoint on disk: load_state() returns the
# (iteration, urls_to_visit) pair that dump_state() last pickled. This
# requires an existing state file; for a first run, use the freshly
# initialised `iteration` / `urls_to_visit` instead of calling load_state().
iteration, urls_to_visit = load_state()
download_wiki(iteration, urls_to_visit, need_log=False)

state saved, iteration 120820
state saved, iteration 120830
state saved, iteration 120840
state saved, iteration 120850
state saved, iteration 120860
state saved, iteration 120870
state saved, iteration 120880
state saved, iteration 120890
state saved, iteration 120900
state saved, iteration 120910
state saved, iteration 120920
state saved, iteration 120930
state saved, iteration 120940
state saved, iteration 120950
state saved, iteration 120960
HTTP Error 404: Not Found
state saved, iteration 120970
HTTP Error 404: Not Found
state saved, iteration 120980
HTTP Error 404: Not Found
HTTP Error 404: Not Found
state saved, iteration 120990
state saved, iteration 121000
state saved, iteration 121010
state saved, iteration 121020
state saved, iteration 121030
state saved, iteration 121040
state saved, iteration 121050
state saved, iteration 121060
state saved, iteration 121070
state saved, iteration 121080
state saved, iteration 121090
state saved, iteration 121100
state saved, iteration 121

state saved, iteration 123500
state saved, iteration 123510
state saved, iteration 123520
state saved, iteration 123530
state saved, iteration 123540
state saved, iteration 123550
state saved, iteration 123560
state saved, iteration 123570
state saved, iteration 123580
state saved, iteration 123590
state saved, iteration 123600
state saved, iteration 123610
state saved, iteration 123620
state saved, iteration 123630
state saved, iteration 123640
state saved, iteration 123650
state saved, iteration 123660
state saved, iteration 123670
state saved, iteration 123680
state saved, iteration 123690
state saved, iteration 123700
state saved, iteration 123710
state saved, iteration 123720
state saved, iteration 123730
state saved, iteration 123740
state saved, iteration 123750
state saved, iteration 123760
state saved, iteration 123770
state saved, iteration 123780
state saved, iteration 123790
state saved, iteration 123800
state saved, iteration 123810
state saved, iteration 123820
state save

state saved, iteration 126200
state saved, iteration 126210
state saved, iteration 126220
state saved, iteration 126230
HTTP Error 404: Not Found
HTTP Error 404: Not Found
state saved, iteration 126240
state saved, iteration 126250
state saved, iteration 126260
state saved, iteration 126270
state saved, iteration 126280
state saved, iteration 126290
state saved, iteration 126300
state saved, iteration 126310
state saved, iteration 126320
state saved, iteration 126330
state saved, iteration 126340
state saved, iteration 126350
state saved, iteration 126360
state saved, iteration 126370
state saved, iteration 126380
state saved, iteration 126390
state saved, iteration 126400
state saved, iteration 126410
state saved, iteration 126420
state saved, iteration 126430
state saved, iteration 126440
state saved, iteration 126450
state saved, iteration 126460
state saved, iteration 126470
state saved, iteration 126480
state saved, iteration 126490
state saved, iteration 126500
state saved, itera

state saved, iteration 128920
state saved, iteration 128930
state saved, iteration 128940
state saved, iteration 128950
state saved, iteration 128960
state saved, iteration 128970
state saved, iteration 128980
state saved, iteration 128990
state saved, iteration 129000
state saved, iteration 129010
state saved, iteration 129020
state saved, iteration 129030
state saved, iteration 129040
state saved, iteration 129050
state saved, iteration 129060
state saved, iteration 129070
state saved, iteration 129080
state saved, iteration 129090
state saved, iteration 129100
state saved, iteration 129110
state saved, iteration 129120
state saved, iteration 129130
state saved, iteration 129140
state saved, iteration 129150
state saved, iteration 129160
state saved, iteration 129170
state saved, iteration 129180
state saved, iteration 129190
state saved, iteration 129200
state saved, iteration 129210
state saved, iteration 129220
state saved, iteration 129230
state saved, iteration 129240
HTTP Error

state saved, iteration 131600
state saved, iteration 131610
state saved, iteration 131620
state saved, iteration 131630
state saved, iteration 131640
state saved, iteration 131650
state saved, iteration 131660
state saved, iteration 131670
state saved, iteration 131680
state saved, iteration 131690
state saved, iteration 131700
state saved, iteration 131710
state saved, iteration 131720
state saved, iteration 131730
state saved, iteration 131740
state saved, iteration 131750
state saved, iteration 131760
state saved, iteration 131770
state saved, iteration 131780
state saved, iteration 131790
state saved, iteration 131800
state saved, iteration 131810
state saved, iteration 131820
state saved, iteration 131830
state saved, iteration 131840
state saved, iteration 131850
state saved, iteration 131860
state saved, iteration 131870
state saved, iteration 131880
state saved, iteration 131890
state saved, iteration 131900
state saved, iteration 131910
state saved, iteration 131920
state save

state saved, iteration 134300
state saved, iteration 134310
state saved, iteration 134320
state saved, iteration 134330
state saved, iteration 134340
state saved, iteration 134350
state saved, iteration 134360
state saved, iteration 134370
state saved, iteration 134380
state saved, iteration 134390
state saved, iteration 134400
state saved, iteration 134410
state saved, iteration 134420
state saved, iteration 134430
state saved, iteration 134440
state saved, iteration 134450
state saved, iteration 134460
state saved, iteration 134470
state saved, iteration 134480
state saved, iteration 134490
state saved, iteration 134500
state saved, iteration 134510
state saved, iteration 134520
state saved, iteration 134530
state saved, iteration 134540
state saved, iteration 134550
state saved, iteration 134560
state saved, iteration 134570
state saved, iteration 134580
state saved, iteration 134590
state saved, iteration 134600
state saved, iteration 134610
state saved, iteration 134620
state save

Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


KeyboardInterrupt: 