diff --git a/crawler/crawler.py b/crawler/crawler.py
index f49ed30..3e9616f 100755
--- a/crawler/crawler.py
+++ b/crawler/crawler.py
@@ -70,7 +70,7 @@ def stop(self):
 
 if __name__ == "__main__":
     c = Crawler(10, 4, MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_COLLECTION,
-                feeds=['http://www.pornhub.com'],
+                feeds=['http://www.youporn.com'],
                 nb_ask_feeds=0)
     try:
         c.loop()
diff --git a/crawler/fetcher.py b/crawler/fetcher.py
index 1a0707c..bee41e3 100644
--- a/crawler/fetcher.py
+++ b/crawler/fetcher.py
@@ -26,6 +26,7 @@ def __init__(self, robot, queue_in, queue_out, max_depth, proxies):
         self._is_working = threading.Event()
 
     def stop(self):
+        self.mongodbAPI.stop()
         self.e_stop.set()
 
     def is_working(self):
@@ -113,6 +114,7 @@ def extract(self, html, url):
         return extractor
 
     def url_need_a_visit(self, url):
+        return True
         p = urllib.parse.urlparse(url)
         if p.scheme in ('http','https'):
             return self.mongodbAPI.url_need_a_visit(url)
diff --git a/crawler/mongodbapi.py b/crawler/mongodbapi.py
index 996082d..8c6f629 100644
--- a/crawler/mongodbapi.py
+++ b/crawler/mongodbapi.py
@@ -3,7 +3,7 @@
 import urllib.request
 import urllib.error
 import threading
-
+import queue
 
 from tools import *
 
@@ -15,19 +15,30 @@ class MongodbAPI:
     def __init__(self, host='localhost', port=8080):
         self.host = host
         self.port = port
+
+        self.queue = queue.Queue()
+        self.e_stop = threading.Event()
+
+        self.t = threading.Thread(target=self.loop_send, name="MongoAPI.loop_send")
+        self.t.setDaemon(True)
+        self.t.start()
+
+    def stop(self):
+        self.e_stop.set()
+
 
     def add_page(self, *, url):
         page = {
             'url': url
        }
-        self.send("add_page", dict_to_json(page), block=False)
+        self.queue.put(("add_page", dict_to_json(page)))
 
     def add_link(self, *, source, target):
         link = {
             'source': source,
             'target': target,
        }
-        self.send("add_link", dict_to_json(link), block=False)
+        self.queue.put(("add_link", dict_to_json(link)))
 
     def url_need_a_visit(self, url):
         url = { 'url' : url }
@@ -39,27 +50,31 @@ def get_urls_to_visit(self, max_urls):
         r = self.send("get_urls_to_visit", dict_to_json(req))
         return eval(r)
 
-    def send(self, operation, req, *, block=True):
+    def send(self, operation, req):
         url = "http://{host}:{port}/_rest_/{operation}".format(
             host=self.host,
             port=self.port,
             operation=operation
         )
         encoded_req = req.encode()
-        def _f():
-            try:
-                r = urllib.request.urlopen(url, encoded_req)
-            except urllib.error.URLError as ex:
-                print("ERROR", self.__class__.__name__, "send :", ex, "url=", url, "req=", req, "\n"+get_traceback())
-            else:
-                return r
-        if block:
-            r = _f()
-            return r.read().decode() if r else ""
+        s = ""
+        try:
+            r = urllib.request.urlopen(url, encoded_req)
+        except urllib.error.URLError as ex:
+            print("ERROR", self.__class__.__name__, "send :", ex, "url=", url, "req=", req, "\n"+get_traceback())
         else:
-            t = threading.Thread(target=_f)
-            t.start()
-
+            s = r.read().decode()
+            r.close()
+        finally:
+            return s
 
+    def loop_send(self):
+        while not self.e_stop.is_set():
+            try:
+                operation, req = self.queue.get(True, 0.5)
+            except queue.Empty:
+                pass
+            else:
+                self.send(operation, req)
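Note on the mongodbapi.py hunks: they replace the fire-and-forget thread spawned per request with a single producer/consumer queue. add_page() and add_link() now enqueue an (operation, json) tuple, and a daemon thread running loop_send() pops each tuple and performs the blocking HTTP POST, polling the queue with a 0.5 s timeout so stop() can shut the worker down. The sketch below shows the same pattern in isolation, assuming a stand-in print-based send(); the AsyncSender class and its method names are illustrative only, not part of the crawler.

import queue
import threading
import time

class AsyncSender:
    """Minimal sketch of the queue-plus-daemon-thread pattern used above."""

    def __init__(self):
        self.queue = queue.Queue()
        self.e_stop = threading.Event()
        # Daemon worker so the process can exit even if stop() is never called.
        self.t = threading.Thread(target=self.loop_send, daemon=True)
        self.t.start()

    def stop(self):
        # Signal the worker and wait for it to notice (at most ~0.5 s).
        self.e_stop.set()
        self.t.join()

    def enqueue(self, operation, payload):
        # Non-blocking for the caller: hand the work over as a single tuple.
        self.queue.put((operation, payload))

    def send(self, operation, payload):
        # Stand-in for the real HTTP POST done by MongodbAPI.send().
        print("send", operation, payload)

    def loop_send(self):
        # Drain the queue until stop() is requested; the timeout keeps the
        # loop responsive without busy-waiting.
        while not self.e_stop.is_set():
            try:
                operation, payload = self.queue.get(True, 0.5)
            except queue.Empty:
                continue
            self.send(operation, payload)

if __name__ == "__main__":
    s = AsyncSender()
    s.enqueue("add_page", '{"url": "http://example.com"}')
    s.enqueue("add_link", '{"source": "a", "target": "b"}')
    time.sleep(0.1)   # give the worker a moment before shutting down
    s.stop()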