Skip to content

Commit

Permalink
moins d'erreurs sur l'API Mongo
Browse files Browse the repository at this point in the history
  • Loading branch information
trecouvr committed Mar 15, 2012
1 parent 5912923 commit d50ba3b
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 18 deletions.
2 changes: 1 addition & 1 deletion crawler/crawler.py
Expand Up @@ -70,7 +70,7 @@ def stop(self):

if __name__ == "__main__":
c = Crawler(10, 4, MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_COLLECTION,
feeds=['http://www.pornhub.com'],
feeds=['http://www.youporn.com'],
nb_ask_feeds=0)
try:
c.loop()
Expand Down
2 changes: 2 additions & 0 deletions crawler/fetcher.py
Expand Up @@ -26,6 +26,7 @@ def __init__(self, robot, queue_in, queue_out, max_depth, proxies):
self._is_working = threading.Event()

def stop(self):
self.mongodbAPI.stop()
self.e_stop.set()

def is_working(self):
Expand Down Expand Up @@ -113,6 +114,7 @@ def extract(self, html, url):
return extractor

def url_need_a_visit(self, url):
return True
p = urllib.parse.urlparse(url)
if p.scheme in ('http','https'):
return self.mongodbAPI.url_need_a_visit(url)
Expand Down
49 changes: 32 additions & 17 deletions crawler/mongodbapi.py
Expand Up @@ -3,7 +3,7 @@
import urllib.request
import urllib.error
import threading

import queue

from tools import *

Expand All @@ -15,19 +15,30 @@ class MongodbAPI:
def __init__(self, host='localhost', port=8080):
self.host = host
self.port = port

self.queue = queue.Queue()
self.e_stop = threading.Event()

self.t = threading.Thread(target=self.loop_send, name="MongoAPI.loop_send")
self.t.setDaemon(True)
self.t.start()


def stop(self):
self.e_stop.set()

def add_page(self, *, url):
page = {
'url': url
}
self.send("add_page", dict_to_json(page), block=False)
self.queue.put("add_page", dict_to_json(page))

def add_link(self, *, source, target):
link = {
'source': source,
'target': target,
}
self.send("add_link", dict_to_json(link), block=False)
self.queue.put("add_link", dict_to_json(link))

def url_need_a_visit(self, url):
url = { 'url' : url }
Expand All @@ -39,27 +50,31 @@ def get_urls_to_visit(self, max_urls):
r = self.send("get_urls_to_visit", dict_to_json(req))
return eval(r)

def send(self, operation, req, *, block=True):
def send(self, operation, req):
url = "http://{host}:{port}/_rest_/{operation}".format(
host=self.host,
port=self.port,
operation=operation
)
encoded_req = req.encode()
def _f():
try:
r = urllib.request.urlopen(url, encoded_req)
except urllib.error.URLError as ex:
print("ERROR", self.__class__.__name__, "send :", ex, "url=", url, "req=", req, "\n"+get_traceback())
else:
return r
if block:
r = _f()
return r.read().decode() if r else ""
s = ""
try:
r = urllib.request.urlopen(url, encoded_req)
except urllib.error.URLError as ex:
print("ERROR", self.__class__.__name__, "send :", ex, "url=", url, "req=", req, "\n"+get_traceback())
else:
t = threading.Thread(target=_f)
t.start()

s = r.read().decode()
r.close()
finally:
return s

def loop_send(self):
while not self.e_stop.is_set():
try:
operation, req = self.queue.get(True, 0.5)
except:
pass
else:
self.send(operation,req)


0 comments on commit d50ba3b

Please sign in to comment.