Add the project code.

commit 15ae16b9a0a77d025b23c0dfb2259cf13e5ca961 1 parent 77ebe6b
@andrewjw andrewjw authored

0  __init__.py
No changes.

0  crawler/__init__.py
No changes.
5 crawler/couchviews/db/page/by_rank/map.js
@@ -0,0 +1,5 @@
+function (doc) {
+    if(doc.type == "page") {
+        emit(-doc.rank, doc._id);
+    }
+}
5 crawler/couchviews/db/page/by_url/map.js
@@ -0,0 +1,5 @@
+function (doc) {
+    if(doc.type == "page") {
+        emit(doc.url, doc._id);
+    }
+}
7 crawler/couchviews/db/page/links_to_url/map.js
@@ -0,0 +1,7 @@
+function (doc) {
+    if(doc.type == "page") {
+        for(i = 0; i < doc.links.length; i++) {
+            emit(doc.links[i], [doc.rank, doc.links.length]);
+        }
+    }
+}
5 crawler/couchviews/db/robotstxt/by_domain/map.js
@@ -0,0 +1,5 @@
+function (doc) {
+    if(doc.type == "robotstxt") {
+        emit([doc.protocol, doc.domain], doc._id);
+    }
+}
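
These four map functions are installed as CouchDB design documents (page/by_rank, page/by_url, page/links_to_url and robotstxt/by_domain) by the update_couchdb command further down, and the models query them through couchdb-python. A rough sketch of what such a query looks like, assuming the "celerycrawler" database configured in settings.py (the URL key is only an illustrative value):

    import couchdb

    db = couchdb.Server()["celerycrawler"]

    # by_rank emits -doc.rank as the key, so ascending key order returns the
    # highest-ranked pages first; the row value is the page document's _id.
    for row in db.view("page/by_rank", limit=10):
        print row.key, row.value

    # by_url is keyed on the page URL and is used to look pages up by address.
    for row in db.view("page/by_url", key="http://example.com/"):
        print row.value
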
43 crawler/indexer.py
@@ -0,0 +1,43 @@
+import os
+
+from whoosh import index
+from whoosh.fields import *
+
+schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), desc=ID(stored=True), rank=NUMERIC(stored=True, type=float), content=TEXT)
+
+_ix = None
+
+def get_index():
+    global _ix
+
+    if _ix is not None:
+        pass
+    elif not os.path.exists("indexdir"):
+        os.mkdir("indexdir")
+        _ix = index.create_in("indexdir", schema)
+    else:
+        _ix = index.open_dir("indexdir")
+
+    return _ix
+
+def get_writer():
+    return get_index().writer()
+
+def get_searcher():
+    return get_index().searcher()
+
+def get_last_change():
+    get_index() # create directory
+
+    if os.path.exists("indexdir/since.txt"):
+        try:
+            return int(open("indexdir/since.txt").read())
+        except ValueError:
+            return 0
+    else:
+        return 0
+
+def set_last_change(since):
+    get_index() # create directory
+
+    open("indexdir/since.txt", "w").write(str(since))
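
For orientation, the Whoosh index managed by this module is written by the index_update command and queried by the search view later in this commit. A minimal sketch of that round trip using these helpers (the document values are illustrative only):

    from whoosh.qparser import QueryParser

    from crawler.indexer import get_writer, get_searcher, schema

    writer = get_writer()
    # update_document replaces any existing entry with the same unique url field.
    writer.update_document(title=u"Example page", url=u"http://example.com/",
                           desc=u"", rank=1.0, content=u"some example text")
    writer.commit()

    searcher = get_searcher()
    q = QueryParser("content", schema=schema).parse(u"example")
    for hit in searcher.search(q, limit=10):
        print hit["url"], hit.score
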
0  crawler/management/__init__.py
No changes.

0  crawler/management/commands/__init__.py
No changes.
47 crawler/management/commands/index_update.py
@@ -0,0 +1,47 @@
+import re
+
+from BeautifulSoup import BeautifulSoup
+import couchdb
+from django.core.management.base import BaseCommand, CommandError
+
+from crawler.indexer import get_writer, get_last_change, set_last_change
+
+import settings
+
+desc_re = re.compile("^description$", re.I)
+
+class Command(BaseCommand):
+    def handle(self, **options):
+        since = get_last_change()
+        writer = get_writer()
+        try:
+            while True:
+                changes = settings.db.changes(since=since)
+                since = changes["last_seq"]
+                for changeset in changes["results"]:
+                    try:
+                        doc = settings.db[changeset["id"]]
+                    except couchdb.http.ResourceNotFound:
+                        continue
+                    if "type" in doc and doc["type"] == "page":
+                        print "indexing", doc["url"]
+                        soup = BeautifulSoup(doc["content"])
+                        if soup.body is None:
+                            continue
+
+                        desc = soup.findAll('meta', attrs={ "name": desc_re })
+
+                        writer.update_document(
+                            title=unicode(soup.title(text=True)[0]) if soup.title is not None and len(soup.title(text=True)) > 0 else doc["url"],
+                            url=unicode(doc["url"]),
+                            desc=unicode(desc[0]["content"]) if len(desc) > 0 and desc[0]["content"] is not None else u"",
+                            rank=doc["rank"],
+                            content=unicode(soup.title(text=True)[0] + "\n" + doc["url"] + "\n" + "".join(soup.body(text=True)))
+                        )
+
+                writer.commit()
+                writer = get_writer()
+
+                set_last_change(since)
+        finally:
+            set_last_change(since)
7 crawler/management/commands/start_crawl.py
@@ -0,0 +1,7 @@
+from django.core.management.base import BaseCommand, CommandError
+
+from crawler.tasks import retrieve_page
+
+class Command(BaseCommand):
+    def handle(self, url, **options):
+        retrieve_page.delay(url, rank=1)
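
Together with the other commands in this commit, a crawl would typically be started along these lines; the celeryd worker command comes from djcelery, and the seed URL is only a placeholder:

    python manage.py update_couchdb
    python manage.py celeryd -Q retrieve,process,celery
    python manage.py start_crawl http://example.com/
    python manage.py index_update
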
43 crawler/management/commands/update_couchdb.py
@@ -0,0 +1,43 @@
+import couchdb
+import glob
+import os
+
+from django.core.management.base import NoArgsCommand
+
+class Command(NoArgsCommand):
+    help = "Update couchdb views"
+
+    can_import_settings = True
+
+    def handle_noargs(self, **options):
+        import settings
+
+        couchdir = os.path.realpath(os.path.split(__file__)[0] + "../../../couchviews")
+
+        databases = glob.glob(couchdir+"/*")
+        for d in databases:
+            if not os.path.isdir(d):
+                continue
+
+            db = getattr(settings, d.split("/")[-1])
+
+            for design in glob.glob(d + "/*"):
+                design = design.split("/")[-1]
+                try:
+                    doc = db["_design/" + design]
+                except couchdb.http.ResourceNotFound:
+                    doc = {"_id": "_design/" + design}
+
+                doc["views"] = {}
+                for mapreduce in glob.glob(d+"/"+design+"/*"):
+                    mapreduce = mapreduce.split("/")[-1]
+                    mr = {}
+                    mr["map"] = open(d+"/"+design+"/"+mapreduce+"/map.js").read()
+                    try:
+                        mr["reduce"] = open(d+"/"+design+"/"+mapreduce+"/reduce.js").read()
+                    except IOError:
+                        pass
+
+                    doc["views"][mapreduce] = mr
+
+                db["_design/" + design] = doc
153 crawler/models.py
@@ -0,0 +1,153 @@
+import base64
+from datetime import datetime
+import pickle
+from robotparser import RobotFileParser
+import time
+from urlparse import urlparse
+from urllib2 import urlopen, Request, HTTPError, install_opener, build_opener, HTTPRedirectHandler
+
+from django.core.cache import cache
+
+from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
+
+import settings
+
+install_opener(build_opener(HTTPRedirectHandler()))
+
+class Page(Document):
+    type = TextField(default="page")
+
+    url = TextField()
+
+    content = TextField()
+
+    links = ListField(TextField())
+
+    rank = FloatField(default=0)
+
+    last_checked = DateTimeField(default=datetime.now)
+
+    def is_valid(self):
+        return (datetime.now() - self.last_checked).days < 7
+
+    def update(self):
+        parse = urlparse(self.url)
+
+        robotstxt = RobotsTxt.get_by_domain(parse.scheme, parse.netloc)
+        if not robotstxt.is_allowed(parse.netloc):
+            return False
+
+        while cache.get(parse.netloc) is not None:
+            time.sleep(1)
+        cache.set(parse.netloc, True, 10)
+
+        print "getting", self.url
+        req = Request(self.url, None, { "User-Agent": settings.USER_AGENT })
+
+        resp = urlopen(req)
+        if not resp.info()["Content-Type"].startswith("text/html"):
+            return
+        self.content = resp.read().decode("utf8")
+        self.last_checked = datetime.now()
+
+        self.store(settings.db)
+
+    @staticmethod
+    def count():
+        r = settings.db.view("page/by_url", limit=0)
+        return r.total_rows
+
+    @staticmethod
+    def get_top_by_rank(limit=10):
+        r = settings.db.view("page/by_rank", limit=limit)
+        docs = []
+        for row in r.rows:
+            docs.append(Page.load(settings.db, row.value))
+        return docs
+
+    @staticmethod
+    def get_by_url(url, update=True):
+        r = settings.db.view("page/by_url", key=url)
+        if len(r.rows) == 1:
+            doc = Page.load(settings.db, r.rows[0].value)
+            if doc.is_valid():
+                return doc
+        elif not update:
+            return None
+        else:
+            doc = Page(url=url)
+
+        doc.update()
+
+        return doc
+
+    @staticmethod
+    def get_id_by_url(url, update=True):
+        r = settings.db.view("page/by_url", key=url)
+        if len(r) == 1:
+            return r.rows[0].value
+        else:
+            doc = Page.get_by_url(url, update=update)
+            if doc is not None:
+                return doc.id
+            else:
+                return None
+
+    @staticmethod
+    def get_links_to_url(url):
+        return [row.value for row in settings.db.view("page/links_to_url", key=url).rows]
+
+class RobotsTxt(Document):
+    type = TextField(default="robotstxt")
+
+    domain = TextField()
+    protocol = TextField()
+
+    robot_parser_pickle = TextField()
+
+    def _get_robot_parser(self):
+        if self.robot_parser_pickle is not None:
+            return pickle.loads(base64.b64decode(self.robot_parser_pickle))
+        else:
+            parser = RobotFileParser()
+            parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
+            self.robot_parser = parser
+
+            return parser
+    def _set_robot_parser(self, parser):
+        self.robot_parser_pickle = base64.b64encode(pickle.dumps(parser))
+    robot_parser = property(_get_robot_parser, _set_robot_parser)
+
+    def is_valid(self):
+        return (time.time() - self.robot_parser.mtime()) < 7*24*60*60
+
+    def is_allowed(self, url):
+        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def update(self):
+        while cache.get(self.domain) is not None:
+            time.sleep(1)
+        cache.set(self.domain, True, 10)
+
+        print "getting %s://%s/robots.txt" % (self.protocol, self.domain)
+        parser = self.robot_parser
+        parser.read()
+        parser.modified()
+        self.robot_parser = parser
+
+        self.store(settings.db)
+
+    @staticmethod
+    def get_by_domain(protocol, domain):
+        r = settings.db.view("robotstxt/by_domain", key=[protocol, domain])
+        if len(r) > 0:
+            doc = RobotsTxt.load(settings.db, r.rows[0].value)
+            if doc.is_valid():
+                return doc
+        else:
+            doc = RobotsTxt(protocol=protocol, domain=domain)
+
+        doc.update()
+        doc.store(settings.db)
+
+        return doc
99 crawler/tasks.py
@@ -0,0 +1,99 @@
+from datetime import datetime
+import re
+import time
+from urlparse import urlparse
+from utils import unescape
+
+from celery.decorators import task
+
+from crawler.models import Page, RobotsTxt
+
+import settings
+
+@task
+def retrieve_page(url, rank=None):
+    print "retrieve_page %s" % (url, )
+    if url.startswith("http://showmedo.com") or url.startswith("http://www.rentacarnow.com"):
+        return
+    page = Page.get_by_url(url)
+    if page is None:
+        return
+
+    if rank is not None:
+        page.rank = rank
+        page.store(settings.db)
+
+    assert page.id is not None
+    find_links.delay(page.id)
+
+link_single_re = re.compile(r"<a[^>]+href='([^']+)'")
+link_double_re = re.compile(r'<a[^>]+href="([^"]+)"')
+
+@task
+def find_links(doc_id):
+    if doc_id is None:
+        return
+
+    doc = Page.load(settings.db, doc_id)
+
+    if doc.content is None:
+        print "Got None for the content of %s -> %s." % (doc_id, doc.url)
+        return
+
+    raw_links = []
+    for match in link_single_re.finditer(doc.content):
+        raw_links.append(match.group(1))
+
+    for match in link_double_re.finditer(doc.content):
+        raw_links.append(match.group(1))
+
+    doc.links = []
+    for link in raw_links:
+        if link.startswith("#"):
+            continue
+        elif link.startswith("http://") or link.startswith("https://"):
+            pass
+        elif link.startswith("/"):
+            parse = urlparse(doc["url"])
+            link = parse.scheme + "://" + parse.netloc + link
+        else:
+            link = "/".join(doc["url"].split("/")[:-1]) + "/" + link
+
+        doc.links.append(unescape(link.split("#")[0]))
+
+    print "find_links %s -> %i" % (doc.url, len(doc.links))
+    doc.store(settings.db)
+
+    calculate_rank.delay(doc.id)
+
+    for link in doc.links:
+        p = Page.get_id_by_url(link, update=False)
+        if p is not None:
+            calculate_rank.delay(p)
+        else:
+            retrieve_page.delay(link)
+
+@task
+def calculate_rank(doc_id):
+    page = Page.load(settings.db, doc_id)
+
+    links = Page.get_links_to_url(page.url)
+
+    rank = 0
+    for link in links:
+        rank += link[0] / link[1]
+
+    old_rank = page.rank
+    page.rank = rank * 0.85
+
+    if page.rank == 0:
+        page.rank = 1.0/settings.db.view("page/by_url", limit=0).total_rows
+
+    if abs(old_rank - page.rank) > 0.0001:
+        print "%s: %s -> %s" % (page.url, old_rank, page.rank)
+        page.store(settings.db)
+
+        for link in page.links:
+            p = Page.get_id_by_url(link, update=False)
+            if p is not None:
+                calculate_rank.delay(p)
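
calculate_rank is essentially one iteration of a simplified PageRank: each page that links to the target contributes its own rank divided by its number of outgoing links (the [doc.rank, doc.links.length] pairs emitted by the links_to_url view), and the sum is damped by 0.85. A small worked example with made-up numbers:

    # Two hypothetical pages link to the target:
    # page A has rank 1.0 and 4 outgoing links, page B has rank 0.5 and 2.
    links = [(1.0, 4), (0.5, 2)]

    rank = 0
    for link_rank, link_count in links:
        rank += link_rank / link_count

    print rank * 0.85   # 0.85 * (0.25 + 0.25) = 0.425
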
12 crawler/templates/base.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Celery Search</title>
+</head>
+<body>
+ <h1>Celery Search</h1>
+
+ {% block body %}
+ {% endblock %}
+</body>
+</html>
22 crawler/templates/index.html
@@ -0,0 +1,22 @@
+{% extends "base.html" %}
+
+{% block body %}
+ <form action="/search" method="get">
+ <input name="q" type="text">
+ <input type="submit">
+ </form>
+
+ <hr>
+
+ <p>{{ doc_count }} pages in index.</p>
+
+ <hr>
+
+ <h2>Top Pages</h2>
+
+ <ol>
+ {% for page in top_docs %}
+ <li><a href="{{ page.url }}">{{ page.url }}</a> - {{ page.rank }}</li>
+ {% endfor %}
+ </ol>
+{% endblock %}
15 crawler/templates/results.html
@@ -0,0 +1,15 @@
+{% extends "base.html" %}
+
+{% block body %}
+ <form action="/search" method="get">
+ <input name="q" type="text" value="{{ q }}">
+ <input type="submit">
+ </form>
+
+ {% for result in results|slice:":20" %}
+ <p>
+ <b><a href="{{ result.url }}">{{ result.title|safe }}</a></b> ({{ result.score }}, {{ result.rank }}, {{ result.combined }})<br>
+ {{ result.desc|safe }}
+ </p>
+ {% endfor %}
+{% endblock %}
23 crawler/tests.py
@@ -0,0 +1,23 @@
+"""
+This file demonstrates two different styles of tests (one doctest and one
+unittest). These will both pass when you run "manage.py test".
+
+Replace these with more appropriate tests for your application.
+"""
+
+from django.test import TestCase
+
+class SimpleTest(TestCase):
+    def test_basic_addition(self):
+        """
+        Tests that 1 + 1 always equals 2.
+        """
+        self.failUnlessEqual(1 + 1, 2)
+
+__test__ = {"doctest": """
+Another way to test that 1 + 1 is equal to 2.
+
+>>> 1 + 1 == 2
+True
+"""}
+
38 crawler/utils.py
@@ -0,0 +1,38 @@
+import htmlentitydefs
+import re
+
+def unescape(text):
+    """Removes HTML or XML character references and entities from a text
+    string, but keeps &amp;, &gt; and &lt; escaped in the output.
+    Based on Fredrik Lundh's recipe:
+    http://effbot.org/zone/re-sub.htm#unescape-html
+    """
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                print "value error"
+                pass
+        else:
+            # named entity
+            try:
+                if text[1:-1] == "amp":
+                    text = "&amp;"
+                elif text[1:-1] == "gt":
+                    text = "&gt;"
+                elif text[1:-1] == "lt":
+                    text = "&lt;"
+                else:
+                    print text[1:-1]
+                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                print "key error"
+                pass
+        return text # leave as is
+    return re.sub(r"&#?\w+;", fixup, text)
33 crawler/views.py
@@ -0,0 +1,33 @@
+from django.shortcuts import render_to_response
+from whoosh.qparser import QueryParser
+
+from crawler.indexer import get_searcher, schema
+from crawler.models import Page
+
+def index(req):
+    return render_to_response("index.html", { "doc_count": Page.count(), "top_docs": Page.get_top_by_rank(limit=20) })
+
+def search(req):
+    searcher = get_searcher()
+
+    q = QueryParser("content", schema=schema).parse(req.GET["q"])
+
+    results = searcher.search(q, limit=100)
+
+    if len(results) > 0:
+        max_score = max([r.score for r in results])
+        max_rank = max([r.fields()["rank"] for r in results])
+
+        combined = []
+        for r in results:
+            fields = r.fields()
+            r.score = r.score/max_score
+            r.rank = fields["rank"]/max_rank
+            r.combined = r.score + r.rank
+            combined.append(r)
+
+        combined.sort(key=lambda x: x.combined, reverse=True)
+    else:
+        combined = []
+
+    return render_to_response("results.html", { "q": req.GET["q"], "results": combined })
11 manage.py
@@ -0,0 +1,11 @@
+#!/usr/bin/python
+from django.core.management import execute_manager
+try:
+    import settings # Assumed to be in the same directory.
+except ImportError:
+    import sys
+    sys.stderr.write("Error: Can't find the file 'settings.py' in the directory containing %r. It appears you've customized things.\nYou'll have to run django-admin.py, passing it your settings module.\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\n" % __file__)
+    sys.exit(1)
+
+if __name__ == "__main__":
+ execute_manager(settings)
128 settings.py
@@ -0,0 +1,128 @@
+# Django settings for celerycrawler project.
+
+DEBUG = True
+TEMPLATE_DEBUG = DEBUG
+
+ADMINS = (
+    # ('Your Name', 'your_email@domain.com'),
+)
+
+MANAGERS = ADMINS
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3', # Add 'postgresql_psycopg2', 'postgresql', 'mysql', 'sqlite3' or 'oracle'.
+        'NAME': 'sqlit3.db',      # Or path to database file if using sqlite3.
+        'USER': '',               # Not used with sqlite3.
+        'PASSWORD': '',           # Not used with sqlite3.
+        'HOST': '',               # Set to empty string for localhost. Not used with sqlite3.
+        'PORT': '',               # Set to empty string for default. Not used with sqlite3.
+    }
+}
+
+# Local time zone for this installation. Choices can be found here:
+# http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
+# although not all choices may be available on all operating systems.
+# On Unix systems, a value of None will cause Django to use the same
+# timezone as the operating system.
+# If running in a Windows environment this must be set to the same as your
+# system time zone.
+TIME_ZONE = 'America/Chicago'
+
+# Language code for this installation. All choices can be found here:
+# http://www.i18nguy.com/unicode/language-identifiers.html
+LANGUAGE_CODE = 'en-us'
+
+SITE_ID = 1
+
+# If you set this to False, Django will make some optimizations so as not
+# to load the internationalization machinery.
+USE_I18N = True
+
+# If you set this to False, Django will not format dates, numbers and
+# calendars according to the current locale
+USE_L10N = True
+
+# Absolute filesystem path to the directory that will hold user-uploaded files.
+# Example: "/home/media/media.lawrence.com/"
+MEDIA_ROOT = ''
+
+# URL that handles the media served from MEDIA_ROOT. Make sure to use a
+# trailing slash if there is a path component (optional in other cases).
+# Examples: "http://media.lawrence.com", "http://example.com/media/"
+MEDIA_URL = ''
+
+# URL prefix for admin media -- CSS, JavaScript and images. Make sure to use a
+# trailing slash.
+# Examples: "http://foo.com/media/", "/media/".
+ADMIN_MEDIA_PREFIX = '/media/'
+
+# Make this unique, and don't share it with anybody.
+SECRET_KEY = 'y!!%#sg$%_(%zdkyn9(efvb(7db!%gtxm58lt=*@!cgre=!(_h'
+
+# List of callables that know how to import templates from various sources.
+TEMPLATE_LOADERS = (
+    'django.template.loaders.filesystem.Loader',
+    'django.template.loaders.app_directories.Loader',
+#     'django.template.loaders.eggs.Loader',
+)
+
+MIDDLEWARE_CLASSES = (
+    'django.middleware.common.CommonMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+)
+
+ROOT_URLCONF = 'celerycrawler.urls'
+
+TEMPLATE_DIRS = (
+    # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates".
+    # Always use forward slashes, even on Windows.
+    # Don't forget to use absolute paths, not relative paths.
+)
+
+INSTALLED_APPS = (
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.sites',
+    'django.contrib.messages',
+    'djcelery',
+    'crawler',
+    # Uncomment the next line to enable the admin:
+    # 'django.contrib.admin',
+    # Uncomment the next line to enable admin documentation:
+    # 'django.contrib.admindocs',
+)
+
+import djcelery
+djcelery.setup_loader()
+
+BROKER_BACKEND = "couchdb"
+BROKER_HOST = "localhost"
+BROKER_PORT = 5984
+BROKER_VHOST = "celery"
+
+CELERYD_CONCURRENCY = 2
+CELERY_QUEUES = {"retrieve": {"exchange": "default", "exchange_type": "direct", "routing_key": "retrieve"},
+ "process": {"exchange": "default", "exchange_type": "direct", "routing_key": "process "},
+ "celery": {"exchange": "default", "exchange_type": "direct", "routing_key": "celery"}}
+
+class MyRouter(object):
+
+    def route_for_task(self, task, args=None, kwargs=None):
+        if task == "crawler.tasks.retrieve_page":
+            return { "queue": "retrieve" }
+        else:
+            return { "queue": "process" }
+
+CELERY_ROUTES = (MyRouter(), )
+
+import couchdb
+
+server = couchdb.Server()
+db = server["celerycrawler"]
+
+USER_AGENT = "CeleryCrawler; Your User Agent Here"
16 urls.py
@@ -0,0 +1,16 @@
+from django.conf.urls.defaults import *
+
+# Uncomment the next two lines to enable the admin:
+# from django.contrib import admin
+# admin.autodiscover()
+
+urlpatterns = patterns('',
+    (r'^$', 'crawler.views.index'),
+    (r'^search$', 'crawler.views.search'),
+
+    # Uncomment the admin/doc line below to enable admin documentation:
+    # (r'^admin/doc/', include('django.contrib.admindocs.urls')),
+
+    # Uncomment the next line to enable the admin:
+    # (r'^admin/', include(admin.site.urls)),
+)