
Updating and integrating API docs

1 parent 0dbd99a commit ce96ffa96316feee790b196ab30e3b7bab23e9dc Daniel Truemper committed Mar 3, 2011
@@ -0,0 +1,9 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Entrypoints
+===========
+
+.. automodule:: spyder
+ :members:
@@ -0,0 +1,9 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Link Extractors
+===============
+
+.. automodule:: spyder.processor.htmllinkextractor
+ :members:
@@ -0,0 +1,9 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Content Fetcher
+===============
+
+.. automodule:: spyder.processor.fetcher
+ :members:
@@ -0,0 +1,15 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Frontier
+========
+
+.. automodule:: spyder.core.frontier
+ :members:
+
+Queue Management
+================
+
+.. automodule:: spyder.core.sqlitequeues
+ :members:
@@ -0,0 +1,15 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Masterprocess
+=============
+
+.. automodule:: spyder.masterprocess
+ :members:
+
+ZeroMQ Master
+=============
+
+.. automodule:: spyder.core.master
+ :members:
@@ -0,0 +1,9 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Crawl Scoper
+============
+
+.. automodule:: spyder.processor.scoper
+ :members:
@@ -0,0 +1,9 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Sink
+====
+
+.. automodule:: spyder.core.sink
+ :members:
@@ -0,0 +1,24 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+.. _secapi:
+
+Spyder API
+==========
+
+This is the main documentation for the |spyder| API. It will hopefully provide
+you with enough information to get started coding new features or helping
+with bug fixing.
+
+.. toctree::
+ :maxdepth: 2
+
+ entrypoints
+ masterprocess
+ frontier
+ workerprocess
+ fetcher
+ extractor
+ scoper
+ sink
@@ -0,0 +1,15 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: ../globals.rst
+
+Workerprocess
+=============
+
+.. automodule:: spyder.workerprocess
+ :members:
+
+ZeroMQ Worker
+=============
+
+.. automodule:: spyder.core.worker
+ :members:
@@ -0,0 +1,10 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: globals.rst
+
+.. _secgettingstarted:
+
+Getting Started
+===============
+
+
@@ -3,7 +3,7 @@
.. include:: globals.rst
Welcome to |spyder|
-==================================
+===================
|spyder| is a scalable web-spider written in Python using the non-blocking
|tornado| library and |zmq| as the messaging layer. The messages are serialized
@@ -24,6 +24,9 @@ Table of Contents
crawler-design
libraries
+ getting-started
+ api/spyderapi
+ roadmap
Indices and tables
==================
@@ -0,0 +1,15 @@
+.. vim: set fileencoding=UTF-8 :
+.. vim: set tw=80 :
+.. include:: globals.rst
+
+Roadmap
+=======
+
+Version 0.3
++++++++++++
+
+- Integration with `Supervisord`
+
+ The current way of starting |spyder| is quite painful. Using `supervisord`,
+ I want to start the master and worker processes automatically and have them
+ restarted automatically in case of failures.
@@ -30,10 +30,6 @@
__version__ = '0.2.0-dev'
-__all__ = ["core", "processor", "defaultsettings", "spyder_template", "thrift",
- "workerprocess"]
-
-
def copy_skeleton_dir(destination):
"""
Copy the skeleton directory (spyder_template) to a new directory.
@@ -16,7 +16,7 @@
# limitations under the License.
#
"""
-A sink of :class:`CrawlUri`s.
+A sink of :class:`CrawlUri` objects.
"""
@@ -56,7 +56,7 @@ def process_server_error(self, curi):
class CouchDbSink(object):
"""
- Simple sink that will store :class:`CrawlUri`s inside a CouchDB instance.
+ Simple sink that will store :class:`CrawlUri` objects inside a CouchDB instance.
"""
def __init__(self, host_port="127.0.0.1:5984", database="spyder"):
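As a hedged illustration of how this sink might be used: only ``__init__`` and
``process_server_error`` are visible in this diff, so the surrounding wiring
below is an assumption, not the documented API.

.. code-block:: python

   from spyder.core.sink import CouchDbSink

   # Construct the sink with the defaults shown above; a real deployment
   # would point host_port at its own CouchDB instance.
   sink = CouchDbSink(host_port="127.0.0.1:5984", database="spyder")

   def on_server_error(curi):
       # Hand a failed CrawlUri over to the sink (method name taken from
       # the hunk header above).
       sink.process_server_error(curi)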
@@ -17,6 +17,18 @@
#
"""
This module contains the default architecture for the master process.
+
+The main task of the master process is to create and run the **Frontier**.
+Starting a master involves the following steps:
+
+1. Bind to the configured |zmq| sockets
+2. Start the management interface
+3. Create the frontier
+4. Start the master
+
+If you have configured a ``settings.MASTER_CALLBACK``, it will be called just
+before the master is actually started, i.e. before ``IOLoop.start()`` is
+called. This allows you to insert *Seed* |urls|, for example (see the sketch
+below).
"""
import logging
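To make the callback hook a bit more concrete, here is a minimal sketch of a
hypothetical ``settings`` snippet. The callback's exact signature and the
frontier method used for adding seeds are assumptions for illustration only.

.. code-block:: python

   # Hypothetical settings.py snippet -- MASTER_CALLBACK is named in the
   # module docstring above, but the argument it receives and the
   # frontier method called here are assumptions, not the documented API.
   def add_seeds(master):
       for url in ["http://www.example.com/"]:
           # assumed helper; check spyder.core.frontier for the real API
           master.frontier.add_uri(url)

   MASTER_CALLBACK = add_seeds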
@@ -16,11 +16,28 @@
# limitations under the License.
#
"""
-Module for the default HTML Link extractor.
+The :class:`DefaultHtmlLinkExtractor` will try to extract new links from the
+``curi.content_body``. In order to find them, two regular expressions are used:
-Most of the regular expressions have been adopted from Heritrix. See:
-Heritrix 3:
- modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
+1. The ``RELEVANT_TAG_EXTRACTOR`` extracts the following tags:
+ - ``<script>..</script>``
+ - ``<style>..</style>``
+ - ``<meta>``
+ - or any other open tag with at least one attribute (e.g. not ``<br>``).
+
+2. The ``LINK_EXTRACTOR`` extracts links from tags using `href` or `src`
+attributes.
+
+If the link is relative, the appropriate prefix is automatically added here.
+
+The regular expressions have been adopted from Heritrix. See the Heritrix 3
+source code:
+
+``modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java``
+
+.. note:: Heritrix has a newer way of extracting links, i.e. with different
+ regular expressions. Since the current ones work for me at the moment, I am
+ fine with them.
"""
import re
import htmlentitydefs
@@ -73,6 +90,10 @@
class DefaultHtmlLinkExtractor(object):
"""
The default extractor for Links from HTML pages.
+
+ The internal regular expressions are currently not modifiable. Only the
+ maximum length of an opening tag can be configured via
+ ``settings.REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH``.
"""
def __init__(self, settings):
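The following is a deliberately simplified stand-in for the idea described
above, not the actual Heritrix-derived expressions used by the
:class:`DefaultHtmlLinkExtractor`; the ``extract_links`` helper is made up for
illustration.

.. code-block:: python

   import re
   from urlparse import urljoin  # Python 2, matching htmlentitydefs above

   # Toy pattern: grab href/src attribute values out of opening tags.
   LINKS = re.compile(r"""(?:href|src)\s*=\s*["']([^"']+)["']""", re.I)

   def extract_links(base_url, content_body):
       # Resolve relative links against the page URL, as the extractor
       # docstring describes.
       return [urljoin(base_url, link) for link in LINKS.findall(content_body)]

   # extract_links("http://example.com/a/", '<a href="b.html">x</a>')
   # -> ['http://example.com/a/b.html']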
@@ -16,10 +16,28 @@
# limitations under the License.
#
"""
-Default scoper implementations.
+The *Crawl Scope* defines which *URLs* the *Spyder* should process. The main
+use cases for it are:
-The main scoper maintains a list of regular expressions to be used. Two
-classes of expressions exist: positive and negative.
+- only spider content from the *Seed* hosts
+- do not spider images, CSS, or videos
+
+and there are probably a lot of other reasons to have at least one scoper
+configured, otherwise you might end up downloading the internet.
+
+Each scoper should therefore iterate over
+``curi.optional_vars[CURI_EXTRACTED_URLS]`` and determine for each *URL*
+whether it should be downloaded or not.
+
+The :class:`RegexScoper` maintains a list of regular expressions that define
+the crawl scope. Two classes of expressions exist: positive and negative.
+The initial decision for each *URL* is not to download it. If a regex from
+the positive list matches, and no regex from the negative list matches, the
+*URL* is marked for downloading. In any other case, the *URL* will be
+abandoned.
+
+.. note:: We should really split up the regex scoper and allow the user to
+ configure more than just one scoper.
"""
import re
@@ -29,7 +47,15 @@
class RegexScoper(object):
"""
- Default implementation of regular expression based scoper.
+ The scoper based on regular expressions.
+
+ There are two settings that influence this scoper:
+
+ 1. ``settings.REGEX_SCOPE_POSITIVE``
+ 2. ``settings.REGEX_SCOPE_NEGATIVE``
+
+ Both have to be a ``list``. The scoper is executed in the
+ :meth:`__call__` method.
"""
def __init__(self, settings):
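A minimal sketch of the decision rule described above; the real implementation
lives in :class:`RegexScoper` and reads its patterns from the two settings
listed above, while the standalone ``in_scope`` function here is purely
illustrative.

.. code-block:: python

   import re

   def in_scope(url, positive_patterns, negative_patterns):
       # Start with "do not download"; a URL only passes if at least one
       # positive pattern matches and no negative pattern does.
       positive = any(re.search(p, url) for p in positive_patterns)
       negative = any(re.search(n, url) for n in negative_patterns)
       return positive and not negative

   # in_scope("http://example.com/page.html",
   #          [r"^http://example\.com/"],
   #          [r"\.(css|js|png|jpe?g)$"])
   # -> True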
