From 0a2b98f212b9950fd211a937b40e08c1e07c9d80 Mon Sep 17 00:00:00 2001
From: erowan
Date: Sun, 5 Feb 2012 14:25:49 +0000
Subject: [PATCH] delete files in old layout

---
 pull/examples/__init__.py |   0
 pull/examples/health.py   | 146 -------------
 pull/pull/__init__.py     | 418 --------------------------------------
 pull/setup.py             |  34 ----
 pull/tests/__init__.py    |   0
 pull/tests/test_pull.py   | 166 ---------------
 6 files changed, 764 deletions(-)
 delete mode 100644 pull/examples/__init__.py
 delete mode 100644 pull/examples/health.py
 delete mode 100644 pull/pull/__init__.py
 delete mode 100644 pull/setup.py
 delete mode 100644 pull/tests/__init__.py
 delete mode 100644 pull/tests/test_pull.py

diff --git a/pull/examples/__init__.py b/pull/examples/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pull/examples/health.py b/pull/examples/health.py
deleted file mode 100644
index d321ab5..0000000
--- a/pull/examples/health.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from datetime import (
-    date,
-    datetime
-    )
-import lxml.html
-import re
-import urlparse
-import os
-from pull.site import (
-    FileListCriteria,
-    Parser,
-    build_feed,
-    UrlProtocol,
-    go
-    )
-
-
-# adapted from https://scraperwiki.com/scrapers/cdc_foodborne_outbreaks/edit/
-class FoodParser(Parser):
-
-    def cc(self, c,s):
-        return len([x for x in s if x == c])
-
-    def parse(self, file_path):
-        with open(file_path) as f:
-            html = f.read()
-        page = lxml.html.fromstring( html )
-        lis = page.cssselect('.main-inner ul li')
-        l = []
-        for li in lis:
-            tc = li.text_content()
-            if self.cc('-',tc) != 2:
-                source,_,pathogen = tc.rpartition('-')
-                if not source.strip():
-                    if pathogen.find('-') > 0:
-                        source,_,pathogen = pathogen.rpartition('-')
-                    else:
-                        continue
-                href = urlparse.urljoin(FoodFile.url,
-                    li.cssselect('a')[0].attrib.get('href') )
-                l.append({'source':source, 'pathogen': pathogen, 'link': href})
-        return l
-
-
-class FoodFile(FileListCriteria):
-
-    url = 'http://www.cdc.gov/outbreaknet/outbreaks.html'
-
-    def __init__(self):
-        FileListCriteria.__init__(self)
-
-    def build(self, start, end):
-        return [(FoodFile.url,
-                 self.cache_location+'outbreaks.html')]
-
-class WhoFiles(FileListCriteria):
-
-    url = "http://www.who.int/csr/don/archive/year/{0}/en/index.html"
-
-    def __init__(self):
-        FileListCriteria.__init__(self)
-
-    def build(self, start, end):
-        files = []
-        for year in range(start.year, end.year+1):
-            files.append((self.url.format(year),
-                          self.cache_location+'{0}.html'.format(year)))
-        return files
-
-# adapted from https://scraperwiki.com/scrapers/who_outbreaks/edit/
-class WhoParser(Parser):
-
-    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
-              'August', 'September', 'October', 'November', 'December']
-
-    def parse_date(self, datestr):
-        m = re.match('^(\d+) (\w+) (\d+)', datestr).groups(0)
-        return int(m[2]), datetime(year=int(m[2]),
-            month=WhoParser.months.index(m[1]) + 1, day=int(m[0]))
-
-    def parse_item(self, s):
-        m = re.match('.*-(.*) in (.*) -.*', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        m = re.match('.*-(.*) in (.*) \(.*', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        m = re.match('.*-(.*) in (.*)', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        # Changed in 2004
-        m = re.match('(.*) [Ii]n (.*)', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        print '**', 'Failed to parse', s
-        return s, ''
-
-
-    def parse(self, file_path):
-        with open(file_path) as f:
-            html = f.read()
-        page = lxml.html.fromstring( html )
-        l = []
-        url = WhoFiles.url.format(os.path.basename(file_path).rpartition('.')[0])
-        lis = page.cssselect('.auto_archive li')
-        for li in lis:
-            href = li.cssselect('a')[0]
-            link = urlparse.urljoin( url, href.attrib.get('href') )
-            year,date = self.parse_date( href.text_content() )
-            info = li.cssselect('.link_info')[0].text_content()
-            disease, where = self.parse_item(info)
-            disease = disease.strip()
-            where = where.strip()
-
-            if '-' in where:
-                where = where[0:where.find('-')].strip()
-            if u'-' in where:
-                where = where[0:where.find(u'-')].strip()
-            for x in [',',';',':',u'\u2013' ]:
-                if x in where:
-                    where = where[0:where.find(x)].strip()
-                    disease = where[where.find(x)+1:].strip() + ' ' + disease
-
-            d = {
-                'year': year, 'date':date.isoformat(), 'link':link,
-                'disease': disease.title(), 'where':where.title()
-                }
-            l.append(d)
-        return l
-
-def main():
-    d = date.today()
-    food = build_feed('food', UrlProtocol(FoodFile()), FoodParser())
-    who = build_feed('who', UrlProtocol(WhoFiles()), WhoParser())
-    results = go('health', [who], start=date(2011,1,1), end=d)
-    for feed, feed_result in results.iteritems():
-        print 'feed={0}, count={1}'.format(feed, feed_result['count'])
-        for item in feed_result['obj'].updater.data_items:
-            print item
-
-if __name__ == "__main__":
-    main()
-
-
-
-
diff --git a/pull/pull/__init__.py b/pull/pull/__init__.py
deleted file mode 100644
index 76675ad..0000000
--- a/pull/pull/__init__.py
+++ /dev/null
@@ -1,418 +0,0 @@
-import datetime
-import logging
-import os
-import socket
-import sys
-import tempfile
-from asyncore import compact_traceback
-import urllib2
-import cookielib
-
-__version__ = 0.2
-log = logging.getLogger(__name__)
-
-def write_cache_file(data, file_path):
-    def get_parent_dir(f):
-        parentDir, _ = os.path.split(f)
-        if parentDir == "":
-            raise IOError("no parent directory found for file " + f)
-        return parentDir
-    # if file dir path does not exist create it
-    parent_dir = get_parent_dir(file_path)
-    if os.path.isfile(parent_dir):
-        raise AssertionError('Expected a cache directory here not a regular'
-                             ' file %s' % `parent_dir`)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    f = open(file_path, 'wb')
-    f.write(data)
-    f.close()
-
-class ModuleError(Exception):
-    pass
-
-class ErrorForAllFeeds(ModuleError):
-    pass
-
-class ErrorForSomeFeeds(ModuleError):
-    pass
-
-class FeedError(Exception):
-    pass
-
-class ErrorForAllRequests(FeedError):
-    pass
-
-class Criteria(object):
-
-    def build(self, start, end):
-        pass
-
-class FileListCriteria(Criteria):
-
-    temp_dir = tempfile.gettempdir() + '/cache/'
-
-    def __init__(self, cache_location=None):
-        self.cache_location = cache_location or FileListCriteria.temp_dir
-
-    def build(self, start, end):
-        pass
-
-class SearchCriteria(Criteria):
-    def build(self, start, end):
-        pass
-
-class Protocol(object):
-    """
-    The :term:`Fetch` step is defined by specifying a :class:`Criteria` like
-    :class:`FileListCriteria` to a :class:`Protocol` like :class:`UrlProtocol`.
-    """
-    def __init__(self, criteria=None):
-        self.criteria = criteria
-
-    def call(self, start, end):
-        return self.fetch(self.criteria.build(start, end))
-
-    def fetch(self, input_criteria):
-        """
-        Fetch the files based on an input criteria. Input is typically a list
-        of url & cache filename tuples returned from a FileListCriteria instance
-        but could be any input suitable for a protocol instance to handle.
-        """
-        return [input_criteria]
-
-"""
-Use :class:`SkipProtocol` to bypass the Protocol step.
-"""
-SkipProtocol = Protocol
-
-class UrlProtocol(Protocol):
-    """
-    :class:`UrlProtocol` is used for url GET fetching.
-    """
-
-    def __init__(self, criteria=None, timeout=None, httpDebugLevel=0,
-                 proxies=None):
-        Protocol.__init__(self, criteria=criteria)
-        self.cj = cookielib.CookieJar()
-        self.proxyHandler = urllib2.ProxyHandler(proxies) if proxies else None
-        self.httpLogger = urllib2.HTTPHandler(debuglevel=httpDebugLevel)
-        if timeout:
-            socket.setdefaulttimeout(timeout)
-        # best to make out we are a brower
-        self.userAgent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)'
-        self.httpHeaders = {
-            'User-agent' : self.userAgent,
-            'Proxy-Connection' : 'Keep-Alive',
-            'Accept-Encoding' : 'gzip, deflate',
-            'Pragma' : 'no-cache',
-            'Cache-Control' : 'no-cache',
-            'Connection' : 'Keep-Alive'
-        }
-
-    def fetch(self, files):
-        '''
-        fetch urls by writing url responses to cache_files
-        @param files: list of (url, cache_file) tuples
-        @return: cache_file list
-        '''
-        cache_files = []
-        failures = []
-        for url, f in files:
-            try:
-                log.info("Downloading: " + str(url))
-                response, headers = self.fetch_url(url)
-                log.debug('response headers=%s' % str(headers))
-                write_cache_file(response, f)
-                cache_files.append(f)
-            except Exception, e:
-                # todo: think about passing failures back by setting
-                # site.stats[feed]['errors'] because if len(cache_files) > 0
-                # this failure is only logged here and not accessable later on.
-                error_message = 'Exception: %s, for url: %s' % (str(e), url)
-                log.warn(error_message)
-                failures.append(error_message)
-
-        if len(cache_files) == 0:
-            # note only raising an exception if all file retrievals failed
-            raise ErrorForAllRequests('%s%s' % ('URL Fetch failed for all'
-                'requests. Errors: ', ">>>".join(failures)))
-        return cache_files
-
-    def fetch_url(self, url, data=None):
-        from urllib2 import Request
-        try:
-            handlers = []
-            if self.proxyHandler:
-                handlers.append(self.proxyHandler)
-            handlers.extend([urllib2.HTTPCookieProcessor(self.cj),
-                             self.httpLogger])
-            opener = urllib2.build_opener(*handlers)
-            request = None
-            if data:
-                request = Request(url, data=data, headers=self.httpHeaders)
-            else:
-                request = Request(url, headers = self.httpHeaders)
-            response = opener.open(request)
-            if response.headers.get('Content-Encoding') == 'gzip':
-                return self.unzip(response.read()), response.headers
-            else:
-                return response.read(), response.headers
-
-        except urllib2.HTTPError, e:
-            err_msg = 'Could not get file for {0}. Error: {1}'.format(url, e)
-
-        except urllib2.URLError, e:
-            err_msg = 'Failed to reach server. Exception: ' + str(e)
-        except IOError, e:
-            err_msg = 'IOError Exception: ' + str(e)
-        except socket.error:
-            errno, errstr = sys.exc_info()[:2]
-            if errno == socket.timeout:
-                err_msg = 'Socket timeout getting ' + url + ':' + str(errstr)
-            else:
-                err_msg = 'Some socket error ' + url + ':' + str(errstr)
-        except Exception, e:
-            err_msg = 'Exception:' + str(e) + ', for url=' + url
-        raise ValueError('Exception during fetch_url, exception=%s' % err_msg)
-
-    def unzip(self, gzip_data):
-        import gzip
-        from StringIO import StringIO
-        log.info('unzipping data before storage')
-        compressed_stream = StringIO(gzip_data)
-        gzipper = gzip.GzipFile(fileobj=compressed_stream)
-        data = gzipper.read()
-        #log.debug('unzipped_data=%s' % data)
-        gzipper.close()
-        compressed_stream.close()
-        return data
-
-class Updater(object):
-
-    def __call__(self, data_items):
-        pass
-
-class StoreItems(Updater):
-
-    def __init__(self):
-        Updater.__init__(self)
-        self.data_items = []
-
-    def __call__(self, data_items):
-        self.data_items.extend(data_items)
-
-class Parser(object):
-    """
-    Parser Interface. Subclass to define the term:`Parse` step.
-    """
-    def __init__(self):
-        self.pull_start_date = None
-        self.pull_end_date = None
-        self.name = 'Parser'
-
-    def parse(self, file_path):
-        """
-        Parse the contents of the file located at ``file_path``.
-        """
-        return file_path
-
-    def get_logger(self):
-        return logging.getLogger(self.name)
-
-"""
-Use :class:`SkipParser` to bypass the Parser step.
-"""
-SkipParser = Parser
-
-class Feed(object):
-    '''
-    A Feed is run based on a date range via the 'go' method. It achieves this
-    by using a configured Protocol helper to download files specified by a
-    Criteria. The downloaded files are stored to a local cache and then passed
-    to a configured Parser to produce a series of dicts. Each distinct fileset
-    should have it's own Feed which represents a unit of work for the activites
-    described above.
-    '''
-
-    def __init__(self, name, protocol, parser, **kwargs):
-        '''
-        Feed ctor.
-        @param protocol: protocol used to fetch files
-        @param parser: parser used to parse files
-        @param kwargs: Optional kwargs keys are:
-            'commence_date' - start date of series. file downloads will not try
-                and download any files before this date if specified.
-            'updater' - callable that takes a list of dicts as input
-            'expected_series_count' - Number of series expected in Feed
-        '''
-        self.name = name
-        self.protocol = protocol
-        self.parser = parser
-        # set defaults
-        self.commence_date = None
-        self.expected_series_count = None
-        self.relative_cache_path = self.name
-        # overwrite state with user supplied args
-        self.__dict__.update(kwargs)
-        if 'updater' not in self.__dict__:
-            self.updater = StoreItems()
-        self.protocol.criteria.cache_location = os.path.join(
-            self.protocol.criteria.cache_location, self.relative_cache_path, '')
-
-        self.updater.cache_location = self.get_cache_location()
-
-    def go(self, start, end):
-        '''
-        Main function to drive a Feeds download & store.
-        @param start: starting datetime.date of a feed
-        @param end: end datetime.date of a feed
-        '''
-        cache_files = self.__fetch(start, end)
-        if len(cache_files) == 0:
-            raise ErrorForAllRequests('fetch returned zero files')
-
-        return self.parse_files(cache_files)
-
-
-    def parse_files(self, files):
-        '''
-        Parse input files into data_item dicts and pass into updater.
-        @param files: local file list
-        '''
-        count = 0
-        for data_items in self.__parse(files):
-            count += len(data_items)
-            self.updater(data_items)
-        return count
-
-    def get_logger(self):
-        return logging.getLogger(self.name)
-
-    def get_cache_location(self):
-        return self.protocol.criteria.cache_location
-
-    def __fetch(self, start, end):
-        self.clear_cache()
-        if self.commence_date != None and \
-           start.timetuple() < self.commence_date.timetuple():
-            msg = ' '.join(('start date:', str(start), 'falls after this',
-                            'feeds commence date of', str(start),
-                            ', re-setting start date to commence date for download.'))
-            self.get_logger().warn(msg)
-            start = self.commence_date
-
-        self.parser.pull_start_date = start
-        self.parser.pull_end_date = end
-        return self.protocol.call(start, end)
-
-
-    def clear_cache(self):
-        self.__delete_file_tree(self.get_cache_location())
-
-    def __delete_file_tree(self, rootDir):
-        # Delete everything reachable from the directory named in 'top',
-        # assuming there are no symbolic links.
-        # CAUTION: This is dangerous! For example, if top == '/', it
-        # could delete all your disk files.
-        for root, dirs, files in os.walk(rootDir, topdown=False):
-            for name in files:
-                self.get_logger().debug(name + " File Removed!")
-                os.remove(os.path.join(root, name))
-            for name in dirs:
-                self.get_logger().debug(name + " Dir Removed!")
-                os.rmdir(os.path.join(root, name))
-
-    def __delete_files_from_dir(self, cache_loc):
-        regularFiles = [f for f in os.listdir(cache_loc) \
-                        if os.path.isfile(cache_loc + f)]
-        for f in regularFiles:
-            os.remove(cache_loc + f)
-            self.get_logger().debug(f + " Removed!")
-
-    def __parse(self, cache_files):
-        parser = self.parser
-        for file_path in cache_files:
-            self.get_logger().info('Parsing file %s' % file_path)
-            yield parser.parse(file_path)
-
-def build_feed(name, protocol, parser=None, **kwargs):
-    parser = parser or SkipParser()
-    return Feed(name, protocol, parser, **kwargs)
-
-def go(name, feeds, start, end=None, logLevel=logging.INFO):
-    """
-    The main entry point to run feeds.
-
-    'feeds' is a list of :class:`Feed` objects.
-    'start' is the datetime to fetch from
-    'end' is the datetime to fetch to
-    """
-    end = end or datetime.date.today()
-
-    logging.basicConfig(level=logLevel,
-        format='%(asctime)s %(levelname)s %(name)s %(message)s')
-
-    failures = []
-    stats = dict([(x.name, {}) for x in feeds])
-    site_series_count = 0
-    for feed_obj in feeds:
-        feed = feed_obj.name
-        try:
-            feed_obj.get_logger().setLevel(logLevel)
-            feed_obj.parser.get_logger().setLevel(logLevel)
-            count = feed_obj.go(start, end)
-            if count == 0:
-                msg = 'no data updates for feed=%s' % feed
-                # todo: would be nice to know how many cache_files were downloaded
-                log.warn(msg)
-                failures.append(msg)
-                stats[feed]['errors'] = msg
-            elif feed_obj.expected_series_count and \
-                count < feed_obj.expected_series_count:
-                msg = 'expected feed item count is %s but only processed '\
-                    '%s.' % (feed_obj.expected_series_count,
-                             count)
-                log.warn(msg)
-                stats[feed]['warning'] = msg
-            stats[feed]['count'] = count
-            stats[feed]['obj'] = feed_obj
-            site_series_count += count
-
-        except ErrorForAllRequests, fetch_ex:
-            _, t, v, tbinfo = compact_traceback()
-            msg = 'feed failure for {0}, errors are {1}. traceback is: ({2}:{3} {4})'.format(
-                feed, str(fetch_ex), t, v, tbinfo)
-            log.warn(msg)
-            failures.append(msg)
-            stats[feed]['errors'] = msg
-        except Exception, ex:
-            _, t, v, tbinfo = compact_traceback()
-            msg = 'feed failure for {0}, errors are {1}. traceback is: ({2}:{3} {4})'.format(
-                feed, str(ex), t, v, tbinfo)
-            log.warn(msg)
-            failures.append(msg)
-            stats[feed]['errors'] = msg
-
-    clean_stats = [(feed, feed_stats) for (feed, feed_stats) in\
-                   stats.iteritems() if 'errors' not in feed_stats]
-
-    def statistics_for(key):
-        return zip(*[(feed, feed_stats) for (feed, feed_stats) \
-                     in stats.iteritems() if key in feed_stats])
-
-    if len(clean_stats) > 0:
-        log.info('statistics=%s' % str(clean_stats))
-    if len(failures) > 0:
-        err_msg = 'Run site=%s failed for %s feeds %s. Errors: %s'
-        error_feeds = statistics_for('errors')
-        if len(feeds) == len(failures):
-            raise ErrorForAllFeeds(err_msg % (name, 'all',
-                str(error_feeds[0]), str(error_feeds[1])))
-        else:
-            raise ErrorForSomeFeeds(err_msg % (name, 'some',
-                str(error_feeds[0]), str(error_feeds[1])))
-
-    return stats
-
-
diff --git a/pull/setup.py b/pull/setup.py
deleted file mode 100644
index 6eac135..0000000
--- a/pull/setup.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-
-from setuptools import setup, find_packages
-
-here = os.path.abspath(os.path.dirname(__file__))
-README = open(os.path.join(here, 'README.txt')).read()
-
-requires = [
-    # Testing dependencies
-    'coverage',
-    'nose',
-    ]
-
-setup(name='pull',
-      version='0.1',
-      description='Web Scraper scaffolding library',
-      long_description=README,
-      classifiers=[
-        "Programming Language :: Python",
-        "Topic :: Internet :: WWW/HTTP",
-        ],
-      author='Rowan Shulver',
-      author_email='rowan.shulver@gmail.com',
-      url='',
-      keywords='pull web scraper',
-      packages=find_packages(),
-      include_package_data=True,
-      zip_safe=False,
-      install_requires = requires,
-      tests_require= requires,
-      test_suite="pull",
-
-      )
-
diff --git a/pull/tests/__init__.py b/pull/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pull/tests/test_pull.py b/pull/tests/test_pull.py
deleted file mode 100644
index 9d4b6a1..0000000
--- a/pull/tests/test_pull.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import unittest
-
-class TestFileListCriteria(unittest.TestCase):
-
-    def _make(self, cache_location=None):
-        return MyFiles(cache_location)
-
-    def test_ctor(self):
-        criteria = self._make()
-        self.assertEqual(criteria.cache_location,
-                         FileListCriteria.temp_dir,
-                         "cache_location should be set to FileListCriteria.temp_dir.")
-
-    def test_ctor_with_cache_loc(self):
-        criteria = self._make('/foo/bar')
-        self.assertEqual(criteria.cache_location,
-                         '/foo/bar',
-                         "cache_location should be set to my /foo/bar.")
-
-class TestFetch(unittest.TestCase):
-
-    def _make(self, criteria):
-        from pull import UrlProtocol
-        return UrlProtocol(criteria)
-
-    def test_url_fetch(self):
-        def build_who_files(cache_location, start, end):
-            url = "http://www.who.int/csr/don/archive/year/{0}/en/index.html"
-            files = []
-            for year in range(start.year, end.year+1):
-                files.append((url.format(year),
-                              cache_location+'{0}.html'.format(year)))
-            return files
-
-        criteria = MyFiles(builder=build_who_files)
-        protocol = self._make(criteria)
-        import datetime
-        start = datetime.date(2011,1,1)
-        end = datetime.date(2012,1,1)
-        downloads = protocol.call(start, end)
-        self.assertTrue(len(downloads)==2,
-                        "should have downloaded 2 year files")
-        self.assertEqual(downloads, [x[1] for x in criteria.build(start, end)],
-                         "protocol should have dowloaded files specified by criteria.")
-
-    def test_url_fetch_error(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x')]
-
-        criteria = MyFiles(builder=build_files)
-        protocol = self._make(criteria)
-        import datetime
-        from pull import ErrorForAllRequests
-        d = datetime.date.today()
-        self.assertRaises(ErrorForAllRequests, lambda: protocol.call(d, d))
-
-    def test_url_fetch_warn(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x'),
-                    ("http://www.who.int/csr/don/archive/year/2012/en/index.html",
-                     '/tmp/delme')]
-
-        criteria = MyFiles(builder=build_files)
-        protocol = self._make(criteria)
-        import datetime
-        d = datetime.date.today()
-        results = protocol.call(d, d)
-        self.assert_(len(results) == 1, 'should have dowloaded 1 file')
-
-class TestFeed(unittest.TestCase):
-
-    def test_build(self):
-        from pull import build_feed, SkipProtocol
-        def build_files():
-            return []
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        self.assert_(feed.name=='test', 'name not set')
-
-    def test_go(self):
-        from pull import build_feed, SkipProtocol, go, ErrorForAllFeeds
-        import datetime
-        def build_files(*args):
-            return ['x', 'y']
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        d = datetime.date.today()
-        results = go('test_feed', [feed], d)
-        self.assert_(results['test']['count']==2, 'should have 2 results')
-        self.assertEqual(results['test']['obj'].updater.data_items,
-                         ['x', 'y'], 'updater should have stored what was input')
-
-    def test_go_no_results(self):
-        from pull import build_feed, SkipProtocol, go, ErrorForAllFeeds
-        import datetime
-        def build_files(*args):
-            return []
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        d = datetime.date.today()
-
-        self.assertRaises(ErrorForAllFeeds,
-                          lambda: go('test_feed', [feed], d))
-
-    def test_go_fetch_error(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x')]
-
-        criteria = MyFiles(builder=build_files)
-        import datetime
-        from pull import UrlProtocol, build_feed, go, ErrorForAllFeeds
-        protocol = UrlProtocol(criteria)
-        feed = build_feed('test', protocol)
-        from pull import ErrorForAllRequests
-        d = datetime.date.today()
-        self.assertRaises(ErrorForAllFeeds, lambda: go('test_feed', [feed], d))
-
-    def test_go_some_feeds_failed(self):
-        def build_bad_files(*args):
-            return [('you_wont_find_this', 'x')]
-
-        def build_ok_files(*args):
-            return [
-                ("http://www.who.int/csr/don/archive/year/2012/en/index.html",
-                 '/tmp/delme')]
-
-        import datetime
-        from pull import UrlProtocol, build_feed, go, ErrorForSomeFeeds
-        badfeed = build_feed('badfeed',
-                             UrlProtocol(MyFiles(builder=build_bad_files)))
-        okfeed = build_feed('okfeed',
-                            UrlProtocol(MyFiles(builder=build_ok_files)))
-
-        d = datetime.date.today()
-        self.assertRaises(ErrorForSomeFeeds, lambda: go('test_feed',
-                          [badfeed, okfeed], d))
-
-    def test_go_with_parse(self):
-        from pull import build_feed, SkipProtocol, go
-        import datetime
-        def build_files(*args):
-            return ['x', 'y']
-
-        from pull import Parser
-        class MyParser(Parser):
-            def parse(self, file_path):
-                return ['x1', 'y1']
-
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria), parser=MyParser())
-        d = datetime.date.today()
-        results = go('test_feed', [feed], d)
-        self.assert_(results['test']['count']==2, 'should have 2 results')
-        self.assertEqual(results['test']['obj'].updater.data_items,
-                         ['x1', 'y1'], 'updater should have stored what was input')
-
-from pull import FileListCriteria
-class MyFiles(FileListCriteria):
-    def __init__(self, cache_location=None, builder=None):
-        FileListCriteria.__init__(self, cache_location=cache_location)
-        self.builder = builder
-
-    def build(self, start, end):
-        if self.builder:
-            return self.builder(self.cache_location, start, end)
-