From 0a2b98f212b9950fd211a937b40e08c1e07c9d80 Mon Sep 17 00:00:00 2001
From: erowan
Date: Sun, 5 Feb 2012 14:25:49 +0000
Subject: [PATCH] delete files in old layout

---
 pull/examples/__init__.py |   0
 pull/examples/health.py   | 146 -------------
 pull/pull/__init__.py     | 418 --------------------------------------
 pull/setup.py             |  34 ----
 pull/tests/__init__.py    |   0
 pull/tests/test_pull.py   | 166 ---------------
 6 files changed, 764 deletions(-)
 delete mode 100644 pull/examples/__init__.py
 delete mode 100644 pull/examples/health.py
 delete mode 100644 pull/pull/__init__.py
 delete mode 100644 pull/setup.py
 delete mode 100644 pull/tests/__init__.py
 delete mode 100644 pull/tests/test_pull.py

diff --git a/pull/examples/__init__.py b/pull/examples/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pull/examples/health.py b/pull/examples/health.py
deleted file mode 100644
index d321ab5..0000000
--- a/pull/examples/health.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from datetime import (
-    date,
-    datetime
-    )
-import lxml.html
-import re
-import urlparse
-import os
-from pull.site import (
-    FileListCriteria,
-    Parser,
-    build_feed,
-    UrlProtocol,
-    go
-    )
-
-
-# adapted from https://scraperwiki.com/scrapers/cdc_foodborne_outbreaks/edit/
-class FoodParser(Parser):
-
-    def cc(self, c,s):
-        return len([x for x in s if x == c])
-
-    def parse(self, file_path):
-        with open(file_path) as f:
-            html = f.read()
-        page = lxml.html.fromstring( html )
-        lis = page.cssselect('.main-inner ul li')
-        l = []
-        for li in lis:
-            tc = li.text_content()
-            if self.cc('-',tc) != 2:
-                source,_,pathogen = tc.rpartition('-')
-                if not source.strip():
-                    if pathogen.find('-') > 0:
-                        source,_,pathogen = pathogen.rpartition('-')
-                    else:
-                        continue
-                href = urlparse.urljoin(FoodFile.url,
-                    li.cssselect('a')[0].attrib.get('href') )
-                l.append({'source':source, 'pathogen': pathogen, 'link': href})
-        return l
-
-
-class FoodFile(FileListCriteria):
-
-    url = 'http://www.cdc.gov/outbreaknet/outbreaks.html'
-
-    def __init__(self):
-        FileListCriteria.__init__(self)
-
-    def build(self, start, end):
-        return [(FoodFile.url,
-                 self.cache_location+'outbreaks.html')]
-
-class WhoFiles(FileListCriteria):
-
-    url = "http://www.who.int/csr/don/archive/year/{0}/en/index.html"
-
-    def __init__(self):
-        FileListCriteria.__init__(self)
-
-    def build(self, start, end):
-        files = []
-        for year in range(start.year, end.year+1):
-            files.append((self.url.format(year),
-                          self.cache_location+'{0}.html'.format(year)))
-        return files
-
-# adapted from https://scraperwiki.com/scrapers/who_outbreaks/edit/
-class WhoParser(Parser):
-
-    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
-              'August', 'September', 'October', 'November', 'December']
-
-    def parse_date(self, datestr):
-        m = re.match('^(\d+) (\w+) (\d+)', datestr).groups(0)
-        return int(m[2]), datetime(year=int(m[2]),
-            month=WhoParser.months.index(m[1]) + 1, day=int(m[0]))
-
-    def parse_item(self, s):
-        m = re.match('.*-(.*) in (.*) -.*', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        m = re.match('.*-(.*) in (.*) \(.*', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        m = re.match('.*-(.*) in (.*)', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        # Changed in 2004
-        m = re.match('(.*) [Ii]n (.*)', s)
-        if m:
-            return m.groups(0)[0], m.groups(0)[1]
-        print '**', 'Failed to parse', s
-        return s, ''
-
-
-    def parse(self, file_path):
-        with open(file_path) as f:
-            html = f.read()
-        page = lxml.html.fromstring( html )
-        l = []
-        url = WhoFiles.url.format(os.path.basename(file_path).rpartition('.')[0])
-        lis = page.cssselect('.auto_archive li')
-        for li in lis:
-            href = li.cssselect('a')[0]
-            link = urlparse.urljoin( url, href.attrib.get('href') )
-            year,date = self.parse_date( href.text_content() )
-            info = li.cssselect('.link_info')[0].text_content()
-            disease, where = self.parse_item(info)
-            disease = disease.strip()
-            where = where.strip()
-
-            if '-' in where:
-                where = where[0:where.find('-')].strip()
-            if u'-' in where:
-                where = where[0:where.find(u'-')].strip()
-            for x in [',',';',':',u'\u2013' ]:
-                if x in where:
-                    where = where[0:where.find(x)].strip()
-                    disease = where[where.find(x)+1:].strip() + ' ' + disease
-
-            d = {
-                'year': year, 'date':date.isoformat(), 'link':link,
-                'disease': disease.title(), 'where':where.title()
-                }
-            l.append(d)
-        return l
-
-def main():
-    d = date.today()
-    food = build_feed('food', UrlProtocol(FoodFile()), FoodParser())
-    who = build_feed('who', UrlProtocol(WhoFiles()), WhoParser())
-    results = go('health', [who], start=date(2011,1,1), end=d)
-    for feed, feed_result in results.iteritems():
-        print 'feed={0}, count={1}'.format(feed, feed_result['count'])
-        for item in feed_result['obj'].updater.data_items:
-            print item
-
-if __name__ == "__main__":
-    main()
-
-
-
-
diff --git a/pull/pull/__init__.py b/pull/pull/__init__.py
deleted file mode 100644
index 76675ad..0000000
--- a/pull/pull/__init__.py
+++ /dev/null
@@ -1,418 +0,0 @@
-import datetime
-import logging
-import os
-import socket
-import sys
-import tempfile
-from asyncore import compact_traceback
-import urllib2
-import cookielib
-
-__version__ = 0.2
-log = logging.getLogger(__name__)
-
-def write_cache_file(data, file_path):
-    def get_parent_dir(f):
-        parentDir, _ = os.path.split(f)
-        if parentDir == "":
-            raise IOError("no parent directory found for file " + f)
-        return parentDir
-    # if file dir path does not exist create it
-    parent_dir = get_parent_dir(file_path)
-    if os.path.isfile(parent_dir):
-        raise AssertionError('Expected a cache directory here not a regular'
-                             ' file %s' % `parent_dir`)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    f = open(file_path, 'wb')
-    f.write(data)
-    f.close()
-
-class ModuleError(Exception):
-    pass
-
-class ErrorForAllFeeds(ModuleError):
-    pass
-
-class ErrorForSomeFeeds(ModuleError):
-    pass
-
-class FeedError(Exception):
-    pass
-
-class ErrorForAllRequests(FeedError):
-    pass
-
-class Criteria(object):
-
-    def build(self, start, end):
-        pass
-
-class FileListCriteria(Criteria):
-
-    temp_dir = tempfile.gettempdir() + '/cache/'
-
-    def __init__(self, cache_location=None):
-        self.cache_location = cache_location or FileListCriteria.temp_dir
-
-    def build(self, start, end):
-        pass
-
-class SearchCriteria(Criteria):
-    def build(self, start, end):
-        pass
-
-class Protocol(object):
-    """
-    The :term:`Fetch` step is defined by specifying a :class:`Criteria` like
-    :class:`FileListCriteria` to a :class:`Protocol` like :class:`UrlProtocol`.
-    """
-    def __init__(self, criteria=None):
-        self.criteria = criteria
-
-    def call(self, start, end):
-        return self.fetch(self.criteria.build(start, end))
-
-    def fetch(self, input_criteria):
-        """
-        Fetch the files based on an input criteria. Input is typically a list
-        of url & cache filename tuples returned from a FileListCriteria instance
-        but could be any input suitable for a protocol instance to handle.
-        """
-        return [input_criteria]
-
-"""
-Use :class:`SkipProtocol` to bypass the Protocol step.
-"""
-SkipProtocol = Protocol
-
-class UrlProtocol(Protocol):
-    """
-    :class:`UrlProtocol` is used for url GET fetching.
-    """
-
-    def __init__(self, criteria=None, timeout=None, httpDebugLevel=0,
-                 proxies=None):
-        Protocol.__init__(self, criteria=criteria)
-        self.cj = cookielib.CookieJar()
-        self.proxyHandler = urllib2.ProxyHandler(proxies) if proxies else None
-        self.httpLogger = urllib2.HTTPHandler(debuglevel=httpDebugLevel)
-        if timeout:
-            socket.setdefaulttimeout(timeout)
-        # best to make out we are a brower
-        self.userAgent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)'
-        self.httpHeaders = {
-            'User-agent' : self.userAgent,
-            'Proxy-Connection' : 'Keep-Alive',
-            'Accept-Encoding' : 'gzip, deflate',
-            'Pragma' : 'no-cache',
-            'Cache-Control' : 'no-cache',
-            'Connection' : 'Keep-Alive'
-        }
-
-    def fetch(self, files):
-        '''
-        fetch urls by writing url responses to cache_files
-        @param files: list of (url, cache_file) tuples
-        @return: cache_file list
-        '''
-        cache_files = []
-        failures = []
-        for url, f in files:
-            try:
-                log.info("Downloading: " + str(url))
-                response, headers = self.fetch_url(url)
-                log.debug('response headers=%s' % str(headers))
-                write_cache_file(response, f)
-                cache_files.append(f)
-            except Exception, e:
-                # todo: think about passing failures back by setting
-                # site.stats[feed]['errors'] because if len(cache_files) > 0
-                # this failure is only logged here and not accessable later on.
-                error_message = 'Exception: %s, for url: %s' % (str(e), url)
-                log.warn(error_message)
-                failures.append(error_message)
-
-        if len(cache_files) == 0:
-            # note only raising an exception if all file retrievals failed
-            raise ErrorForAllRequests('%s%s' % ('URL Fetch failed for all'
-                'requests. Errors: ', ">>>".join(failures)))
-        return cache_files
-
-    def fetch_url(self, url, data=None):
-        from urllib2 import Request
-        try:
-            handlers = []
-            if self.proxyHandler:
-                handlers.append(self.proxyHandler)
-            handlers.extend([urllib2.HTTPCookieProcessor(self.cj),
-                             self.httpLogger])
-            opener = urllib2.build_opener(*handlers)
-            request = None
-            if data:
-                request = Request(url, data=data, headers=self.httpHeaders)
-            else:
-                request = Request(url, headers = self.httpHeaders)
-            response = opener.open(request)
-            if response.headers.get('Content-Encoding') == 'gzip':
-                return self.unzip(response.read()), response.headers
-            else:
-                return response.read(), response.headers
-
-        except urllib2.HTTPError, e:
-            err_msg = 'Could not get file for {0}. Error: {1}'.format(url, e)
-
-        except urllib2.URLError, e:
-            err_msg = 'Failed to reach server. Exception: ' + str(e)
-        except IOError, e:
-            err_msg = 'IOError Exception: ' + str(e)
-        except socket.error:
-            errno, errstr = sys.exc_info()[:2]
-            if errno == socket.timeout:
-                err_msg = 'Socket timeout getting ' + url + ':' + str(errstr)
-            else:
-                err_msg = 'Some socket error ' + url + ':' + str(errstr)
-        except Exception, e:
-            err_msg = 'Exception:' + str(e) + ', for url=' + url
-        raise ValueError('Exception during fetch_url, exception=%s' % err_msg)
-
-    def unzip(self, gzip_data):
-        import gzip
-        from StringIO import StringIO
-        log.info('unzipping data before storage')
-        compressed_stream = StringIO(gzip_data)
-        gzipper = gzip.GzipFile(fileobj=compressed_stream)
-        data = gzipper.read()
-        #log.debug('unzipped_data=%s' % data)
-        gzipper.close()
-        compressed_stream.close()
-        return data
-
-class Updater(object):
-
-    def __call__(self, data_items):
-        pass
-
-class StoreItems(Updater):
-
-    def __init__(self):
-        Updater.__init__(self)
-        self.data_items = []
-
-    def __call__(self, data_items):
-        self.data_items.extend(data_items)
-
-class Parser(object):
-    """
-    Parser Interface. Subclass to define the term:`Parse` step.
-    """
-    def __init__(self):
-        self.pull_start_date = None
-        self.pull_end_date = None
-        self.name = 'Parser'
-
-    def parse(self, file_path):
-        """
-        Parse the contents of the file located at ``file_path``.
-        """
-        return file_path
-
-    def get_logger(self):
-        return logging.getLogger(self.name)
-
-"""
-Use :class:`SkipParser` to bypass the Parser step.
-"""
-SkipParser = Parser
-
-class Feed(object):
-    '''
-    A Feed is run based on a date range via the 'go' method. It achieves this
-    by using a configured Protocol helper to download files specified by a
-    Criteria. The downloaded files are stored to a local cache and then passed
-    to a configured Parser to produce a series of dicts. Each distinct fileset
-    should have it's own Feed which represents a unit of work for the activites
-    described above.
-    '''
-
-    def __init__(self, name, protocol, parser, **kwargs):
-        '''
-        Feed ctor.
-        @param protocol: protocol used to fetch files
-        @param parser: parser used to parse files
-        @param kwargs: Optional kwargs keys are:
-            'commence_date' - start date of series. file downloads will not try
-                and download any files before this date if specified.
-            'updater' - callable that takes a list of dicts as input
-            'expected_series_count' - Number of series expected in Feed
-        '''
-        self.name = name
-        self.protocol = protocol
-        self.parser = parser
-        # set defaults
-        self.commence_date = None
-        self.expected_series_count = None
-        self.relative_cache_path = self.name
-        # overwrite state with user supplied args
-        self.__dict__.update(kwargs)
-        if 'updater' not in self.__dict__:
-            self.updater = StoreItems()
-        self.protocol.criteria.cache_location = os.path.join(
-            self.protocol.criteria.cache_location, self.relative_cache_path, '')
-
-        self.updater.cache_location = self.get_cache_location()
-
-    def go(self, start, end):
-        '''
-        Main function to drive a Feeds download & store.
-        @param start: starting datetime.date of a feed
-        @param end: end datetime.date of a feed
-        '''
-        cache_files = self.__fetch(start, end)
-        if len(cache_files) == 0:
-            raise ErrorForAllRequests('fetch returned zero files')
-
-        return self.parse_files(cache_files)
-
-
-    def parse_files(self, files):
-        '''
-        Parse input files into data_item dicts and pass into updater.
-        @param files: local file list
-        '''
-        count = 0
-        for data_items in self.__parse(files):
-            count += len(data_items)
-            self.updater(data_items)
-        return count
-
-    def get_logger(self):
-        return logging.getLogger(self.name)
-
-    def get_cache_location(self):
-        return self.protocol.criteria.cache_location
-
-    def __fetch(self, start, end):
-        self.clear_cache()
-        if self.commence_date != None and \
-           start.timetuple() < self.commence_date.timetuple():
-            msg = ' '.join(('start date:', str(start), 'falls after this',
-                            'feeds commence date of', str(start),
-                            ', re-setting start date to commence date for download.'))
-            self.get_logger().warn(msg)
-            start = self.commence_date
-
-        self.parser.pull_start_date = start
-        self.parser.pull_end_date = end
-        return self.protocol.call(start, end)
-
-
-    def clear_cache(self):
-        self.__delete_file_tree(self.get_cache_location())
-
-    def __delete_file_tree(self, rootDir):
-        # Delete everything reachable from the directory named in 'top',
-        # assuming there are no symbolic links.
-        # CAUTION: This is dangerous! For example, if top == '/', it
-        # could delete all your disk files.
-        for root, dirs, files in os.walk(rootDir, topdown=False):
-            for name in files:
-                self.get_logger().debug(name + " File Removed!")
-                os.remove(os.path.join(root, name))
-            for name in dirs:
-                self.get_logger().debug(name + " Dir Removed!")
-                os.rmdir(os.path.join(root, name))
-
-    def __delete_files_from_dir(self, cache_loc):
-        regularFiles = [f for f in os.listdir(cache_loc) \
-                        if os.path.isfile(cache_loc + f)]
-        for f in regularFiles:
-            os.remove(cache_loc + f)
-            self.get_logger().debug(f + " Removed!")
-
-    def __parse(self, cache_files):
-        parser = self.parser
-        for file_path in cache_files:
-            self.get_logger().info('Parsing file %s' % file_path)
-            yield parser.parse(file_path)
-
-def build_feed(name, protocol, parser=None, **kwargs):
-    parser = parser or SkipParser()
-    return Feed(name, protocol, parser, **kwargs)
-
-def go(name, feeds, start, end=None, logLevel=logging.INFO):
-    """
-    The main entry point to run feeds.
-
-    'feeds' is a list of :class:`Feed` objects.
-    'start' is the datetime to fetch from
-    'end' is the datetime to fetch to
-    """
-    end = end or datetime.date.today()
-
-    logging.basicConfig(level=logLevel,
-        format='%(asctime)s %(levelname)s %(name)s %(message)s')
-
-    failures = []
-    stats = dict([(x.name, {}) for x in feeds])
-    site_series_count = 0
-    for feed_obj in feeds:
-        feed = feed_obj.name
-        try:
-            feed_obj.get_logger().setLevel(logLevel)
-            feed_obj.parser.get_logger().setLevel(logLevel)
-            count = feed_obj.go(start, end)
-            if count == 0:
-                msg = 'no data updates for feed=%s' % feed
-                # todo: would be nice to know how many cache_files were downloaded
-                log.warn(msg)
-                failures.append(msg)
-                stats[feed]['errors'] = msg
-            elif feed_obj.expected_series_count and \
-                count < feed_obj.expected_series_count:
-                msg = 'expected feed item count is %s but only processed '\
-                    '%s.' % (feed_obj.expected_series_count,
-                             count)
-                log.warn(msg)
-                stats[feed]['warning'] = msg
-            stats[feed]['count'] = count
-            stats[feed]['obj'] = feed_obj
-            site_series_count += count
-
-        except ErrorForAllRequests, fetch_ex:
-            _, t, v, tbinfo = compact_traceback()
-            msg = 'feed failure for {0}, errors are {1}. traceback is: ({2}:{3} {4})'.format(
-                feed, str(fetch_ex), t, v, tbinfo)
-            log.warn(msg)
-            failures.append(msg)
-            stats[feed]['errors'] = msg
-        except Exception, ex:
-            _, t, v, tbinfo = compact_traceback()
-            msg = 'feed failure for {0}, errors are {1}. traceback is: ({2}:{3} {4})'.format(
-                feed, str(ex), t, v, tbinfo)
-            log.warn(msg)
-            failures.append(msg)
-            stats[feed]['errors'] = msg
-
-    clean_stats = [(feed, feed_stats) for (feed, feed_stats) in\
-                   stats.iteritems() if 'errors' not in feed_stats]
-
-    def statistics_for(key):
-        return zip(*[(feed, feed_stats) for (feed, feed_stats) \
-                     in stats.iteritems() if key in feed_stats])
-
-    if len(clean_stats) > 0:
-        log.info('statistics=%s' % str(clean_stats))
-    if len(failures) > 0:
-        err_msg = 'Run site=%s failed for %s feeds %s. Errors: %s'
-        error_feeds = statistics_for('errors')
-        if len(feeds) == len(failures):
-            raise ErrorForAllFeeds(err_msg % (name, 'all',
-                str(error_feeds[0]), str(error_feeds[1])))
-        else:
-            raise ErrorForSomeFeeds(err_msg % (name, 'some',
-                str(error_feeds[0]), str(error_feeds[1])))
-
-    return stats
-
-
diff --git a/pull/setup.py b/pull/setup.py
deleted file mode 100644
index 6eac135..0000000
--- a/pull/setup.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-
-from setuptools import setup, find_packages
-
-here = os.path.abspath(os.path.dirname(__file__))
-README = open(os.path.join(here, 'README.txt')).read()
-
-requires = [
-    # Testing dependencies
-    'coverage',
-    'nose',
-    ]
-
-setup(name='pull',
-      version='0.1',
-      description='Web Scraper scaffolding library',
-      long_description=README,
-      classifiers=[
-        "Programming Language :: Python",
-        "Topic :: Internet :: WWW/HTTP",
-        ],
-      author='Rowan Shulver',
-      author_email='rowan.shulver@gmail.com',
-      url='',
-      keywords='pull web scraper',
-      packages=find_packages(),
-      include_package_data=True,
-      zip_safe=False,
-      install_requires = requires,
-      tests_require= requires,
-      test_suite="pull",
-
-      )
-
diff --git a/pull/tests/__init__.py b/pull/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pull/tests/test_pull.py b/pull/tests/test_pull.py
deleted file mode 100644
index 9d4b6a1..0000000
--- a/pull/tests/test_pull.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import unittest
-
-class TestFileListCriteria(unittest.TestCase):
-
-    def _make(self, cache_location=None):
-        return MyFiles(cache_location)
-
-    def test_ctor(self):
-        criteria = self._make()
-        self.assertEqual(criteria.cache_location,
-                         FileListCriteria.temp_dir,
-                         "cache_location should be set to FileListCriteria.temp_dir.")
-
-    def test_ctor_with_cache_loc(self):
-        criteria = self._make('/foo/bar')
-        self.assertEqual(criteria.cache_location,
-                         '/foo/bar',
-                         "cache_location should be set to my /foo/bar.")
-
-class TestFetch(unittest.TestCase):
-
-    def _make(self, criteria):
-        from pull import UrlProtocol
-        return UrlProtocol(criteria)
-
-    def test_url_fetch(self):
-        def build_who_files(cache_location, start, end):
-            url = "http://www.who.int/csr/don/archive/year/{0}/en/index.html"
-            files = []
-            for year in range(start.year, end.year+1):
-                files.append((url.format(year),
-                              cache_location+'{0}.html'.format(year)))
-            return files
-
-        criteria = MyFiles(builder=build_who_files)
-        protocol = self._make(criteria)
-        import datetime
-        start = datetime.date(2011,1,1)
-        end = datetime.date(2012,1,1)
-        downloads = protocol.call(start, end)
-        self.assertTrue(len(downloads)==2,
-                        "should have downloaded 2 year files")
-        self.assertEqual(downloads, [x[1] for x in criteria.build(start, end)],
-                         "protocol should have dowloaded files specified by criteria.")
-
-    def test_url_fetch_error(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x')]
-
-        criteria = MyFiles(builder=build_files)
-        protocol = self._make(criteria)
-        import datetime
-        from pull import ErrorForAllRequests
-        d = datetime.date.today()
-        self.assertRaises(ErrorForAllRequests, lambda: protocol.call(d, d))
-
-    def test_url_fetch_warn(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x'),
-                    ("http://www.who.int/csr/don/archive/year/2012/en/index.html",
-                     '/tmp/delme')]
-
-        criteria = MyFiles(builder=build_files)
-        protocol = self._make(criteria)
-        import datetime
-        d = datetime.date.today()
-        results = protocol.call(d, d)
-        self.assert_(len(results) == 1, 'should have dowloaded 1 file')
-
-class TestFeed(unittest.TestCase):
-
-    def test_build(self):
-        from pull import build_feed, SkipProtocol
-        def build_files():
-            return []
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        self.assert_(feed.name=='test', 'name not set')
-
-    def test_go(self):
-        from pull import build_feed, SkipProtocol, go, ErrorForAllFeeds
-        import datetime
-        def build_files(*args):
-            return ['x', 'y']
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        d = datetime.date.today()
-        results = go('test_feed', [feed], d)
-        self.assert_(results['test']['count']==2, 'should have 2 results')
-        self.assertEqual(results['test']['obj'].updater.data_items,
-                         ['x', 'y'], 'updater should have stored what was input')
-
-    def test_go_no_results(self):
-        from pull import build_feed, SkipProtocol, go, ErrorForAllFeeds
-        import datetime
-        def build_files(*args):
-            return []
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria))
-        d = datetime.date.today()
-
-        self.assertRaises(ErrorForAllFeeds,
-                          lambda: go('test_feed', [feed], d))
-
-    def test_go_fetch_error(self):
-        def build_files(cache_location, start, end):
-            return [('you_wont_find_this', 'x')]
-
-        criteria = MyFiles(builder=build_files)
-        import datetime
-        from pull import UrlProtocol, build_feed, go, ErrorForAllFeeds
-        protocol = UrlProtocol(criteria)
-        feed = build_feed('test', protocol)
-        from pull import ErrorForAllRequests
-        d = datetime.date.today()
-        self.assertRaises(ErrorForAllFeeds, lambda: go('test_feed', [feed], d))
-
-    def test_go_some_feeds_failed(self):
-        def build_bad_files(*args):
-            return [('you_wont_find_this', 'x')]
-
-        def build_ok_files(*args):
-            return [
-                ("http://www.who.int/csr/don/archive/year/2012/en/index.html",
-                 '/tmp/delme')]
-
-        import datetime
-        from pull import UrlProtocol, build_feed, go, ErrorForSomeFeeds
-        badfeed = build_feed('badfeed',
-                             UrlProtocol(MyFiles(builder=build_bad_files)))
-        okfeed = build_feed('okfeed',
-                            UrlProtocol(MyFiles(builder=build_ok_files)))
-
-        d = datetime.date.today()
-        self.assertRaises(ErrorForSomeFeeds, lambda: go('test_feed',
-                          [badfeed, okfeed], d))
-
-    def test_go_with_parse(self):
-        from pull import build_feed, SkipProtocol, go
-        import datetime
-        def build_files(*args):
-            return ['x', 'y']
-
-        from pull import Parser
-        class MyParser(Parser):
-            def parse(self, file_path):
-                return ['x1', 'y1']
-
-        criteria = MyFiles(builder=build_files)
-        feed = build_feed('test', SkipProtocol(criteria), parser=MyParser())
-        d = datetime.date.today()
-        results = go('test_feed', [feed], d)
-        self.assert_(results['test']['count']==2, 'should have 2 results')
-        self.assertEqual(results['test']['obj'].updater.data_items,
-                         ['x1', 'y1'], 'updater should have stored what was input')
-
-from pull import FileListCriteria
-class MyFiles(FileListCriteria):
-    def __init__(self, cache_location=None, builder=None):
-        FileListCriteria.__init__(self, cache_location=cache_location)
-        self.builder = builder
-
-    def build(self, start, end):
-        if self.builder:
-            return self.builder(self.cache_location, start, end)
-