Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added new files and started splitting the code up a little
- Loading branch information
Showing
3 changed files
with
144 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import os, sys | ||
|
||
from scraper import Scraper | ||
|
||
import lxml.html | ||
import lxml.etree | ||
import urllib2 | ||
import urlparse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os, sys | ||
from optparse import OptionParser | ||
from ConfigParser import ConfigParser | ||
import urllib2 | ||
import urlparse | ||
|
||
|
||
class Scraper(object):
    """Command-line driven scraper base.

    Constructing an instance parses the process's command-line options
    (``-c/--config`` and ``-v/--verbose``) and then loads the configuration
    file. The parsed options, plus the resolved ``output_folder``, are kept
    on ``self.settings``.
    """

    def __init__(self, *args, **kwargs):
        """Parse command-line options and load the configuration file.

        ``*args`` and ``**kwargs`` are accepted but not used here.
        Exits the process (via ``load_config``) when no usable config file
        was given.
        """
        parser = OptionParser()
        parser.add_option("-c", "--config", dest="config",
                          help="Path to the configuration file",
                          metavar="FILE")
        parser.add_option("-v", "--verbose",
                          action="store_true", dest="verbose", default=False,
                          help="Write verbose output")
        # Positional leftovers are not used; don't clobber the *args parameter.
        self.settings, _unused_positional = parser.parse_args()
        self.load_config()

    def load_config(self):
        """Read the config file and store the output folder on the settings.

        Reads ``pdf_output`` from the ``scraper_settings`` section, resolves
        it relative to the directory containing this module, and stores the
        absolute path as ``self.settings.output_folder``.

        Prints a message and exits with status 1 when ``self.settings.config``
        is empty or does not name an existing path.
        """
        config_path = self.settings.config
        if (not config_path) or (not os.path.exists(config_path)):
            print("""
Can't run unless we have a config file
Please specify the path to the file with the -c option\n""")

            sys.exit(1)

        config = ConfigParser()
        # Close the file handle deterministically instead of leaking it.
        with open(config_path) as config_file:
            config.readfp(config_file)

        output_folder = config.get('scraper_settings', 'pdf_output')
        # A relative pdf_output is anchored at this module's directory,
        # not at the current working directory.
        output_folder = os.path.join(os.path.dirname(__file__), output_folder)
        output_folder = os.path.abspath(output_folder)
        if self.settings.verbose:
            print('Will save output files to %s' % (output_folder,))
        self.settings.output_folder = output_folder

    def get_content(self, url):
        """Fetch ``url`` and return the full response body.

        Any exception raised by ``urllib2.urlopen`` or by reading the
        response propagates to the caller.
        """
        response = urllib2.urlopen(url)
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the underlying socket even if read() fails.
        try:
            return response.read()
        finally:
            response.close()
|