Skip to content

Commit

Permalink
Add optional mailman.protocol configuration option, which permits other
protocols (like https) to be used instead of the default http when
scraping a mailman site.
Browse files Browse the repository at this point in the history

Fixes: #1
  • Loading branch information
dannyob committed Jun 26, 2011
1 parent 107fe30 commit 8d86415
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
6 changes: 5 additions & 1 deletion MailmanArchiveScraper-example.cfg
Expand Up @@ -23,6 +23,10 @@
# it's the 'lists.example.com' part.
domain =

# protocol allows you to specify a way of getting the pages that
# isn't http -- if you want http (the usual), leave the next line
# commented out:
# protocol = https

# Name of your mailing list.
# This can be found in the URL of the list info page.
Expand Down Expand Up @@ -131,4 +135,4 @@ hours_to_go_back = 6
# Probably set it to True if you're running on the command line.
# Set to False if running via cron - it'll print something only if something goes wrong.
# 1 or 0
verbose = 1
verbose = 1
9 changes: 5 additions & 4 deletions MailmanArchiveScraper.py
Expand Up @@ -49,9 +49,9 @@ def __init__(self):

# Set the URL for all the archive's pages.
if self.public_list:
self.list_url = 'http://' + self.domain + '/pipermail/' + self.list_name
self.list_url = self.protocol + '://' + self.domain + '/pipermail/' + self.list_name
else:
self.list_url = 'http://' + self.domain + '/mailman/private/' + self.list_name
self.list_url = self.protocol + '://' + self.domain + '/mailman/private/' + self.list_name

# Make the directory in which we'll save all the files on the local machine.
if not os.path.exists(self.publish_dir):
Expand All @@ -68,7 +68,7 @@ def __init__(self):
def loadConfig(self):
"Loads configuration from the MailmanArchiveScraper.cfg file"
config_file = sys.path[0]+'/MailmanArchiveScraper.cfg'
config = ConfigParser.SafeConfigParser()
config = ConfigParser.SafeConfigParser({'protocol': 'http'})

try:
config.readfp(open(config_file))
Expand All @@ -79,6 +79,7 @@ def loadConfig(self):

self.password = config.get('Mailman', 'password')
self.domain = config.get('Mailman', 'domain')
self.protocol = config.get('Mailman', 'protocol')

self.list_name = config.get('Mailman', 'list_name')

Expand Down Expand Up @@ -151,7 +152,7 @@ def prepareRegExps(self):
self.match_list_url = re.compile(r''+self.list_url, re.IGNORECASE)

# Replace the list info url with our custom one from the config
self.match_list_info_url = re.compile(r'http://' + self.domain + '/mailman/listinfo/' + self.list_name, re.IGNORECASE)
self.match_list_info_url = re.compile(self.protocol + '://' + self.domain + '/mailman/listinfo/' + self.list_name, re.IGNORECASE)

# Matches lines that begin with </I>&gt;<i>
# With the number of '&gt;' depending on the level of self.strip_quotes.
Expand Down

0 comments on commit 8d86415

Please sign in to comment.