/
parser.py
48 lines (35 loc) · 1.33 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Module to parse directory listings on a remote FTP server."""
from HTMLParser import HTMLParser
import re
import urllib
class DirectoryParser(HTMLParser):
"""Class to parse directory listings"""
def __init__(self, url):
HTMLParser.__init__(self)
self.entries = [ ]
self.active_url = None
req = urllib.urlopen(url)
self.feed(req.read())
def filter(self, regex):
pattern = re.compile(regex, re.IGNORECASE)
return [entry for entry in self.entries if pattern.match(entry)]
def handle_starttag(self, tag, attrs):
if not tag == 'a':
return
for attr in attrs:
if attr[0] == 'href':
self.active_url = attr[1].strip('/')
return
def handle_endtag(self, tag):
if tag == 'a':
self.active_url = None
def handle_data(self, data):
# Only process the data when we are in an active a tag and have an URL
if not self.active_url:
return
name = urllib.quote(data.strip('/'))
if self.active_url == name:
self.entries.append(self.active_url)