Skip to content

Commit

Permalink
Added support for plain text sources
Browse files Browse the repository at this point in the history
  • Loading branch information
okal committed Jan 7, 2012
1 parent a496b1c commit 57f11ad
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from json.encoder import JSONEncoder

app = Flask(__name__)
#app.debug = True
app.debug = True
try:
from bundle_config import config
r = redis.Redis(
Expand Down
18 changes: 13 additions & 5 deletions zipfy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,19 @@
class Corpus:
def __init__(self, url):
h = HTMLParser()
self.page = BeautifulSoup(get(url).content, convertEntities=True)
self.body = self.page.body
text = ' '.join(self.body.findAll(text=True)).strip()
self.text = ' '.join(h.unescape(text).split()).lower()
self.words = re.sub('[^A-Za-z\'\s\-]+','', self.text).__str__()
site = get(url)

if 'html' in site.headers['content-type']:
self.page = BeautifulSoup(site.content, convertEntities=True)
self.body = self.page.body
text = ' '.join(self.body.findAll(text=True)).strip()
text = ' '.join(h.unescape(text).split()).lower()
self.words = re.sub('[^A-Za-z\'\s\-]+','', text).__str__()
elif 'plain' in site.headers['content-type']:
text = site.content.strip()
text = ' '.join(text.split()).lower()
self.words = re.sub('[^A-Za-z\'\s\-]+','', text).__str__()

self.word_list = self.words.split()
self.word_set = set(self.word_list)
self.freq_list = []
Expand Down

0 comments on commit 57f11ad

Please sign in to comment.