Permalink
Browse files

Dropped clean_html and clean_url; use BeautifulSoup instead

1 parent e86e83b commit 39a303e5ddc4cdb1a0b00a3be426239b1c24c8bb @stevenbird stevenbird committed Aug 22, 2013
Showing 1 changed file with 2 additions and 22 deletions.
  1. +2 −22 nltk/util.py
View
@@ -331,30 +331,10 @@ def invert_graph(graph):
##########################################################################
def clean_html(html):
- """
- Remove HTML markup from the given string.
-
- :param html: the HTML string to be cleaned
- :type html: str
- :rtype: str
- """
-
- # First we remove inline JavaScript/CSS:
- cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
- # Then we remove html comments. This has to be done before removing regular
- # tags since comments can contain '>' characters.
- cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
- # Next we can remove the remaining tags:
- cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
- # Finally, we deal with whitespace
- cleaned = re.sub(r"&nbsp;", " ", cleaned)
- cleaned = re.sub(r"  ", " ", cleaned)
- cleaned = re.sub(r"  ", " ", cleaned)
- return cleaned.strip()
+ raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
def clean_url(url):
- html = compat.urlopen(url).read()
- return clean_html(html)
+ raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
##########################################################################
# FLATTEN LISTS

0 comments on commit 39a303e

Please sign in to comment.