Skip to content

Commit

Permalink
Dropped clean_html and clean_url; use BeautifulSoup instead
Browse files Browse the repository at this point in the history
  • Loading branch information
stevenbird committed Aug 22, 2013
1 parent e86e83b commit 39a303e
Showing 1 changed file with 2 additions and 22 deletions.
24 changes: 2 additions & 22 deletions nltk/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,30 +331,10 @@ def invert_graph(graph):
##########################################################################

def clean_html(html):
# NOTE(review): this span reads like a rendered diff whose +/- markers and
# indentation were stripped. The commit header reports 2 additions and 22
# deletions, so the regex-based body below is presumably the removed
# implementation and the trailing `raise` its replacement -- confirm against
# the repository before treating this as one function body.
"""
Remove HTML markup from the given string.
:param html: the HTML string to be cleaned
:type html: str
:rtype: str
"""

# First we remove inline JavaScript/CSS:
# (?i) makes the tag names case-insensitive and (?s) lets `.` span newlines;
# the \1 backreference requires the closing tag to name the same element
# (script or style) that was opened. Leading/trailing whitespace of the
# input is stripped before matching.
cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
# Then we remove html comments. This has to be done before removing regular
# tags since comments can contain '>' characters.
# The optional [\n] also swallows a single newline directly after the comment.
cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
# Next we can remove the remaining tags:
# Each tag is replaced by one space (not the empty string), so text on either
# side of a tag does not get fused together.
cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
# Finally, we deal with whitespace
# The literal "&nbsp;" entity text is turned into a plain space.
cleaned = re.sub(r"&nbsp;", " ", cleaned)
# NOTE(review): as rendered here, the pattern and the replacement in the next
# two calls are both a single space, which makes them no-ops. Presumably the
# original pattern matched a longer whitespace run that was collapsed when
# this page was extracted -- TODO confirm against the repository.
cleaned = re.sub(r" ", " ", cleaned)
cleaned = re.sub(r" ", " ", cleaned)
return cleaned.strip()
# NOTE(review): taken literally, this statement follows an unconditional
# `return` and can never run; it is presumably the line the commit added in
# place of everything above it.
raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")

def clean_url(url):
# NOTE(review): this span reads like a rendered diff whose +/- markers and
# indentation were stripped. The commit header reports 2 additions and 22
# deletions, so the two statements before the `raise` are presumably the
# removed body and the `raise` its replacement -- confirm against the
# repository before treating this as one function body.
# Read the full response for `url` through compat.urlopen.
html = compat.urlopen(url).read()
# Hand the fetched content to clean_html and return its result.
return clean_html(html)
# NOTE(review): taken literally, this statement follows an unconditional
# `return` and can never run; it is presumably the line the commit added.
raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")

##########################################################################
# FLATTEN LISTS
Expand Down

1 comment on commit 39a303e

@iamkamleshrangi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nicely Done (Y)

Please sign in to comment.