Dropped clean_html and clean_url; use BeautifulSoup instead

nltk · Aug 22, 2013 · 39a303e · iamkamleshrangi · Feb 5, 2019
1 parent e86e83b
commit 39a303e
Showing 1 changed file with 2 additions and 22 deletions.
diff --git a/nltk/util.py b/nltk/util.py
@@ -331,30 +331,10 @@ def invert_graph(graph):
 ##########################################################################
 
 def clean_html(html):
-    """
-    Remove HTML markup from the given string.
-
-    :param html: the HTML string to be cleaned
-    :type html: str
-    :rtype: str
-    """
-
-    # First we remove inline JavaScript/CSS:
-    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
-    # Then we remove html comments. This has to be done before removing regular
-    # tags since comments can contain '>' characters.
-    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
-    # Next we can remove the remaining tags:
-    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
-    # Finally, we deal with whitespace
-    cleaned = re.sub(r"&nbsp;", " ", cleaned)
-    cleaned = re.sub(r"  ", " ", cleaned)
-    cleaned = re.sub(r"  ", " ", cleaned)
-    return cleaned.strip()
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
 
 def clean_url(url):
-    html = compat.urlopen(url).read()
-    return clean_html(html)
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
 
 ##########################################################################
 # FLATTEN LISTS