This repository has been archived by the owner on Aug 28, 2019. It is now read-only.

always remove troublesome html,body,noinclude etc
matpalm committed Aug 14, 2011
1 parent b4e274d commit ca9ac8c
Showing 1 changed file with 14 additions and 19 deletions.
articleParser.py (33 changes: 14 additions & 19 deletions)
@@ -52,25 +52,25 @@ def replace_nested(regex, text):
continue

text = xml.find('text').string
# sys.stderr.write("text1 "+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text1 "+text[0:800].encode('utf-8')+"\n")

# remove all (nested) { }s
text = replace_nested(re.compile('{[^{]*?}'), text)
# sys.stderr.write("text2 "+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text2 "+text[0:800].encode('utf-8')+"\n")

# remove all (nested) ( )s
# cant just remove all () since it removes () from links eg Letter_(Alphabet)
# text = replace_nested(re.compile('\([^\(]*?\)'), text)

# unescape all XML (this includes comments for the next section)
text = unescape(text, {"&apos;": "'", "&quot;": '"'})
# sys.stderr.write("text3 "+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text3 "+text[0:800].encode('utf-8')+"\n")

# remove italics and bold
text = re.sub(r"''.*?''", ' ', text)
# sys.stderr.write("text3c"+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text3c"+text[0:800].encode('utf-8')+"\n")
text = re.sub(r"'''.*?'''", ' ', text)
# sys.stderr.write("text3b"+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text3b"+text[0:2500].encode('utf-8')+"\n")

# remove all comments (never nested)
text = re.sub(r'<!--.*?-->', ' ', text)
@@ -97,30 +97,25 @@ def replace_nested(regex, text):
# and in this case we want to ignore both the Image:foo.jpg _and_ the [[link]] since it's nested in the image one
# we do this by converting [[blah]] to <link>blah</link> so we can do it with soup
text = re.sub('\[\[','<link>', re.sub('\]\]','</link>', text))
# sys.stderr.write("text5 "+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text5 "+text[0:800].encode('utf-8')+"\n")

# remove links in ( )s
# primarily this is to address the common case of
# foo (some other <link>bar</link>) and then the <link>rest</link>...
# where the first link in brackets (bar) is not a good choice
# this re is a bit funky, needs some more examples me thinks.. (a.eg and allah.eg have been interesting cases)
text = re.sub(r'\(([^\(\)]*?)<link>(.*?)</link>(.*?)\)', ' ', text)
# sys.stderr.write("textX "+text[0:800].encode('utf-8')+"\n")
sys.stderr.write("text6 "+text[0:800].encode('utf-8')+"\n")

# occasionally some wikipedia articles are wrapped in <html><body>, or have bizarre stray br's or something.
# non recursive findall fails on this
# ( there's probably a way to config this to be allowed but more hacktastic to just remove them in this case )
# ( perhaps even just trim away _all_ non link tags? down that path lies madness, really need to sort out the find... )
for tag in ['html','body','noinclude','br','onlyinclude']:
text = re.sub('<'+tag+'>',' ',text)

# parse for <link> (being sure to handle recursive case) and pick first one
links = LinkParser(text).findAll('link', recursive=False)
if not links:
# occasionally some wikipedia articles are wrapped in <html><body>, or have bizarre stray br's or something.
# non recursive findall fails on this
# ( there's probably a way to config this to be allowed but more hacktastic to just remove them in this case )
# ( perhaps even just trim away _all_ non link tags? down that path lies madness, really need to sort out the find... )
for tag in ['html','body','noinclude','br']:
text = re.sub('<'+tag+'>',' ',text)
links = LinkParser(text).findAll('link', recursive=False)
if not links:
sys.stderr.write("reporter:counter:parse,cant_find_any_links,1\n")
sys.stderr.write("ERROR _still_ can't find _any_ links for ["+title.encode('utf-8')+"]; try to trim html/body \n")
continue
# print "links", links

link = first_valid_link(links, title)
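
The body of replace_nested is not visible in this excerpt; only its signature appears in the hunk headers above. A minimal sketch of what such a helper plausibly does, assuming it simply re-applies the substitution until the text stops changing so that nested {{...}} templates collapse from the inside out (an assumption, not necessarily the repository's actual implementation):

import re

def replace_nested(regex, text):
    # hypothetical sketch: keep substituting until nothing changes, so
    # nested structures are stripped from the innermost level outwards
    while True:
        stripped = regex.sub(' ', text)
        if stripped == text:
            return text
        text = stripped

# the template removal from the first hunk, applied to a toy string
print(replace_nested(re.compile(r'{[^{]*?}'), "keep {{cite |a={x} }} this"))
# -> roughly "keep    this", with only whitespace left where the template was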
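
To make the [[...]] handling concrete, here is a small worked example of the two inline re.sub calls from the second hunk; the helper name wiki_to_link_tags is made up for illustration, and how LinkParser nests <link> tags is an assumption:

import re

def wiki_to_link_tags(text):
    # same two-step substitution as in the diff: turn [[...]] wiki links
    # into <link>...</link> pseudo-tags so a soup-style parser can walk them
    return re.sub(r'\[\[', '<link>', re.sub(r'\]\]', '</link>', text))

sample = "[[Image:Foo.jpg|thumb|a [[nested]] caption]] leads to [[Philosophy]]"
print(wiki_to_link_tags(sample))
# -> <link>Image:Foo.jpg|thumb|a <link>nested</link> caption</link> leads to <link>Philosophy</link>
# assuming LinkParser treats <link> as nestable, findAll('link', recursive=False)
# sees only the two top-level links, so the nested one inside the image caption
# is never considered; a stray <html><body> wrapper pushes every link one level
# down and makes recursive=False find nothing, which is the case this commit
# works around by always stripping those tags before parsing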
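
And a worked example of the parenthesised-link removal from the second hunk, run against a made-up sentence (the regex and its caveats are the author's; only the sample text here is invented):

import re

paren_link = re.compile(r'\(([^\(\)]*?)<link>(.*?)</link>(.*?)\)')

sample = "Algebra (from the <link>Arabic language</link> al-jabr) is part of <link>Mathematics</link>"
print(paren_link.sub(' ', sample))
# -> "Algebra   is part of <link>Mathematics</link>"
# the bracketed aside and its link are dropped, so the first surviving link
# is the one outside the parentheses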
