Skip to content

Commit

Permalink
Two fixes:
Browse files Browse the repository at this point in the history
* Use the encoded word for searches in `<audio>` tags as well as simple links.
* Ignore the case in the URLs. Doing it case-sensitively was kind of on purpose, but I guess even German words that differ only in capitalization sound the same.
  • Loading branch information
ospalh committed Jun 3, 2013
1 parent 814aaef commit 7344fe6
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions downloadaudio/downloaders/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ def __init__(self):
# description pages. Mediawiki builds 256 (0x100) sub-folders
# in the style <hex_digit_1>/<hex_digit_1><hex_digit_2>. Look
# for that pattern.
self.word_ogg_re = r'/([a-f0-9])/\1[a-f0-9]/[^/]*\b{word}\b[^/]*\.ogg$'
self.word_ogg_re = \
ur'/([a-f0-9])/\1[a-f0-9]/[^/]*\b{word}\b[^/]*\.ogg$'
# This seems to work to extract the url from a <button> tag's
# onclick attribute.
self.button_onclick_re = '"videoUrl":"([^"]+)"'
Expand Down Expand Up @@ -72,8 +73,8 @@ def download_files(self, word, base, ruby, split):
continue
# We look for links to ogg files (and not the description
# pages) that contain our word.
if re.search(self.word_ogg_re.format(
word=re.escape(u_word)), href):
if re.search(self.word_ogg_re.format(word=re.escape(u_word)), href,
flags=re.IGNORECASE):
ogg_url_list.append(href)
# Next, look for source and src. Seen those inside audio tags.
# I'm not sure if this is any use, but i guess it does no harm.
Expand All @@ -86,7 +87,8 @@ def download_files(self, word, base, ruby, split):
continue
# We might have other source tags, for whatever. Use the
# same re as above. Should work out fine.
if re.search(self.word_ogg_re.format(word=re.escape(word)), src):
if re.search(self.word_ogg_re.format(word=re.escape(u_word)), src,
flags=re.IGNORECASE):
ogg_url_list.append(src)
# At least from fr.wiktionary.org i got a <button>.
button_list = word_soup.findAll('button')
Expand Down

1 comment on commit 7344fe6

@chdh
Copy link

@chdh chdh commented on 7344fe6 Jun 3, 2013

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the fixes. There is a third use of "word" in the re.search() for the buttons.

Please sign in to comment.