diff --git a/requirements.txt b/requirements.txt index 24576ae..4d17146 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ scraperwiki==0.3.11 python-dateutil==2.1 -BeautifulSoup==3.2.1 \ No newline at end of file +BeautifulSoup==3.2.1 +unidecode==0.04.16 diff --git a/scraper.py b/scraper.py index 00a0735..0e10faa 100644 --- a/scraper.py +++ b/scraper.py @@ -6,6 +6,7 @@ import scraperwiki from time import mktime from dateutil import parser +from unidecode import unidecode from BeautifulSoup import BeautifulSoup BASE_URL = 'http://tinyletter.com/realfuture/letters/' @@ -78,7 +79,7 @@ def load(url): def prep_data(T): data = [] - cnv = lambda x: x.decode('utf-8') if type(x) is str else x + cnv = lambda x: unidecode(x.decode('utf-8') if type(x) is str else x) for dt, ts in T: for t in ts: t.index = '{0}-{1}-{2}.{3}'.format(dt.year, dt.month, dt.day, t.number)