diff --git a/setup.py b/setup.py index 8e908c4..959cd11 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def download_jars(datapath, version=boilerpipe_version): }, install_requires=[ 'JPype1', - 'charade', + 'chardet', ], author='Misja Hoebe', author_email='misja.hoebe@gmail.com', diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index 10b6de8..bcbe143 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -4,7 +4,7 @@ except ImportError: from urllib2 import Request, urlopen import socket -import charade +import chardet import threading socket.setdefaulttimeout(15) @@ -40,7 +40,7 @@ def __init__(self, extractor='DefaultExtractor', **kwargs): self.data = connection.read() encoding = connection.headers['content-type'].lower().split('charset=')[-1] if encoding.lower() == 'text/html': - encoding = charade.detect(self.data)['encoding'] + encoding = chardet.detect(self.data)['encoding'] try: self.data = unicode(self.data, encoding) except NameError: @@ -49,10 +49,10 @@ def __init__(self, extractor='DefaultExtractor', **kwargs): self.data = kwargs['html'] try: if not isinstance(self.data, unicode): - self.data = unicode(self.data, charade.detect(self.data)['encoding']) + self.data = unicode(self.data, chardet.detect(self.data)['encoding']) except NameError: if not isinstance(self.data, str): - self.data = self.data.decode(charade.detect(self.data)['encoding']) + self.data = self.data.decode(chardet.detect(self.data)['encoding']) else: raise Exception('No text or url provided')