Skip to content

Commit

Permalink
Python 3 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
gutfeeling committed Oct 9, 2016
1 parent d587647 commit ad07cd3
Showing 1 changed file with 22 additions and 12 deletions.
34 changes: 22 additions & 12 deletions src/boilerpipe/extract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import jpype
import urllib2
try:
from urllib.request import Request, urlopen
except ImportError:
from urllib2 import Request, urlopen
import socket
import charade
import threading
Expand Down Expand Up @@ -29,20 +32,27 @@ class Extractor(object):
source = None
data = None
headers = {'User-Agent': 'Mozilla/5.0'}

def __init__(self, extractor='DefaultExtractor', **kwargs):
if kwargs.get('url'):
request = urllib2.Request(kwargs['url'], headers=self.headers)
connection = urllib2.urlopen(request)
request = Request(kwargs['url'], headers=self.headers)
connection = urlopen(request)
self.data = connection.read()
encoding = connection.headers['content-type'].lower().split('charset=')[-1]
if encoding.lower() == 'text/html':
encoding = charade.detect(self.data)['encoding']
self.data = unicode(self.data, encoding)
try:
self.data = unicode(self.data, encoding)
except NameError:
self.data = self.data.decode(encoding)
elif kwargs.get('html'):
self.data = kwargs['html']
if not isinstance(self.data, unicode):
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
try:
if not isinstance(self.data, unicode):
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except NameError:
if not isinstance(self.data, str):
self.data = self.data.decode(charade.detect(self.data)['encoding'])
else:
raise Exception('No text or url provided')

Expand All @@ -52,23 +62,23 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
if jpype.isThreadAttachedToJVM() == False:
jpype.attachThreadToJVM()
lock.acquire()

self.extractor = jpype.JClass(
"de.l3s.boilerpipe.extractors."+extractor).INSTANCE
finally:
lock.release()

reader = StringReader(self.data)
self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
self.extractor.process(self.source)

def getText(self):
    """Return the result of ``getContent()`` on the document in ``self.source``."""
    document = self.source
    return document.getContent()

def getHTML(self):
    """Return what an extracting ``HTMLHighlighter`` produces for this instance.

    A new highlighter is obtained from
    ``HTMLHighlighter.newExtractingInstance()`` on every call; its
    ``process`` method is given ``self.source`` and ``self.data``.
    """
    return HTMLHighlighter.newExtractingInstance().process(
        self.source, self.data)

def getImages(self):
extractor = jpype.JClass(
"de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
Expand Down

0 comments on commit ad07cd3

Please sign in to comment.