Merge edf9a0b into 033d92d
bergmannf committed Feb 13, 2018
2 parents 033d92d + edf9a0b commit b9a958f
Showing 17 changed files with 1,273 additions and 194 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
@@ -1,7 +1,9 @@
language: python
python:
- "2.6"
- "2.7"
- "3.4"
- "3.5"
- "3.6"
sudo: false
cache:
directories:
@@ -10,7 +12,8 @@ cache:
install:
# Get newer pip and wheel for binary caching support
- pip install --upgrade pip wheel
- if [ $TRAVIS_PYTHON_VERSION = 2.6 ] ; then pip install "Django<1.7" "pylint<1.4.0" "astroid<1.3" "mechanize<=0.2.5"; fi
# python-ldap has no Python 3 release yet, so comment it out of the requirements until one lands
- if [ $( echo "$TRAVIS_PYTHON_VERSION > 3.3" | bc) -eq 1 ] ; then sed -i 's/python-ldap/#python-ldap/' requirements.txt; fi
- pip install -r requirements-test.txt
# commands to run tests
script:
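The sed line above gates on the interpreter version by piping a comparison through bc, which evaluates the versions numerically rather than lexically. A rough Python equivalent of that install step (illustrative only, not part of the commit; assumes requirements.txt sits in the working directory):

import os

version = float(os.environ.get('TRAVIS_PYTHON_VERSION', '2.7'))
if version > 3.3:
    # Comment out python-ldap, mirroring the sed substitution above.
    with open('requirements.txt') as handle:
        contents = handle.read().replace('python-ldap', '#python-ldap')
    with open('requirements.txt', 'w') as handle:
        handle.write(contents)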
2 changes: 1 addition & 1 deletion pylint.rc
@@ -66,7 +66,7 @@ confidence=
# I0011 Warning locally suppressed using disable-msg
# I0012 Warning locally suppressed using disable-msg
# unsubscriptable-object and unsupported-assignment-operation are wrongly detected on mechanize.Browser
disable=C0111,W0142,I0011,I0012,wrong-import-order,unsupported-assignment-operation,unsubscriptable-object
disable=C0111,W0142,I0011,I0012,wrong-import-order,unsupported-assignment-operation,unsubscriptable-object,invalid-name,c-extension-no-member


[REPORTS]
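c-extension-no-member, newly added to the disable list above, typically fires on attribute access into C extensions such as pycurl, which the new grab transport pulls in. Where a project-wide disable is too broad, the same suppression can be scoped to a single line (a sketch, assuming pycurl is installed):

import pycurl

curl = pycurl.Curl()
# pycurl is a C extension, so pylint cannot introspect its members:
curl.setopt(pycurl.URL, 'https://example.com')  # pylint: disable=c-extension-no-member
curl.close()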
16 changes: 10 additions & 6 deletions requirements.txt
@@ -1,8 +1,12 @@
Django>=1.4
mechanize
python-dateutil
suds-jurko
beautifulsoup4
setuptools
pyxdg
Django>=1.4,<2.0
grab
lxml
PySocks
python-dateutil
python-ldap
pyxdg
future
setuptools
six
suds-jurko
104 changes: 49 additions & 55 deletions suseapi/browser.py
@@ -21,14 +21,19 @@
'''
Web browser wrapper for convenient scraping of web based services.
'''
import mechanize
import urllib
import urllib2
import httplib
import socket
import cookielib

DEFAULT_TIMEOUT = 5.0
# import mechanize
import grab
# pylint: disable=import-error
from six.moves.http_client import HTTPException
# pylint: disable=import-error
from six.moves.urllib.error import URLError
# pylint: disable=import-error
from six.moves.urllib.parse import urlencode

# The default timeout has to be an integer (pycurl's timeout option takes whole seconds).
DEFAULT_TIMEOUT = 50
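The six.moves imports above paper over the Python 2/3 standard-library renames (urllib2 and httplib became urllib.error and http.client), so the module needs only one spelling of each name. For example:

from six.moves.urllib.parse import urlencode

# Resolves to urllib.urlencode on Python 2 and urllib.parse.urlencode on Python 3.
print(urlencode({'product': 'SUSE', 'page': 1}))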


class WebScraperError(Exception):
@@ -45,79 +50,64 @@ def webscraper_safely(call, *args, **kwargs):
Wrapper to handle errors in HTTP requests.
'''
try:
return call(*args, **kwargs)
except urllib2.URLError as exc:
result = call(*args, **kwargs)
if result.code >= 400:
raise WebScraperError('Status code error: {0!s}'.format(
result.code
), result)
return result
except grab.error.GrabError as exc:
raise WebScraperError('Grab error occurred: {0!s}'.format(exc), exc)
except URLError as exc:
for attrname in ('reason', 'msg', 'message'):
value = getattr(exc, attrname, '')
if value:
raise WebScraperError('URL error: {0!s}'.format(value), exc)
raise WebScraperError('Unknown URL error: {0!s}'.format(exc), exc)
except httplib.HTTPException as exc:
except HTTPException as exc:
raise WebScraperError(
'HTTP error {0!s}: {1!s}'.format(type(exc).__name__, exc),
exc
)
except socket.error as exc:
raise WebScraperError('Socket error: {0!s}'.format(exc), exc)
except IOError as exc:
# On Python 3, IOError and socket.error are both aliases of OSError, so pylint flags this as a duplicate except.
except IOError as exc: # pylint: disable=duplicate-except
raise WebScraperError('IO error: {0!s}'.format(exc), exc)
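A minimal usage sketch for the wrapper above (the URL is a placeholder; any of the handled failure modes surfaces as WebScraperError):

import grab
from suseapi.browser import WebScraperError, webscraper_safely

browser = grab.Grab(timeout=50)
try:
    doc = webscraper_safely(browser.go, 'https://example.com/')
    print(doc.code)  # grab returns a Document carrying the HTTP status code
except WebScraperError as exc:
    print('request failed: {0!s}'.format(exc))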


class TimeoutRequest(mechanize.Request):
'''
Request class with defined timeout.
'''
def __init__(self, url, data=None, headers=None,
origin_req_host=None, unverifiable=False, visit=None,
timeout=DEFAULT_TIMEOUT):
if headers is None:
headers = {}
mechanize.Request.__init__(
self, url, data, headers, origin_req_host,
unverifiable, visit, timeout
)
self.timeout = DEFAULT_TIMEOUT


class WebScraper(object):
'''
Web based scraper using mechanize.
'''
def __init__(self, user, password, base, useragent=None):
def __init__(self, user, password, base, useragent=None,
transport='pycurl'):
self.base = base
self.user = user
self.password = password

# Cookie storage
self.cookiejar = cookielib.CookieJar()
self.cookie_set = False

# Browser instance
self.browser = mechanize.Browser(
request_class=TimeoutRequest
self.browser = grab.Grab(
timeout=DEFAULT_TIMEOUT
)

# Set cookies
self.browser.set_cookiejar(self.cookiejar)

# Log information about HTTP redirects and Refreshes.
# self.browser.set_debug_redirects(True)

# Log HTTP response bodies (ie. the HTML, most of the time).
# self.browser.set_debug_responses(True)

# Print HTTP headers.
# self.browser.set_debug_http(True)

# Ignore robots.txt
self.browser.set_handle_robots(False)
self.browser.setup_transport(transport)
if transport == "urllib3":
import urllib3
import certifi
self.browser.transport.pool = urllib3.PoolManager(
cert_reqs='CERT_REQUIRED',
ca_certs=certifi.where()
)
# Grab automatically handles cookies.

# Are we anonymous?
self.anonymous = (user == '')

# Identify ourselves
if useragent is not None:
self.browser.addheaders += [('User-agent', useragent)]
self.browser.setup(headers={'User-agent': useragent})
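A construction sketch for the new transport switch (host and credentials are placeholders; the urllib3 branch assumes the urllib3 and certifi packages are installed):

from suseapi.browser import WebScraper

# Default pycurl transport:
scraper = WebScraper('jdoe', 'secret', 'https://bugzilla.example.com')

# Pure-Python transport with certificate verification through certifi:
verified = WebScraper('jdoe', 'secret', 'https://bugzilla.example.com',
                      transport='urllib3')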

def _get_req_url(self, action):
'''
@@ -131,35 +121,39 @@ def request(self, action, paramlist=None, **kwargs):
'''
url = self._get_req_url(action)
if paramlist is not None:
params = urllib.urlencode(paramlist)
params = urlencode(paramlist)
elif kwargs == {}:
params = None
else:
params = urllib.urlencode(kwargs)
params = urlencode(kwargs)
return webscraper_safely(
self.browser.open,
url, params, timeout=DEFAULT_TIMEOUT
self.browser.go,
url, post=params
)
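A usage sketch for request(), with scraper constructed as above (action and parameters are placeholders): keyword arguments or an explicit pair list are URL-encoded into the POST body, and with neither the call degrades to a plain GET:

# Keyword form:
doc = scraper.request('buglist.cgi', product='SUSE', limit=10)

# Explicit pair list, useful for repeated keys:
doc = scraper.request('buglist.cgi', paramlist=[('id', '1'), ('id', '2')])

# No parameters issues a GET:
doc = scraper.request('index.cgi')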

def submit(self):
'''
Submits currently selected browser form.
'''
return webscraper_safely(
self.browser.submit,
request_class=TimeoutRequest
self.browser.doc.submit,
)

def set_cookies(self, cookies):
'''
Sets cookies needed for access.
'''
for cookie in cookies:
self.cookiejar.set_cookie(cookie)
self.browser.cookies.set(cookie.name, cookie.value)
self.cookie_set = True

def get_cookies(self):
'''
Returns cookies set in browser.
'''
return [cookie for cookie in self.cookiejar]
return [cookie for cookie in self.browser.cookies.cookiejar]
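A round-trip sketch for the two cookie helpers (host and credentials are placeholders): cookies harvested from an authenticated scraper can seed a fresh one:

first = WebScraper('jdoe', 'secret', 'https://bugzilla.example.com')
first.request('index.cgi')

# Reuse the session in a second, anonymous scraper:
second = WebScraper('', '', 'https://bugzilla.example.com')
second.set_cookies(first.get_cookies())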

def viewing_html(self):
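'''
Checks whether the currently loaded document is HTML.
'''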
if not self.browser.doc:
return False
return 'text/html' in self.browser.doc.headers['Content-Type']
