fixes bug 924417 - Improve timeouts on crontabber ftpscraper, r=rhelmer

peterbe committed Oct 8, 2013
1 parent df0827f commit 82edd9ff5356d5c0635a234204429075df397f32
Showing with 126 additions and 63 deletions.
  1. +17 −21 socorro/cron/jobs/ftpscraper.py
  2. +109 −42 socorro/unittest/cron/jobs/test_ftpscraper.py
socorro/cron/jobs/ftpscraper.py

@@ -43,32 +43,31 @@ def urljoin(*parts):
     return url
 
 
-def getLinks(url, startswith=None, endswith=None):
-    html = ''
-    results = []
+def patient_urlopen(url, max_attempts=4, sleep_time=20):
     attempts = 0
     while True:
-        if attempts > 3:
+        if attempts >= max_attempts:
             raise RetriedError(attempts, url)
         try:
             attempts += 1
             page = urllib2.urlopen(url)
         except urllib2.HTTPError, err:
-            # wait half a minute
-            time.sleep(30)
-            if err.code == 404:
-                return results
-            elif err.code < 500:
+            if err.code < 500:
                 raise
+            time.sleep(sleep_time)
         except urllib2.URLError, err:
-            # wait half a minute
-            time.sleep(30)
-            pass
+            time.sleep(sleep_time)
         else:
-            html = lxml.html.document_fromstring(page.read())
+            content = page.read()
             page.close()
             break
+    return content
+
+
+def getLinks(url, startswith=None, endswith=None):
+    html = ''
+    results = []
+    content = patient_urlopen(url, sleep_time=30)
+    html = lxml.html.document_fromstring(content)
     for element, attribute, link, pos in html.iterlinks():
         if startswith:
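For orientation, here is a minimal caller-side sketch (not part of the commit) of how the new helper is meant to be used; patient_urlopen, max_attempts, sleep_time and RetriedError come from the hunk above, while the URL is only a placeholder.

# Caller-side sketch (not from the commit); the URL is a placeholder.
from socorro.cron.jobs import ftpscraper

try:
    # retries on 5xx/URLError, sleeping sleep_time seconds between attempts
    content = ftpscraper.patient_urlopen(
        'http://ftp.example.com/pub/firefox/nightly/',
        max_attempts=4,
        sleep_time=30,
    )
except ftpscraper.RetriedError:
    # every attempt failed; the caller decides what to do next
    content = None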
@@ -81,10 +80,8 @@ def getLinks(url, startswith=None, endswith=None):
 
 
 def parseInfoFile(url, nightly=False):
-    infotxt = urllib2.urlopen(url)
-    content = infotxt.read()
+    content = patient_urlopen(url)
     contents = content.splitlines()
-    infotxt.close()
     results = {}
     bad_lines = []
     if nightly:
@@ -110,9 +107,8 @@ def parseB2GFile(url, nightly=False, logger=None):
     Example: {"buildid": "20130125070201", "update_channel": "nightly", "version": "18.0"}
     TODO handle exception if file does not exist
     """
-    infotxt = urllib2.urlopen(url)
-    results = json.load(infotxt)
-    infotxt.close()
+    content = patient_urlopen(url)
+    results = json.loads(content)
     # bug 869564: Return None if update_channel is 'default'
     if results['update_channel'] == 'default':
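One behavioural consequence of these hunks: the old getLinks swallowed 404s and returned an empty list, while patient_urlopen re-raises any HTTPError below 500, so a missing file now surfaces to the caller. A hypothetical caller-side sketch (not in the commit), using parseInfoFile only as an example entry point and a placeholder URL:

# Hypothetical handling of the re-raised HTTPError (not in the commit).
import urllib2

from socorro.cron.jobs import ftpscraper

try:
    info = ftpscraper.parseInfoFile('http://ftp.example.com/some-build-info.txt')
except urllib2.HTTPError, err:
    if err.code == 404:
        info = None  # the caller now handles "missing" explicitly
    else:
        raise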
socorro/unittest/cron/jobs/test_ftpscraper.py

@@ -59,36 +59,40 @@ def test_urljoin(self):
             'http://google.com/dir1/'
         )
 
-    def test_getLinks(self):
+    @mock.patch('socorro.cron.jobs.ftpscraper.time')
+    def test_patient_urlopen(self, mocked_time):
+        sleeps = []
+
+        def mocked_sleeper(seconds):
+            sleeps.append(seconds)
+
+        mocked_time.sleep = mocked_sleeper
+
+        mock_calls = []
+
         @stringioify
         def mocked_urlopener(url):
-            html_wrap = "<html><body>\n%s\n</body></html>"
-            if 'ONE' in url:
-                return html_wrap % """
-                <a href='One.html'>One.html</a>
-                """
-            raise NotImplementedError(url)
+            mock_calls.append(url)
+            if len(mock_calls) == 1:
+                raise urllib2.HTTPError(url, 500, "Server Error", {}, None)
+            if len(mock_calls) == 2:
+                raise urllib2.HTTPError(url, 504, "Timeout", {}, None)
+            if len(mock_calls) == 3:
+                raise urllib2.URLError("BadStatusLine")
+            return "<html>content</html>"
         self.urllib2.side_effect = mocked_urlopener
 
-        self.assertEqual(
-            ftpscraper.getLinks('ONE'),
-            []
-        )
-        self.assertEqual(
-            ftpscraper.getLinks('ONE', startswith='One'),
-            ['One.html']
-        )
-        self.assertEqual(
-            ftpscraper.getLinks('ONE', endswith='.html'),
-            ['One.html']
-        )
-        self.assertEqual(
-            ftpscraper.getLinks('ONE', startswith='Two'),
-            []
+        content = ftpscraper.patient_urlopen(
+            'http://doesntmatt.er',
+            sleep_time=25
         )
+        self.assertEqual(content, "<html>content</html>")
+        self.assertEqual(sleeps, [25, 25, 25])
 
     @mock.patch('socorro.cron.jobs.ftpscraper.time')
-    def test_getLinks_with_timeout_retries(self, mocked_time):
+    def test_patient_urlopen_impatient_retriederror(self, mocked_time):
        sleeps = []
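These tests decorate their fake urlopen with @stringioify, which is defined elsewhere in the test module and not shown in this diff. A plausible sketch of what it does, assuming it simply wraps string return values in a file-like object so the mock behaves like urllib2.urlopen:

# Assumed helper, not shown in this diff: wraps a function's string return
# value in a StringIO so it can be .read() and .close()d like a urlopen page.
from cStringIO import StringIO
from functools import wraps


def stringioify(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        return StringIO(func(*args, **kwargs))
    return wrapper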
@@ -109,24 +113,30 @@ def mocked_urlopener(url):
             if len(mock_calls) == 3:
                 raise urllib2.URLError("BadStatusLine")
-            html_wrap = "<html><body>\n%s\n</body></html>"
-            if 'ONE' in url:
-                return html_wrap % """
-                <a href='One.html'>One.html</a>
-                """
-            raise NotImplementedError(url)
+            return "<html>content</html>"
         self.urllib2.side_effect = mocked_urlopener
-        self.assertEqual(
-            ftpscraper.getLinks('ONE', startswith='One'),
-            ['One.html']
+        # very impatient version
+        self.assertRaises(
+            ftpscraper.RetriedError,
+            ftpscraper.patient_urlopen,
+            'http://doesntmatt.er',
+            max_attempts=1
         )
-        # it had to go to sleep 3 times
-        self.assertEqual(len(sleeps), 3)
+        self.assertEqual(len(mock_calls), 1)
+        # less impatient
+        mock_calls = []
+        self.assertRaises(
+            ftpscraper.RetriedError,
+            ftpscraper.patient_urlopen,
+            'http://doesntmatt.er',
+            max_attempts=2
+        )
+        self.assertEqual(len(mock_calls), 2)
 
     @mock.patch('socorro.cron.jobs.ftpscraper.time')
-    def test_getLinks_with_timeout_retries_failing(self, mocked_time):
+    def test_patient_urlopen_some_raise_errors(self, mocked_time):
         sleeps = []
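The assertions on len(mock_calls) follow from how patient_urlopen counts attempts: the counter is incremented before each urlopen call and checked at the top of the loop, so max_attempts bounds the number of calls exactly. A tiny standalone rehearsal of that accounting (not part of the commit), with urlopen replaced by a stub that always fails and RetriedError by an early return:

# Standalone rehearsal of the attempt accounting in patient_urlopen.
def count_failed_calls(max_attempts):
    calls = 0
    attempts = 0
    while True:
        if attempts >= max_attempts:
            return calls      # patient_urlopen raises RetriedError here
        attempts += 1
        calls += 1            # stands in for a urllib2.urlopen(url) that fails

assert count_failed_calls(max_attempts=1) == 1   # matches len(mock_calls) == 1
assert count_failed_calls(max_attempts=2) == 2   # matches len(mock_calls) == 2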
@@ -140,17 +150,74 @@ def mocked_sleeper(seconds):
         @stringioify
         def mocked_urlopener(url):
             mock_calls.append(url)
-            raise urllib2.HTTPError(url, 500, "Server Error", {}, None)
+            if len(mock_calls) == 1:
+                raise urllib2.HTTPError(url, 500, "Server Error", {}, None)
+            raise urllib2.HTTPError(url, 404, "Page Not Found", {}, None)
         self.urllib2.side_effect = mocked_urlopener
+        # very impatient version
+        self.assertRaises(
+            urllib2.HTTPError,
+            ftpscraper.patient_urlopen,
+            'http://doesntmatt.er',
+        )
+
+    @mock.patch('socorro.cron.jobs.ftpscraper.time')
+    def test_patient_urlopen_eventual_retriederror(self, mocked_time):
+        sleeps = []
+
+        def mocked_sleeper(seconds):
+            sleeps.append(seconds)
+
+        mocked_time.sleep = mocked_sleeper
+
+        mock_calls = []
+
+        @stringioify
+        def mocked_urlopener(url):
+            mock_calls.append(url)
+            if len(mock_calls) % 2:
+                raise urllib2.HTTPError(url, 500, "Server Error", {}, None)
+            else:
+                raise urllib2.URLError("BadStatusLine")
+        self.urllib2.side_effect = mocked_urlopener
+
+        # very impatient version
         self.assertRaises(
             ftpscraper.RetriedError,
-            ftpscraper.getLinks,
-            'ONE',
-            startswith='One',
+            ftpscraper.patient_urlopen,
+            'http://doesntmatt.er',
+        )
+        self.assertTrue(len(mock_calls) > 1)
+
+    def test_getLinks(self):
+        @stringioify
+        def mocked_urlopener(url):
+            html_wrap = "<html><body>\n%s\n</body></html>"
+            if 'ONE' in url:
+                return html_wrap % """
+                <a href='One.html'>One.html</a>
+                """
+            raise NotImplementedError(url)
+        self.urllib2.side_effect = mocked_urlopener
+
+        self.assertEqual(
+            ftpscraper.getLinks('ONE'),
+            []
+        )
+        self.assertEqual(
+            ftpscraper.getLinks('ONE', startswith='One'),
+            ['One.html']
+        )
+        self.assertEqual(
+            ftpscraper.getLinks('ONE', endswith='.html'),
+            ['One.html']
+        )
+        self.assertEqual(
+            ftpscraper.getLinks('ONE', startswith='Two'),
+            []
         )
-        # it had to go to sleep 3 times and failed on the 4th
-        self.assertEqual(len(sleeps), 4)
 
     def test_parseInfoFile(self):
         @stringioify
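Throughout these tests, self.urllib2 is a mock whose side_effect is swapped per test; the patching itself happens in the test case's setUp, which is outside this diff. A plausible sketch of that setup, with the class name and patch wiring assumed rather than taken from the repository:

# Assumed setUp shape, not shown in this diff; names are illustrative only.
import mock
import unittest


class TestFTPScraper(unittest.TestCase):

    def setUp(self):
        patcher = mock.patch('socorro.cron.jobs.ftpscraper.urllib2.urlopen')
        self.urllib2 = patcher.start()   # tests then set self.urllib2.side_effect
        self.addCleanup(patcher.stop)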
@@ -329,7 +396,7 @@ def mocked_urlopener(url):
             ]
         )
 
 
 #==============================================================================
 @attr(integration='postgres')  # for nosetests
 class TestIntegrationFTPScraper(IntegrationTestCaseBase):
