Skip to content

Commit

Permalink
test progress bar
Browse files: browse the repository at this point in the history
  • Loading branch information
omorilewa committed Jun 30, 2017
1 parent 94b77bb commit 1c4a821
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
22 changes: 19 additions & 3 deletions healthtools/scrapers/base_scraper.py
Expand Up @@ -14,6 +14,13 @@
import os
import getpass

from __future__ import print_function

import functools
import random
import sys
import time
import progressbar

class Scraper(object):
def __init__(self):
Expand Down Expand Up @@ -63,19 +70,26 @@ def scrape_site(self):
all_results = []
delete_batch = []
skipped_pages = 0
print ""
widgets = [progressbar.Percentage(), progressbar.Bar()]

print "[{0}] ".format(re.sub(r"(\w)([A-Z])", r"\1 \2", type(self).__name__))
print "[{0}] - Started Scraper.".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
self.get_total_number_of_pages()
bar = progressbar.ProgressBar(widgets=widgets, max_value=10).start()
# for i in range(20):


divisor = self.num_pages_to_scrape / 10

for page_num in range(1, self.num_pages_to_scrape + 1):

if page_num == divisor:
print "[{}] - Scraped {} out of {} pages.".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), page_num, self.num_pages_to_scrape)
time.sleep(1)
bar.update(i + 1)
# print "[{}] - Scraped {} out of {} pages.".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), page_num, self.num_pages_to_scrape)
divisor = divisor + (self.num_pages_to_scrape / 10)
elif page_num == self.num_pages_to_scrape:
print "[{}] - Scraped {} out of this {} pages.".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), page_num, self.num_pages_to_scrape)
print "[{}] - Scraped {} out of {} pages.".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), page_num, self.num_pages_to_scrape)
else:
test = 0

Expand All @@ -95,6 +109,8 @@ def scrape_site(self):
skipped_pages += 1
self.print_error("ERROR: scrape_site() - source: {} - page: {} - {}".format(url, page_num, err))
continue

bar.finish()
print "[{0}] - {1} documents retrieved.".format(
datetime.now().strftime("%Y-%m-%d %H:%M:%S"), len(all_results)/2) # don't count indexing data

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Expand Up @@ -17,9 +17,11 @@ jmespath==0.9.2
MarkupSafe==1.0
nose==1.3.7
packaging==16.8
progressbar2==3.30.2
pyparsing==2.2.0
python-dateutil==2.6.0
python-memcached==1.58
python-utils==2.1.0
requests==2.13.0
requests-aws4auth==0.9
s3transfer==0.1.10
Expand Down

0 comments on commit 1c4a821

Please sign in to comment.