diff --git a/scraper/src/config/config_loader.py b/scraper/src/config/config_loader.py
index 49becef..9bcfc1a 100644
--- a/scraper/src/config/config_loader.py
+++ b/scraper/src/config/config_loader.py
@@ -6,14 +6,12 @@
 """

 from collections import OrderedDict
-from distutils.util import strtobool
 import json
 import os
 import sys
 import copy
 from .config_validator import ConfigValidator
-from .nb_hits_updater import NbHitsUpdater
 from .urls_parser import UrlsParser
 from .selectors_parser import SelectorsParser
 from .browser_handler import BrowserHandler
@@ -46,7 +44,6 @@ class ConfigLoader:
     strategy = 'default'
     strict_redirect = True
     strip_chars = u".,;:§¶"
-    update_nb_hits = None
     use_anchors = False
     user_agent = 'MeiliSearch docs-scraper'
     only_content_level = False
@@ -111,9 +108,7 @@ def _parse(self):
         # Parse Env
         self.app_id = os.environ.get('MEILISEARCH_HOST_URL', None)
         self.api_key = os.environ.get('MEILISEARCH_API_KEY', None)
-        self.update_nb_hits = os.environ.get('UPDATE_NB_HITS', None)
-        if self.update_nb_hits is not None:
-            self.update_nb_hits = bool(strtobool(self.update_nb_hits))
+
         if self.index_uid_tmp is None:
             self.index_uid_tmp = os.environ.get('index_uid_TMP',
                                                 self.index_uid + '_tmp')
@@ -128,15 +123,5 @@ def _parse(self):
         self.allowed_domains = UrlsParser.build_allowed_domains(
             self.start_urls, self.stop_urls)

-    def update_nb_hits_value(self, nb_hits):
-        if self.config_file is not None:
-            # config loaded from file
-            previous_nb_hits = None if 'nb_hits' not in self.config_content else \
-                self.config_content['nb_hits']
-            nb_hit_updater = NbHitsUpdater(self.config_file,
-                                           self.config_content,
-                                           previous_nb_hits, nb_hits)
-            nb_hit_updater.update(self.update_nb_hits)
-
     def get_extra_facets(self):
         return UrlsParser.get_extra_facets(self.start_urls)
diff --git a/scraper/src/config/nb_hits_updater.py b/scraper/src/config/nb_hits_updater.py
deleted file mode 100644
index 85c22ca..0000000
--- a/scraper/src/config/nb_hits_updater.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import json
-import copy
-
-
-class NbHitsUpdater:
-    new_nb_hit = None
-    previous_nb_hits = None
-    config_file = None
-    config_content = None
-
-    def __init__(self, config_file, config_content, previous_nb_hits,
-                 new_nb_hit):
-        self.config_file = config_file
-        self.config_content = copy.deepcopy(config_content)
-        self.new_nb_hit = new_nb_hit
-        self.previous_nb_hits = previous_nb_hits
-
-    def update(self, perform_update):
-        if self._update_needed():
-            print('previous nb_hits: {}\n'.format(self.previous_nb_hits))
-            if perform_update is None:
-                # if sys.stdout.isatty():
-                #     perform_update = confirm(
-                #         'Do you want to update the nb_hits in {} ?'.format(
-                #             self.config_file))
-                # else:
-                #     perform_update = True
-                perform_update = True
-
-            if perform_update:
-                try:
-                    self._update_config()
-                    print(
-                        '\n[OK] {} has been updated'.format(self.config_file))
-                except Exception:
-                    print(
-                        '\n[KO] Was not able to update {}'.format(
-                            self.config_file))
-
-    def _update_needed(self):
-        return self.previous_nb_hits is None or self.previous_nb_hits != self.new_nb_hit
-
-    def _update_config(self):
-        self.config_content['nb_hits'] = self.new_nb_hit
-        with open(self.config_file, 'w') as f:
-            f.write(json.dumps(self.config_content, indent=2,
-                               separators=(',', ': ')))
diff --git a/scraper/src/index.py b/scraper/src/index.py
index 609e00b..f8ae1b7 100644
--- a/scraper/src/index.py
+++ b/scraper/src/index.py
@@ -103,7 +103,6 @@ def run_config(config):
     if DocumentationSpider.NB_INDEXED > 0:
         # meilisearch_helper.commit_tmp_index()
         print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
-        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
     else:
         print('Crawling issue: nbHits 0 for ' + config.index_uid)
         # meilisearch_helper.report_crawling_issue()