diff --git a/Pipfile b/Pipfile index 9d1c7d4..ee50a07 100644 --- a/Pipfile +++ b/Pipfile @@ -10,6 +10,7 @@ pytest = "==6.2.3" meilisearch = "==0.16.1" requests-iap = "==0.2.0" python-keycloak-client = "==0.2.3" +webdriver-manager = "==3.4.2" [dev-packages] pylint = "==2.8.2" diff --git a/Pipfile.lock b/Pipfile.lock index b517ea9..82287b7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ea77b4eb779cc50037a45290eee3541be4b952f1fb19bd87a8f987cd4a4729b0" + "sha256": "862df4ee159793ef2cb6fd3088a52c3311dc5826993c28e4c3088b17762be8c6" }, "pipfile-spec": 6, "requires": {}, @@ -14,6 +14,14 @@ ] }, "default": { + "atomicwrites": { + "hashes": [ + "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197", + "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a" + ], + "markers": "sys_platform == 'win32'", + "version": "==1.4.0" + }, "attrs": { "hashes": [ "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", @@ -88,11 +96,27 @@ }, "charset-normalizer": { "hashes": [ - "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", - "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3" + "sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6", + "sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f" ], "markers": "python_version >= '3'", - "version": "==2.0.4" + "version": "==2.0.6" + }, + "colorama": { + "hashes": [ + "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", + "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.4.4" + }, + "configparser": { + "hashes": [ + "sha256:85d5de102cfe6d14a5172676f09d19c465ce63d6019cf0a4ef13385fc535e828", + "sha256:af59f2cdd7efbdd5d111c1976ecd0b82db9066653362f0962d7bf1d3ab89a1fa" + ], + "markers": "python_version >= '3.6'", + "version": "==5.0.2" }, "constantly": { "hashes": [ @@ -101,28 +125,38 @@ ], "version": "==15.1.0" }, + "crayons": { + "hashes": [ + "sha256:bd33b7547800f2cfbd26b38431f9e64b487a7de74a947b0fafc89b45a601813f", + "sha256:e73ad105c78935d71fe454dd4b85c5c437ba199294e7ffd3341842bc683654b1" + ], + "version": "==0.4.0" + }, "cryptography": { "hashes": [ - "sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e", - "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b", - "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7", - "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085", - "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc", - "sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a", - "sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498", - "sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9", - "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c", - "sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7", - "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb", - "sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14", - "sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af", - "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e", - "sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5", - "sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06", - "sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7" + "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6", + "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6", + "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c", + "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999", + "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e", + "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992", + "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d", + "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588", + "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa", + "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d", + "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd", + "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d", + "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953", + "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2", + "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8", + "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6", + "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9", + "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6", + "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad", + "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76" ], "markers": "python_version >= '3.6'", - "version": "==3.4.8" + "version": "==35.0.0" }, "cssselect": { "hashes": [ @@ -383,11 +417,11 @@ }, "pyopenssl": { "hashes": [ - "sha256:4c231c759543ba02560fcd2480c48dcec4dae34c9da7d3747c508227e0624b51", - "sha256:818ae18e06922c066f777a33f1fca45786d85edfe71cd043de6379337a7f274b" + "sha256:5e2d8c5e46d0d865ae933bef5230090bdaf5506281e9eec60fa250ee80600cb3", + "sha256:8935bd4920ab9abfebb07c41a4f58296407ed77f04bd1a92914044b848ba1ed6" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==20.0.1" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==21.0.0" }, "pyparsing": { "hashes": [ @@ -502,6 +536,24 @@ "markers": "python_full_version >= '3.6.7'", "version": "==21.7.0" }, + "twisted-iocpsupport": { + "hashes": [ + "sha256:306becd6e22ab6e8e4f36b6bdafd9c92e867c98a5ce517b27fdd27760ee7ae41", + "sha256:3c61742cb0bc6c1ac117a7e5f422c129832f0c295af49e01d8a6066df8cfc04d", + "sha256:72068b206ee809c9c596b57b5287259ea41ddb4774d86725b19f35bf56aa32a9", + "sha256:7d972cfa8439bdcb35a7be78b7ef86d73b34b808c74be56dfa785c8a93b851bf", + "sha256:81b3abe3527b367da0220482820cb12a16c661672b7bcfcde328902890d63323", + "sha256:851b3735ca7e8102e661872390e3bce88f8901bece95c25a0c8bb9ecb8a23d32", + "sha256:985c06a33f5c0dae92c71a036d1ea63872ee86a21dd9b01e1f287486f15524b4", + "sha256:9dbb8823b49f06d4de52721b47de4d3b3026064ef4788ce62b1a21c57c3fff6f", + "sha256:b435857b9efcbfc12f8c326ef0383f26416272260455bbca2cd8d8eca470c546", + "sha256:b76b4eed9b27fd63ddb0877efdd2d15835fdcb6baa745cb85b66e5d016ac2878", + "sha256:b9fed67cf0f951573f06d560ac2f10f2a4bbdc6697770113a2fc396ea2cb2565", + "sha256:bf4133139d77fc706d8f572e6b7d82871d82ec7ef25d685c2351bdacfb701415" + ], + "markers": "platform_system == 'Windows'", + "version": "==1.0.2" + }, "typing-extensions": { "hashes": [ "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e", @@ -512,11 +564,11 @@ }, "urllib3": { "hashes": [ - "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", - "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" + "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", + "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.26.6" + "version": "==1.26.7" }, "w3lib": { "hashes": [ @@ -525,6 +577,14 @@ ], "version": "==1.22.0" }, + "webdriver-manager": { + "hashes": [ + "sha256:50a6e174106542f5335cacc387cec7ada26812babc1aeca61c208a1bab2ac2c5", + "sha256:c6d81590aae6fc0fb10cf7dd20c8c1b9bb043501f9cf62c316a854a0de841e32" + ], + "index": "pypi", + "version": "==3.4.2" + }, "zope.interface": { "hashes": [ "sha256:08f9636e99a9d5410181ba0729e0408d3d8748026ea938f3b970a0249daa8192", @@ -607,19 +667,28 @@ ], "version": "==2021.5.30" }, + "colorama": { + "hashes": [ + "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", + "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.4.4" + }, "distlib": { "hashes": [ - "sha256:106fef6dc37dd8c0e2c0a60d3fca3e77460a48907f335fa28420463a6f799736", - "sha256:23e223426b28491b1ced97dc3bbe183027419dfc7982b4fa2f05d5f3ff10711c" + "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31", + "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05" ], - "version": "==0.3.2" + "version": "==0.3.3" }, "filelock": { "hashes": [ - "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59", - "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836" + "sha256:8c7eab13dc442dc249e95158bcc12dec724465919bdc9831fdbf0660f03d1785", + "sha256:bbc6a0382fe8ec4744ecdf6683a2e07f65eb10ff1aff53fc02a202565446cde0" ], - "version": "==3.0.12" + "markers": "python_version >= '3.6'", + "version": "==3.3.0" }, "isort": { "hashes": [ @@ -682,11 +751,11 @@ }, "platformdirs": { "hashes": [ - "sha256:15b056538719b1c94bdaccb29e5f81879c7f7f0f4a153f46086d155dffcd4f0f", - "sha256:8003ac87717ae2c7ee1ea5a84a1a61e87f3fbd16eb5aadba194ea30a9019f648" + "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2", + "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d" ], "markers": "python_version >= '3.6'", - "version": "==2.3.0" + "version": "==2.4.0" }, "pluggy": { "hashes": [ @@ -754,11 +823,11 @@ }, "virtualenv": { "hashes": [ - "sha256:9ef4e8ee4710826e98ff3075c9a4739e2cb1040de6a2a8d35db0055840dc96a0", - "sha256:e4670891b3a03eb071748c569a87cceaefbf643c5bac46d996c5a45c34aa0f06" + "sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300", + "sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==20.7.2" + "version": "==20.8.1" }, "virtualenv-clone": { "hashes": [ diff --git a/README.md b/README.md index 6aca8fe..3cfc7a4 100644 --- a/README.md +++ b/README.md @@ -466,7 +466,8 @@ If used, `min_indexed_level` is ignored. When `js_render` is set to `true`, the scraper will use ChromeDriver. This is needed for pages that are rendered with JavaScript, for example, pages generated with React, Vue, or applications that are running in development mode: `autoreload` `watch`. -After installing ChromeDriver, provide the path to the bin using the following environment variable `CHROMEDRIVER_PATH` (default value is `/usr/bin/chromedriver`). +After installing ChromeDriver, provide the path to the bin using the following environment variable `CHROMEDRIVER_PATH`. If the variable is not set, the scraper +will automatically download and use a compatible version of ChromeDriver. The default value of `js_render` is `false`. diff --git a/scraper/src/config/browser_handler.py b/scraper/src/config/browser_handler.py index fb0dad2..fbdf9df 100644 --- a/scraper/src/config/browser_handler.py +++ b/scraper/src/config/browser_handler.py @@ -1,8 +1,11 @@ import re import os +import sys +from distutils.util import strtobool from selenium import webdriver from selenium.webdriver.chrome.options import Options +from webdriver_manager.chrome import ChromeDriverManager from ..custom_downloader_middleware import CustomDownloaderMiddleware from ..js_executor import JsExecutor @@ -26,12 +29,42 @@ def init(config_original_content, js_render, user_agent): chrome_options.add_argument('--headless') chrome_options.add_argument('user-agent={0}'.format(user_agent)) - CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', - "/usr/bin/chromedriver") - if not os.path.isfile(CHROMEDRIVER_PATH): - raise Exception( - "Env CHROMEDRIVER_PATH='{}' is not a path to a file".format( - CHROMEDRIVER_PATH)) + CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', '') + if not CHROMEDRIVER_PATH or not os.path.isfile(CHROMEDRIVER_PATH): + print("Could not find ChromeDriver.") + print("Either the Env CHROMEDRIVER_PATH='{}' path is incorrect or " + "ChromeDriver is not installed.".format(CHROMEDRIVER_PATH)) + print("Do you want to automatically download ChromeDriver?") + while(True): + user_input = input("[Y/n]: ") + try: + yes = strtobool(user_input) + break + except ValueError: + print("Please enter a valid input.") + continue + if yes: + try: + CHROMEDRIVER_PATH = ChromeDriverManager().install() + + except Exception as e: + print("Could not download ChromeDriver. " + "Please install ChromeDriver manually.") + print(e) + if sys.platform == "linux" or sys.platform == "darwin": + os.system('read -s -n 1 -p "Press any key to continue..."') + if sys.platform == "win32": + os.system('pause') + sys.exit(1) + else: + print("Please install ChromeDriver and set the CHROMEDRIVER_PATH " + "environment variable or remove the render_js option.") + if sys.platform == "linux" or sys.platform == "darwin": + os.system('read -s -n 1 -p "Press any key to continue..."') + if sys.platform == "win32": + os.system('pause') + sys.exit(1) + driver = webdriver.Chrome( CHROMEDRIVER_PATH, options=chrome_options) diff --git a/tests/config_loader/get_extra_facets_test.py b/tests/config_loader/get_extra_facets_test.py index d2c2f0f..ae839c8 100644 --- a/tests/config_loader/get_extra_facets_test.py +++ b/tests/config_loader/get_extra_facets_test.py @@ -5,6 +5,7 @@ from .abstract import config from .mocked_init import MockedInit + class TestGetExtraFacets: def test_extra_facets_should_be_empty_by_default(self): c = config() @@ -19,6 +20,7 @@ def test_extra_facets_should_be_set_from_start_urls_variables_browser(self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") c = config({ "start_urls": [ @@ -43,6 +45,7 @@ def test_extra_facets_should_be_set_from_start_urls_variables_with_two_start_url self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") c = config({ "js-render": True, @@ -74,6 +77,7 @@ def test_extra_facets_should_be_set_from_start_urls_variables_with_multiple_tags self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") c = config({ "start_urls": [ diff --git a/tests/config_loader/open_selenium_browser_test.py b/tests/config_loader/open_selenium_browser_test.py index 8136485..b616bbc 100644 --- a/tests/config_loader/open_selenium_browser_test.py +++ b/tests/config_loader/open_selenium_browser_test.py @@ -21,6 +21,7 @@ def test_browser_not_needed_by_default(self): def test_browser_needed_when_js_render_true(self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") # When c = config({ "js_render": True @@ -37,6 +38,7 @@ def test_browser_needed_when_config_contains_automatic_tag(self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") # When c = config({ diff --git a/tests/config_loader/start_urls_test.py b/tests/config_loader/start_urls_test.py index 2fdfec5..5408c6e 100644 --- a/tests/config_loader/start_urls_test.py +++ b/tests/config_loader/start_urls_test.py @@ -75,6 +75,7 @@ def test_start_urls_should_be_generated_when_there_is_automatic_tagging_browser( self, monkeypatch): monkeypatch.setattr("selenium.webdriver.chrome", lambda x: MockedInit()) + monkeypatch.setattr('builtins.input', lambda _: "y") # When c = config({