Commit: change selenium to playwright

new-village committed May 5, 2023
1 parent 0a236ce commit e2a5274
Showing 6 changed files with 45 additions and 78 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/test.yaml
@@ -18,16 +18,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Chrome
-        run: |
-          sudo wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
-          sudo apt update
-          sudo apt-get install google-chrome-stable
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install selenium webdriver_manager chromedriver-binary beautifulsoup4
+          pip install playwright
+      - name: Set up Playwright
+        run: |
+          playwright install
       - name: Test with unittest
         run: |
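Note on the new workflow steps: `pip install playwright` only installs the Python package, while `playwright install` downloads the browser binaries it drives (on a bare runner, `playwright install --with-deps chromium` would also pull the required system libraries). A minimal sanity check for the CI step could look like the sketch below; it is an illustration, not part of the commit, and assumes the default Chromium build is sufficient.

```python
# Sanity-check sketch: confirm the Chromium fetched by `playwright install`
# can be launched on the runner (illustrative only, not part of this commit).
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    print("Chromium launched, version:", browser.version)
    browser.close()
```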
3 changes: 2 additions & 1 deletion .vscode/extensions.json
@@ -2,6 +2,7 @@
"recommendations": [
"ms-python.python",
"ms-python.vscode-pylance",
"ms-python.pylint"
"ms-python.pylint",
"visualstudioexptteam.vscodeintellicode"
]
}
7 changes: 2 additions & 5 deletions README.md
@@ -13,12 +13,9 @@ nsloader is tested by Python `3.10`.

 ### Dependencies
 ----------------------
-- [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#)
-- [selenium](https://www.selenium.dev/)
-- [webdriver_manager](https://github.com/SergeyPirogov/webdriver_manager)
-- [chromedriver_binary](https://github.com/danielkaiser/python-chromedriver-binary)
+- [playwright](https://playwright.dev/python/)
 
-Additionaly, you have to install `google-chrome-stable` in your execution environment.
+Additionally, you have to run `playwright install` in your execution environment.
 
 
 ### Usage
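The Usage section of the README is not shown in this diff, so here is a hedged sketch of how the Playwright-backed class is driven, assuming the module is importable as `nsloader.wsj` and that valid WSJ credentials are supplied (the URL below is a placeholder).

```python
# Usage sketch under the assumptions above; load() returns self, so it chains
# into to_dict() as defined in nsloader/wsj.py below.
from nsloader import wsj

article = wsj.Article(username="user@example.com", password="secret")
data = article.load("https://www.wsj.com/articles/example-article").to_dict()
print(data)
```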
95 changes: 34 additions & 61 deletions nsloader/wsj.py
@@ -1,25 +1,22 @@
""" load.py
""" wsj.py
"""
import logging
import os

import chromedriver_binary
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from playwright.sync_api import sync_playwright

class Article():
""" Article class
"""
def __init__(self, username=None, password=None):
logging.info('Initialize the Article class')
self.driver = self._login(username, password)
self.soup = None
# Initialize playwright
self.playwright = sync_playwright().start()
self.browser = self.playwright.chromium.launch()
self.page = self.browser.new_page()
# Get Environment Parameters
_usr = os.environ['WSJ_USERNAME'] if os.environ.get('WSJ_USERNAME') else username
_pwd = os.environ['WSJ_PASSWORD'] if os.environ.get('WSJ_PASSWORD') else password
# Login the Wall Street Journal
self._login(_usr, _pwd)
# Initialize return value
self.url = None
self.title = None
self.sub_title = None
@@ -30,73 +27,49 @@ def __init__(self, username=None, password=None):
         self.body = None
 
     def __del__(self):
-        self.driver.close()
-        self.driver.quit()
+        self.browser.close()
+        self.playwright.stop()
 
-    def _login(self, username=None, password=None):
+    def _login(self, username, password):
         """ Get authenticated session info of the Wall Street Journal.
         :param username: registrated user name or email address
         :param password: registrated password
-        :return: :class: `driver` object
         """
-        # Set Parameters
-        usr = os.environ['WSJ_USERNAME'] if os.environ['WSJ_USERNAME'] else username
-        pwd = os.environ['WSJ_PASSWORD'] if os.environ['WSJ_PASSWORD'] else password
-        url = "https://www.wsj.com/"
-        # Initialize browser
-        options = Options()
-        options.add_argument('--headless')
-        options.add_argument('--no-sandbox')
-        options.add_argument("--disable-dev-shm-usage")
-        # Create Firefox's webdriver object
-        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
-        # Access to initial page
-        driver.get(url)
-        wait = WebDriverWait(driver=driver, timeout=10)
-        try:
-            # Go to Sign-in page
-            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "SIGN IN"))).click()
-            # Login Site
-            page1 = [usr, '//*[@id="username"]', '//*[@id="basic-login"]/div[1]/form/div[2]/div[6]/div[1]/button[2]']
-            page2 = [pwd, '//*[@id="password-login-password"]', '//*[@id="password-login"]/div/form/div/div[5]/div[1]/button']
-            for i in [page1, page2]:
-                wait.until(EC.element_to_be_clickable((By.XPATH, i[1]))).send_keys(i[0])
-                wait.until(EC.element_to_be_clickable((By.XPATH, i[2]))).click()
-            wait.until(EC.title_contains("The Wall Street Journal"))
-            # driver.save_screenshot('screenshot.png')
-        except TimeoutException:
-            logging.warning("Timeout: Username or Password input failed. Check your credentials.")
-
-        return driver
+        # Access to sign in page
+        url = 'https://accounts.wsj.com/login'
+        self.page.goto(url, timeout=0)
+        # Sign In Page 1
+        self.page.locator('input#username').fill(username)
+        self.page.locator('span[data-token="continuewithpassword"]').click()
+        # Sign In Page 2
+        self.page.locator('input#password-login-password').fill(password)
+        self.page.get_by_role("button", name="Sign In").click()
+        self.page.wait_for_url('https://www.wsj.com/')
 
     def load(self, url):
-        # Get HTML and convert soup object
-        logging.info(f'Start to collect %s' % url)
-        self.driver.get(url)
-        self.soup = BeautifulSoup(self.driver.page_source.encode('utf-8'), 'html.parser')
+        """ load target web site
+        """
+        self.page.goto(url, timeout=0)
 
         # Extract each properties
         self.url = url
         self.title = self._extract('h1[class*="StyledHeadline"]')
         self.sub_title = self._extract('h2[class*="Dek-Dek"]')
         self.date_published = self._extract('time[class*="Timestamp-Timestamp"]',"datetime")
         self.authors = self._extract('span[class*="AuthorContainer"]')
         self.profile = self._extract('p[data-type="paragraph"] > em[data-type="emphasis"]')
         # Extract body
-        body = [i.text for i in self.soup.select('p[data-type="paragraph"]')]
+        body = [i.text_content() for i in self.page.query_selector_all('p[data-type="paragraph"]')]
         # if there is a profile, delete profile from the document
         if len(self.profile) > 0:
             body.remove(self.profile)
         self.body = '\n'.join(body)
 
         return self
 
     def _extract(self, selector, extract_attribute=None) -> list:
-        target = self.soup.select(selector)
+        target = self.page.query_selector_all(selector)
         if extract_attribute is None:
-            contents = ", ".join([i.text for i in target] if len(target) > 0 else list())
+            contents = ", ".join([i.text_content() for i in target] if len(target) > 0 else list())
         else:
-            contents = ", ".join([i[extract_attribute] for i in target] if len(target) > 0 else list())
+            contents = ", ".join([i.get_attribute(extract_attribute) for i in target] if len(target) > 0 else list())
         return contents
 
     def to_dict(self) -> dict:
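Taken together, the wsj.py changes are a set of one-to-one API swaps: `driver.get()` becomes `page.goto()`, `soup.select()` becomes `page.query_selector_all()`, `.text` becomes `.text_content()`, dictionary-style attribute access becomes `.get_attribute()`, and `driver.close()`/`driver.quit()` are replaced by closing the browser and stopping Playwright. The sketch below condenses that pattern outside the class (hypothetical URL and selectors) to make the mapping explicit.

```python
# Condensed sketch of the sync Playwright pattern adopted in wsj.py
# (hypothetical URL/selectors; the real class keeps these objects on self).
from playwright.sync_api import sync_playwright

playwright = sync_playwright().start()
browser = playwright.chromium.launch()   # headless Chromium by default
page = browser.new_page()

page.goto("https://example.com/article", timeout=0)                # was driver.get(url)
paragraphs = page.query_selector_all('p[data-type="paragraph"]')   # was soup.select(...)
body = "\n".join(el.text_content() for el in paragraphs)           # was el.text
stamps = [el.get_attribute("datetime")
          for el in page.query_selector_all("time")]               # was el["datetime"]

browser.close()                          # was driver.close(); driver.quit()
playwright.stop()
```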
4 changes: 2 additions & 2 deletions setup.py
@@ -7,13 +7,13 @@

 setup(
     name='nsloader',
-    version='1.0.0',
+    version='1.1.0',
     author='new-village',
     url='https://github.com/new-village/nsloader',
     description='This script collects articles from Wall Street Journal and returns it in dict format.',
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=['beautifulsoup4', 'selenium', 'webdriver_manager', 'chromedriver_binary'],
+    install_requires=['playwright'],
     packages=find_packages(),
     package_data={'': ['config/*.json']},
 )
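One caveat on `install_requires=['playwright']`: pip installs only the Playwright Python package, so each environment still needs a one-time `playwright install` (or `playwright install chromium`) to fetch the browser itself. A hedged sketch of doing that step programmatically after installing nsloader:

```python
# Sketch: fetch the Chromium build after `pip install`, equivalent to running
# `playwright install chromium` on the command line (illustrative only).
import subprocess
import sys

subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
```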
2 changes: 0 additions & 2 deletions test/test_wsj.py
@@ -40,7 +40,6 @@ def test_editorial(self):
             result[key] = len(result[key])
         self.assertDictEqual(result, expect)
 
-
     def test_commentary(self):
         """ testing commentary case
         """
@@ -66,6 +65,5 @@ def test_commentary(self):
             result[key] = len(result[key])
         self.assertDictEqual(result, expect)
 
-
 if __name__ == "__main__":
     unittest.main()
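Because `Article.__init__` in wsj.py falls back to the `WSJ_USERNAME` and `WSJ_PASSWORD` environment variables, running this suite locally presumably needs those set (or credentials passed explicitly by the tests). A sketch of a local run under that assumption, with placeholder credentials:

```python
# Local test-run sketch; the credentials are placeholders and the discovery
# path assumes the test/ directory shown in this commit.
import os
import unittest

os.environ.setdefault("WSJ_USERNAME", "user@example.com")
os.environ.setdefault("WSJ_PASSWORD", "secret")

suite = unittest.defaultTestLoader.discover("test")
unittest.TextTestRunner(verbosity=2).run(suite)
```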
