Commit: change selenium to playwright

new-village committed May 5, 2023
1 parent 0a236ce commit e2a5274
Showing 6 changed files with 45 additions and 78 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/test.yaml
@@ -18,16 +18,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Chrome
-        run: |
-          sudo wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
-          sudo apt update
-          sudo apt-get install google-chrome-stable
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install selenium webdriver_manager chromedriver-binary beautifulsoup4
+          pip install playwright
+      - name: Set up Playwright
+        run: |
+          playwright install
       - name: Test with unittest
         run: |
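Note on the new workflow steps: `pip install playwright` only installs the Python package, while `playwright install` downloads the browser binaries it drives (on a bare runner, `playwright install --with-deps chromium` would also pull the required system libraries). A minimal sanity check for the CI step could look like the sketch below; it is an illustration, not part of the commit, and assumes the default Chromium build is sufficient.

```python
# Sanity-check sketch: confirm the Chromium fetched by `playwright install`
# can be launched on the runner (illustrative only, not part of this commit).
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    print("Chromium launched, version:", browser.version)
    browser.close()
```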
3 changes: 2 additions & 1 deletion .vscode/extensions.json
@@ -2,6 +2,7 @@
"recommendations": [
"ms-python.python",
"ms-python.vscode-pylance",
"ms-python.pylint"
"ms-python.pylint",
"visualstudioexptteam.vscodeintellicode"
]
}
7 changes: 2 additions & 5 deletions README.md
@@ -13,12 +13,9 @@ nsloader is tested by Python `3.10`.

 ### Dependencies
 ----------------------
-- [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#)
-- [selenium](https://www.selenium.dev/)
-- [webdriver_manager](https://github.com/SergeyPirogov/webdriver_manager)
-- [chromedriver_binary](https://github.com/danielkaiser/python-chromedriver-binary)
+- [playwright](https://playwright.dev/python/)
 
-Additionaly, you have to install `google-chrome-stable` in your execution environment.
+Additionally, you have to run `playwright install` in your execution environment.
 
 
 ### Usage
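The Usage section of the README is not shown in this diff, so here is a hedged sketch of how the Playwright-backed class is driven, assuming the module is importable as `nsloader.wsj` and that valid WSJ credentials are supplied (the URL below is a placeholder).

```python
# Usage sketch under the assumptions above; load() returns self, so it chains
# into to_dict() as defined in nsloader/wsj.py below.
from nsloader import wsj

article = wsj.Article(username="user@example.com", password="secret")
data = article.load("https://www.wsj.com/articles/example-article").to_dict()
print(data)
```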
95 changes: 34 additions & 61 deletions nsloader/wsj.py
@@ -1,25 +1,22 @@
""" load.py
""" wsj.py
"""
import logging
import os

import chromedriver_binary
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from playwright.sync_api import sync_playwright

class Article():
""" Article class
"""
def __init__(self, username=None, password=None):
logging.info('Initialize the Article class')
self.driver = self._login(username, password)
self.soup = None
# Initialize playwright
self.playwright = sync_playwright().start()
self.browser = self.playwright.chromium.launch()
self.page = self.browser.new_page()
# Get Environment Parameters
_usr = os.environ['WSJ_USERNAME'] if os.environ.get('WSJ_USERNAME') else username
_pwd = os.environ['WSJ_PASSWORD'] if os.environ.get('WSJ_PASSWORD') else password
# Login the Wall Street Journal
self._login(_usr, _pwd)
# Initialize return value
self.url = None
self.title = None
self.sub_title = None
@@ -30,73 +27,49 @@ def __init__(self, username=None, password=None):
         self.body = None
 
     def __del__(self):
-        self.driver.close()
-        self.driver.quit()
+        self.browser.close()
+        self.playwright.stop()
 
-    def _login(self, username=None, password=None):
+    def _login(self, username, password):
         """ Get authenticated session info of the Wall Street Journal.
         :param username: registrated user name or email address
         :param password: registrated password
-        :return: :class: `driver` object
         """
-        # Set Parameters
-        usr = os.environ['WSJ_USERNAME'] if os.environ['WSJ_USERNAME'] else username
-        pwd = os.environ['WSJ_PASSWORD'] if os.environ['WSJ_PASSWORD'] else password
-        url = "https://www.wsj.com/"
-        # Initialize browser
-        options = Options()
-        options.add_argument('--headless')
-        options.add_argument('--no-sandbox')
-        options.add_argument("--disable-dev-shm-usage")
-        # Create Firefox's webdriver object
-        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
-        # Access to initial page
-        driver.get(url)
-        wait = WebDriverWait(driver=driver, timeout=10)
-        try:
-            # Go to Sign-in page
-            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "SIGN IN"))).click()
-            # Login Site
-            page1 = [usr, '//*[@id="username"]', '//*[@id="basic-login"]/div[1]/form/div[2]/div[6]/div[1]/button[2]']
-            page2 = [pwd, '//*[@id="password-login-password"]', '//*[@id="password-login"]/div/form/div/div[5]/div[1]/button']
-            for i in [page1, page2]:
-                wait.until(EC.element_to_be_clickable((By.XPATH, i[1]))).send_keys(i[0])
-                wait.until(EC.element_to_be_clickable((By.XPATH, i[2]))).click()
-            wait.until(EC.title_contains("The Wall Street Journal"))
-            # driver.save_screenshot('screenshot.png')
-        except TimeoutException:
-            logging.warning("Timeout: Username or Password input failed. Check your credentials.")
-
-        return driver
+        # Access to sign in page
+        url = 'https://accounts.wsj.com/login'
+        self.page.goto(url, timeout=0)
+        # Sign In Page 1
+        self.page.locator('input#username').fill(username)
+        self.page.locator('span[data-token="continuewithpassword"]').click()
+        # Sign In Page 2
+        self.page.locator('input#password-login-password').fill(password)
+        self.page.get_by_role("button", name="Sign In").click()
+        self.page.wait_for_url('https://www.wsj.com/')
 
     def load(self, url):
-        # Get HTML and convert soup object
-        logging.info(f'Start to collect %s' % url)
-        self.driver.get(url)
-        self.soup = BeautifulSoup(self.driver.page_source.encode('utf-8'), 'html.parser')
+        """ load target web site
+        """
+        self.page.goto(url, timeout=0)
 
         # Extract each properties
         self.url = url
         self.title = self._extract('h1[class*="StyledHeadline"]')
         self.sub_title = self._extract('h2[class*="Dek-Dek"]')
         self.date_published = self._extract('time[class*="Timestamp-Timestamp"]',"datetime")
         self.authors = self._extract('span[class*="AuthorContainer"]')
         self.profile = self._extract('p[data-type="paragraph"] > em[data-type="emphasis"]')
         # Extract body
-        body = [i.text for i in self.soup.select('p[data-type="paragraph"]')]
+        body = [i.text_content() for i in self.page.query_selector_all('p[data-type="paragraph"]')]
         # if there is a profile, delete profile from the document
         if len(self.profile) > 0:
             body.remove(self.profile)
         self.body = '\n'.join(body)
 
         return self
 
     def _extract(self, selector, extract_attribute=None) -> list:
-        target = self.soup.select(selector)
+        target = self.page.query_selector_all(selector)
         if extract_attribute is None:
-            contents = ", ".join([i.text for i in target] if len(target) > 0 else list())
+            contents = ", ".join([i.text_content() for i in target] if len(target) > 0 else list())
         else:
-            contents = ", ".join([i[extract_attribute] for i in target] if len(target) > 0 else list())
+            contents = ", ".join([i.get_attribute(extract_attribute) for i in target] if len(target) > 0 else list())
         return contents
 
     def to_dict(self) -> dict:
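Taken together, the wsj.py changes are a set of one-to-one API swaps: `driver.get()` becomes `page.goto()`, `soup.select()` becomes `page.query_selector_all()`, `.text` becomes `.text_content()`, dictionary-style attribute access becomes `.get_attribute()`, and `driver.close()`/`driver.quit()` are replaced by closing the browser and stopping Playwright. The sketch below condenses that pattern outside the class (hypothetical URL and selectors) to make the mapping explicit.

```python
# Condensed sketch of the sync Playwright pattern adopted in wsj.py
# (hypothetical URL/selectors; the real class keeps these objects on self).
from playwright.sync_api import sync_playwright

playwright = sync_playwright().start()
browser = playwright.chromium.launch()   # headless Chromium by default
page = browser.new_page()

page.goto("https://example.com/article", timeout=0)                # was driver.get(url)
paragraphs = page.query_selector_all('p[data-type="paragraph"]')   # was soup.select(...)
body = "\n".join(el.text_content() for el in paragraphs)           # was el.text
stamps = [el.get_attribute("datetime")
          for el in page.query_selector_all("time")]               # was el["datetime"]

browser.close()                          # was driver.close(); driver.quit()
playwright.stop()
```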
4 changes: 2 additions & 2 deletions setup.py
@@ -7,13 +7,13 @@

 setup(
     name='nsloader',
-    version='1.0.0',
+    version='1.1.0',
     author='new-village',
     url='https://github.com/new-village/nsloader',
     description='This script collects articles from Wall Street Journal and returns it in dict format.',
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=['beautifulsoup4', 'selenium', 'webdriver_manager', 'chromedriver_binary'],
+    install_requires=['playwright'],
     packages=find_packages(),
     package_data={'': ['config/*.json']},
 )
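One caveat on `install_requires=['playwright']`: pip installs only the Playwright Python package, so each environment still needs a one-time `playwright install` (or `playwright install chromium`) to fetch the browser itself. A hedged sketch of doing that step programmatically after installing nsloader:

```python
# Sketch: fetch the Chromium build after `pip install`, equivalent to running
# `playwright install chromium` on the command line (illustrative only).
import subprocess
import sys

subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
```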
2 changes: 0 additions & 2 deletions test/test_wsj.py
@@ -40,7 +40,6 @@ def test_editorial(self):
             result[key] = len(result[key])
         self.assertDictEqual(result, expect)
 
-
     def test_commentary(self):
         """ testing commentary case
         """
@@ -66,6 +65,5 @@ def test_commentary(self):
             result[key] = len(result[key])
         self.assertDictEqual(result, expect)
 
-
 if __name__ == "__main__":
     unittest.main()
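Because `Article.__init__` in wsj.py falls back to the `WSJ_USERNAME` and `WSJ_PASSWORD` environment variables, running this suite locally presumably needs those set (or credentials passed explicitly by the tests). A sketch of a local run under that assumption, with placeholder credentials:

```python
# Local test-run sketch; the credentials are placeholders and the discovery
# path assumes the test/ directory shown in this commit.
import os
import unittest

os.environ.setdefault("WSJ_USERNAME", "user@example.com")
os.environ.setdefault("WSJ_PASSWORD", "secret")

suite = unittest.defaultTestLoader.discover("test")
unittest.TextTestRunner(verbosity=2).run(suite)
```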
