### Testing your website with scrapers


In [None]:
import unittest


class TestAddition(unittest.TestCase):
    """Minimal unittest example: one arithmetic check wrapped in fixtures."""

    def setUp(self):
        """Print a marker; unittest invokes this before each test method."""
        print('Setting up the test')

    def tearDown(self):
        """Print a marker; unittest invokes this after each test method."""
        print('Tearing down the test')

    def test_twoPlusTwo(self):
        """Check that 2 + 2 evaluates to 4."""
        expected, actual = 4, 2 + 2
        self.assertEqual(expected, actual)

# When this code is run as a plain Python script, the usual entry point is:
# if __name__ == '__main__':
#     unittest.main()

# Inside Jupyter, pass a dummy argv so unittest does not try to interpret the
# kernel's own command-line arguments, and exit=False so unittest does not
# call sys.exit() once the run has finished.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    # IPython magic: resets memory and destroys all variables
    %reset 

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest


class TestWikipedia(unittest.TestCase):
    """Checks basic properties of the Wikipedia "Monty Python" article.

    The page is downloaded and parsed once for the whole class (in
    ``setUpClass``) and the parsed document is shared by every test method.
    """

    # Parsed page shared by all tests; populated by setUpClass.
    bs = None

    @classmethod
    def setUpClass(cls):
        """Fetch and parse the article once, before any test in the class runs."""
        url = 'http://en.wikipedia.org/wiki/Monty_Python'
        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be closed as soon as the soup has been built.
        with urlopen(url) as response:
            cls.bs = BeautifulSoup(response, 'html.parser')

    def test_titleText(self):
        """The first <h1> on the page holds the article title."""
        pageTitle = self.bs.find('h1').get_text()
        self.assertEqual('Monty Python', pageTitle)

    def test_contentExists(self):
        """The page contains a div with id ``mw-content-text``."""
        content = self.bs.find('div', {'id': 'mw-content-text'})
        self.assertIsNotNone(content)



# argv=[''] keeps unittest from interpreting the kernel's own command-line
# arguments; exit=False keeps it from calling sys.exit() after the run.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    # IPython magic: clears the variables defined in the interactive namespace.
    %reset 

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest
import re
import random
from urllib.parse import unquote


class TestWikipedia(unittest.TestCase):
    """Random-walk smoke test over Wikipedia articles.

    Starting from the "Monty Python" article, the test repeatedly follows a
    randomly chosen article link and checks every visited page for a title
    that matches its URL and for the presence of the main content div.
    """

    def test_PageProperties(self):
        """Visit 10 pages, asserting the page properties on each one."""
        self.url = 'http://en.wikipedia.org/wiki/Monty_Python'
        # Test the first 10 pages found
        for _ in range(10):
            # Parsing consumes the whole response, so the connection can be
            # released before the assertions run.
            with urlopen(self.url) as response:
                self.bs = BeautifulSoup(response, 'html.parser')
            titles = self.titleMatchesURL()
            self.assertEqual(titles[0], titles[1])
            self.assertTrue(self.contentExists())
            self.url = self.getNextLink()
        print('Done!')

    def titleMatchesURL(self) -> list[str]:
        """Return ``[page title, title derived from the URL]``, both lowercased.

        The URL title is everything after ``/wiki/`` with any ``#fragment``
        dropped, underscores replaced by spaces and percent-escapes decoded.
        The caller compares the two entries for equality.
        """
        pageTitle = self.bs.find('h1').get_text()
        urlTitle = self.url[(self.url.index('/wiki/') + len('/wiki/')):]
        # A link may point at a section ("/wiki/Article#Section"); the
        # fragment is not part of the article title.
        urlTitle = urlTitle.partition('#')[0]
        urlTitle = urlTitle.replace('_', ' ')
        urlTitle = unquote(urlTitle)
        return [pageTitle.lower(), urlTitle.lower()]

    def contentExists(self) -> bool:
        """Return True when the page has a div with id ``mw-content-text``."""
        return self.bs.find('div', {'id': 'mw-content-text'}) is not None

    def getNextLink(self) -> str:
        """Return the absolute URL of a randomly chosen article link.

        Returns a random link from the page, using the technique shown in
        Chapter 3: only anchors inside the ``bodyContent`` div whose href
        starts with ``/wiki/`` and contains no colon are candidates.

        Fails the test with a descriptive message when the page has no
        ``bodyContent`` div or no candidate links.
        """
        body = self.bs.find('div', {'id': 'bodyContent'})
        self.assertIsNotNone(
            body, 'No bodyContent div found on {}'.format(self.url))
        links = body.find_all('a', href=re.compile(r'^(/wiki/)((?!:).)*$'))
        self.assertTrue(
            links, 'No article links found on {}'.format(self.url))
        randomLink = random.SystemRandom().choice(links)
        return 'https://wikipedia.org{}'.format(randomLink.attrs['href'])

# argv=[''] keeps unittest from interpreting the kernel's own command-line
# arguments; exit=False keeps it from calling sys.exit() after the run.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    # IPython magic: clears the variables defined in the interactive namespace.
    %reset 

### Interacting with the website


In [None]:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains

# Demonstrates two ways of filling in and submitting the same web form:
# calling methods on the located elements directly, and queuing the same
# steps in an ActionChains object.

formUrl = 'http://pythonscraping.com/pages/files/form.html'

# NOTE(review): recent Selenium 4 releases reportedly no longer accept the
# executable_path keyword and expect service=Service('<path>') (from
# selenium.webdriver.chrome.service) instead -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path='<Path to chrome>')
try:
    ### METHOD 1 ###
    driver.get(formUrl)
    firstnameField = driver.find_element(By.NAME, 'firstname')
    lastnameField = driver.find_element(By.NAME, 'lastname')
    submitButton = driver.find_element(By.ID, 'submit')

    firstnameField.send_keys('Ryan')
    lastnameField.send_keys('Mitchell')
    submitButton.click()

    ### METHOD 2 ###
    # Submitting the form loads a new page, which invalidates the element
    # references located above; reload the form and look the fields up again.
    driver.get(formUrl)
    firstnameField = driver.find_element(By.NAME, 'firstname')
    lastnameField = driver.find_element(By.NAME, 'lastname')

    actions = ActionChains(driver).click(firstnameField).send_keys('John').click(
        lastnameField).send_keys('Wayne').send_keys(Keys.RETURN)
    actions.perform()

    print(driver.find_element(By.TAG_NAME, 'body').text)
finally:
    # quit() ends the whole browser session even when a step above raised.
    driver.quit()

### Drag-and-drop


In [None]:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains

# Drags the "draggable" element onto the "div2" target and prints the text
# of the "message" element before and after the drag.

# NOTE(review): recent Selenium 4 releases reportedly no longer accept the
# executable_path keyword and expect service=Service('<path>') (from
# selenium.webdriver.chrome.service) instead -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path='<Path to chrome>')
try:
    # The draggable demo page is the one that contains the 'message',
    # 'draggable' and 'div2' elements used below.
    driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')

    print(driver.find_element(By.ID, 'message').text)
    element = driver.find_element(By.ID, 'draggable')
    target = driver.find_element(By.ID, 'div2')
    actions = ActionChains(driver)
    actions.drag_and_drop(element, target).perform()
    print(driver.find_element(By.ID, 'message').text)
finally:
    # End the browser session even when a lookup or the drag raised.
    driver.quit()

### Capturing Screenshots


In [None]:
import os

from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver import ActionChains

# Loads the form page and saves a screenshot of it to tmp/pythonscraping.png.

screenshotPath = 'tmp/pythonscraping.png'

# NOTE(review): recent Selenium 4 releases reportedly no longer accept the
# executable_path keyword and expect service=Service('<path>') (from
# selenium.webdriver.chrome.service) instead -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path='<Path to chrome>')
try:
    driver.get('http://pythonscraping.com/pages/files/form.html')

    # Make sure the target directory exists before asking for the file.
    os.makedirs(os.path.dirname(screenshotPath), exist_ok=True)
    # get_screenshot_as_file reports an I/O failure through its boolean
    # return value, so check it rather than failing silently.
    if not driver.get_screenshot_as_file(screenshotPath):
        print('Could not save screenshot to {}'.format(screenshotPath))
finally:
    # End the browser session even when loading the page raised.
    driver.quit()

### unittest or Selenium?


In [None]:
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import unittest

chrome_options = wd.ChromeOptions()
# Run Chrome without opening a visible browser window.
chrome_options.add_argument('--headless')


class TestDragAndDrop(unittest.TestCase):
    """Browser test: dragging the box onto the target updates the message."""

    # Replaced by a fresh WebDriver instance in setUp for every test.
    driver = None

    def setUp(self):
        """Start a headless Chrome session and load the draggable demo page."""
        self.driver = wd.Chrome(options=chrome_options)
        # Registered right after the browser starts so the session is shut
        # down after the test even if loading the page or the test fails.
        self.addCleanup(self.driver.quit)
        url = 'http://pythonscraping.com/pages/javascript/draggableDemo.html'
        self.driver.get(url)

    def tearDown(self):
        """Print a marker; the browser itself is closed by the cleanup
        registered in setUp."""
        print("Tearing down the test")

    def test_drag(self):
        """Drag 'draggable' onto 'div2' and check the resulting message text."""
        element = self.driver.find_element(By.ID, 'draggable')
        target = self.driver.find_element(By.ID, 'div2')
        actions = ActionChains(self.driver)
        actions.drag_and_drop(element, target).perform()
        self.assertEqual('You are definitely not a bot!',
                         self.driver.find_element(By.ID, 'message').text)

# argv=[''] keeps unittest from interpreting the kernel's own command-line
# arguments; exit=False keeps it from calling sys.exit() after the run.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    # IPython magic: clears the variables defined in the interactive namespace.
    %reset 