#### API --> API allow communication between sofwares written in different languages and have different structures <br>
eg : <br> 
Web browser communicating with a server

In [6]:
from urllib.request import urlopen
import json
from urllib.error import HTTPError


def getCountry(ipAddress):
    try:
        response = urlopen('http://freegeoip.net/json/'+ipAddress).read().decode('utf-8')
        responseJson = json.loads(response) 
        return responseJson.get('country_code')
    except HTTPError as e:
        # Handle the error
        print(f"An error occured: {e}")
    except Exception as e:
        # Handle other exceptions
        print(f"An error occured: {e}")
    
print(getCountry('50.78.253.58'))

An error occured: Expecting value: line 1 column 1 (char 0)
None


In [9]:
from urllib.request import urlopen
import json

def getCountry(ipAddress):
    try:
        response = urlopen('http://freegeoip.net/json/'+ipAddress).read().decode('utf-8')
        if response:
            responseJson = json.loads(response) 
            return responseJson.get('country_code')
        else:
            return None
    except HTTPError as e:
        # Handle the error
        print(f"An error occured: {e}")
    except json.decoder.JSONDecodeError as e:
        # Handle json.loads error
        print(f"An error occured: {e}")
    except Exception as e:
        # Handle other exceptions
        print(f"An error occured: {e}")
    
print(getCountry('50.78.253.58'))

An error occured: Expecting value: line 1 column 1 (char 0)
None


In [7]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).findAll('a', 
        href=re.compile('^(/wiki/)((?!:).)*$'))

def getHistoryIPs(pageUrl):
    #Format of revision history pages is: 
    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    pageUrl = pageUrl.replace('/wiki/', '')
    historyUrl = 'http://en.wikipedia.org/w/index.php?title={}&action=history'.format(pageUrl)
    print('history url is: {}'.format(historyUrl))
    html = urlopen(historyUrl)
    bs = BeautifulSoup(html, 'html.parser')
    #finds only the links with class "mw-anonuserlink" which has IP addresses 
    #instead of usernames
    ipAddresses = bs.findAll('a', {'class':'mw-anonuserlink'})
    addressList = set()
    for ipAddress in ipAddresses:
        addressList.add(ipAddress.get_text())
    return addressList

links = getLinks('/wiki/Python_(programming_language)')

while(len(links) > 0):
    for link in links:
        print('-'*20) 
        historyIPs = getHistoryIPs(link.attrs['href'])
        for historyIP in historyIPs:
            print(historyIP)

    newLink = links[random.randint(0, len(links)-1)].attrs['href']
    links = getLinks(newLink)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


--------------------
history url is: http://en.wikipedia.org/w/index.php?title=Programming_paradigm&action=history
2806:1016:d:54b6:8950:4501:c00b:507a
2806:108e:d:bd2c:a577:db4f:2867:2b5c
106.203.236.74
36.255.87.160
2409:4042:e8f:8d39:b50c:f4ca:91b8:eb9d
2405:201:2009:80b0:41bc:366f:a49c:52f2
85.107.66.187
172.115.220.47
2001:171b:226c:d550:d837:b286:5892:7f22
2405:201:400b:7058:b128:89fd:5248:f249
115.186.189.53
5.239.162.101
106.215.120.243
2603:6011:1100:a1d0:31bd:8a11:a0c8:e4c3
223.25.63.145
107.190.108.84
--------------------
history url is: http://en.wikipedia.org/w/index.php?title=Multi-paradigm_programming_language&action=history
75.139.254.117
98.197.198.46
--------------------
history url is: http://en.wikipedia.org/w/index.php?title=Object-oriented_programming&action=history
106.1.248.137
131.119.15.14
2a01:cb1e:75:31ab:a2:ba91:d0e:6965
103.243.67.86
82.151.115.17
2405:9800:b923:2b6:b4b7:730b:405b:a1e0
2a00:23c8:2c00:4d01:9d37:3187:6b3c:6d7b
115.241.201.77
2a00:23ee:1050:2

KeyboardInterrupt: 

In [8]:
def getCountry(ipAddress):
    try:
        response = urlopen(
            'http://freegeoip.net/json/{}'.format(ipAddress)).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson.get('country_code')
    
links = getLinks('/wiki/Python_(programming_language)')

while(len(links) > 0):
    for link in links:
        print('-'*20) 
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print('{} is from {}'.format(historyIP, country))

    newLink = links[random.randint(0, len(links)-1)].attrs['href']
    links = getLinks(newLink)

--------------------
history url is: http://en.wikipedia.org/w/index.php?title=Programming_paradigm&action=history


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [10]:
def getCountry(ipAddress):
    try:
        response = urlopen(
            'http://freegeoip.net/json/{}'.format(ipAddress)).read().decode('utf-8')
    except HTTPError as e:
        # Handle the error
        print(f"An error occured while making the request: {e}")
        return None
    try:
        responseJson = json.loads(response)
        return responseJson.get('country_code')
    except json.decoder.JSONDecodeError as e:
        # Handle json.loads error
        print(f"An error occured while decoding the JSON response: {e}")
        return None

links = getLinks('/wiki/Python_(programming_language)')

while(len(links) > 0):
    for link in links:
        print('-'*20) 
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print('{} is from {}'.format(historyIP, country))
            else:
                print(f"Could not find country for IP: {historyIP}")

    newLink = links[random.randint(0, len(links)-1)].attrs['href']
    links = getLinks(newLink)


--------------------
history url is: http://en.wikipedia.org/w/index.php?title=Programming_paradigm&action=history
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP: 2806:1016:d:54b6:8950:4501:c00b:507a
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP: 2806:108e:d:bd2c:a577:db4f:2867:2b5c
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP: 106.203.236.74
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP: 36.255.87.160
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP: 2409:4042:e8f:8d39:b50c:f4ca:91b8:eb9d
An error occured while decoding the JSON response: Expecting value: line 1 column 1 (char 0)
Could not find country for IP:

KeyboardInterrupt: 

#### API --> Application Programming Interface <br>
API is a set of functions and procedures that allow the creation of applications that access the features or data of an operating system, application, or other service. <br>
APIs are used when programming graphical user interface (GUI) components. <br>
APIs are also used extensively in web services. <br>
APIs can be classified into three main types based on their scope: <br>
- Application Programming Interface (API) <br>
- System Programming Interface (SPI) <br>
- Hardware Abstraction Layer (HAL) <br>



## Image Processing and Text Recognition

In [None]:
%pip install pytesseract

In [2]:
from PIL import Image
import pytesseract
from pytesseract import Output

print(pytesseract.image_to_data(Image.open('files/'
    'textOriginal.png'),
    output_type=Output.DICT))
print(pytesseract.image_to_string(Image.open('files/textOriginal.png'), output_type=Output.BYTES))

{'level': [1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5], 'page_num': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'block_num': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'par_num': [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'line_num': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2], 'word_num': [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6], 'left': [0, 23, 23, 23, 23, 76, 99, 164, 215, 295, 317, 378, 425, 467, 500, 555, 23, 23, 140, 198, 239, 304, 407], 'top': [0, 26, 26, 26, 26, 26, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 54, 55, 55, 59, 59, 55, 54], 'width': [600, 555, 555, 555, 44, 15, 57, 42, 70, 15, 52, 40, 32, 24, 45, 23, 513, 107, 49, 32, 57, 92, 129], 'height': [103, 51, 51, 22, 17, 17, 13, 21, 17, 17, 21, 17, 17, 17, 17, 22, 23, 17, 17, 13, 13, 22, 23], 'conf': [-1, -1, -1, -1, 96, 96, 96, 95, 96, 96, 96, 96, 96, 96, 96, 96

In [3]:
from PIL import Image
import pytesseract

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    #Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x < 143 else 255)
    image.save(newFilePath)
    return image

image = cleanFile('files/textBad.png', 'files/textCleaned.png')

#call tesseract to do OCR on the newly created image
print(pytesseract.image_to_string(image))

This is some text, written in Arial, that will be read by
Tesseract Here are some symbols: |@#$%*&"(}



In [None]:
from PIL import Image
import subprocess

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    #Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x<135 else 255)
    image.save(newFilePath)

    #call tesseract to do OCR on the newly created image
    subprocess.call(['tesseract', newFilePath, 'output'])
    
    #Open and read the resulting data file
    outputFile = open('output.txt', 'r')
    print(outputFile.read())
    outputFile.close()

cleanFile('files/textOriginal.png', 'files/text_2.png')

In [4]:
from PIL import  Image
import subprocess

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    #Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x < 135 else 255)
    image.save(newFilePath)

    # call tesseract to do OCR on the newly created image
    subprocess.call(['tesseract',newFilePath,'output'])

    # Open and read the resulting data file
    outputFile = open('output.txt', 'r')
    print(outputFile.read())
    outputFile.close()

cleanFile('files/textOriginal.png', 'files/text_2.png')

This is some text, written in Arial, that will be read by
Tesseract. Here are some symbols: !|@#$%*&*()



Estimating resolution as 218


This code is using the Python Imaging Library (PIL) to open an image file specified by the filePath variable, and then using the point() method to set a threshold value of 135 for the image. Any pixels with a value less than 135 are set to 0 (black), and any pixels with a value greater than or equal to 135 are set to 255 (white). The resulting image is then saved to the file specified by the newFilePath variable.

Then, the code uses the subprocess module to call the command-line program Tesseract OCR, passing in the path of the newly created image file and the output file name. Tesseract OCR is an optical character recognition tool that can be used to extract text from images.

After that, the code opens the output file, reads its content and prints it out. And finally, the output file is closed.

In [6]:
import pytesseract
from pytesseract import Output
from PIL import Image
import numpy as np

def cleanFile(filePath, threshold):
    image = Image.open(filePath)
    #Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x<threshold else 255)
    return image

def getConfidence(image):
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    text = data['text']
    confidences = []
    numChars = []
    
    for i in range(len(text)):
        if data['conf'][i] > -1:
            confidences.append(data['conf'][i])
            numChars.append(len(text[i]))
            
    return np.average(confidences, weights=numChars), sum(numChars)
    
filePath = 'files/textBad.png'

start = 80
step = 5
end = 200

for threshold in range(start, end, step):
    image = cleanFile(filePath, threshold)
    scores = getConfidence(image)
    print('threshold: {}, confidence: {}, numChars {}'.format(str(threshold), str(scores[0]), str(scores[1])))

threshold: 80, confidence: 48.666666666666664, numChars 18
threshold: 85, confidence: 64.65217391304348, numChars 23
threshold: 90, confidence: 45.5945945945946, numChars 37
threshold: 95, confidence: 39.48717948717949, numChars 39
threshold: 100, confidence: 56.310344827586206, numChars 58
threshold: 105, confidence: 52.943661971830984, numChars 71
threshold: 110, confidence: 68.16867469879519, numChars 83
threshold: 115, confidence: 73.82926829268293, numChars 82
threshold: 120, confidence: 75.52380952380952, numChars 84
threshold: 125, confidence: 82.45882352941176, numChars 85
threshold: 130, confidence: 75.22619047619048, numChars 84
threshold: 135, confidence: 87.78313253012048, numChars 83
threshold: 140, confidence: 83.44705882352942, numChars 85
threshold: 145, confidence: 88.72619047619048, numChars 84
threshold: 150, confidence: 81.7710843373494, numChars 83
threshold: 155, confidence: 87.72151898734177, numChars 79
threshold: 160, confidence: 84.275, numChars 80
threshold: 

This script uses the pytesseract library to perform OCR (Optical Character Recognition) on an image file located at 'filePath' variable. The image is first cleaned by setting a threshold value for the image, where any pixel with a value less than the threshold is set to 0 (black) and any pixel with a value greater than or equal to the threshold is set to 255 (white). This is done to make the text in the image more clear and easy for pytesseract to read.

Then the script loops through different threshold values ranging from 'start' to 'end' in increments of 'step'. For each threshold value, the script applies the threshold to the image and then uses pytesseract to extract text and confidence values from the image. The script then calculates the average confidence value, weighted by the number of characters in each extracted text, and the total number of characters extracted. The script then prints out the threshold value, the calculated average confidence, and the total number of characters extracted.

In [7]:
import time
from urllib.request import urlretrieve
from PIL import Image
import pytesseract
from selenium import webdriver
from PIL import Image

# Create new Selenium driver
driver = webdriver.Chrome(executable_path='drivers/chromedriver/chromedriver')

driver.get(
    'https://www.amazon.com/Death-Ivan-Ilyich-Nikolayevich-Tolstoy/dp/1427027277')
time.sleep(2)

# Click on the book preview button
driver.find_element_by_id('imgBlkFront').click()
imageList = []

# Wait for the page to load
time.sleep(5)

while 'pointer' in driver.find_element_by_id('sitbReaderRightPageTurner').get_attribute('style'):
    # While the right arrow is available for clicking, turn through pages
    driver.find_element_by_id('sitbReaderRightPageTurner').click()
    time.sleep(2)
    # Get any new pages that have loaded (multiple pages can load at once,
    # but duplicates will not be added to a set)
    pages = driver.find_elements_by_xpath(
        '//div[@class=\'pageImage\']/div/img')
    if not len(pages):
        print('No pages found')
    for page in pages:
        image = page.get_attribute('src')
        print('Found image: {}'.format(image))
        if image not in imageList:
            urlretrieve(image, 'page.jpg')
            imageList.append(image)
            print(pytesseract.image_to_string(Image.open('page.jpg')))

driver.quit()

  driver = webdriver.Chrome(executable_path='drivers/chromedriver/chromedriver')


AttributeError: 'WebDriver' object has no attribute 'find_element_by_id'

In [10]:
%pip install -U selenium

Collecting selenium
  Downloading selenium-4.8.0-py3-none-any.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 19 kB/s eta 0:00:0104     |████████                        | 1.6 MB 9.9 kB/s eta 0:07:59
Installing collected packages: selenium
  Attempting uninstall: selenium
    Found existing installation: selenium 4.7.2
    Uninstalling selenium-4.7.2:
      Successfully uninstalled selenium-4.7.2
Successfully installed selenium-4.8.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import time
from urllib.request import urlretrieve
from PIL import Image
import pytesseract
from selenium import webdriver

# Create new Selenium driver
driver = webdriver.Chrome(executable_path='drivers/chromedriver.exe')

driver.get(
    'https://www.amazon.com/Death-Ivan-Ilyich-Nikolayevich-Tolstoy/dp/1427027277')
time.sleep(2)

# Click on the book preview button
driver.find_element_by_id('imgBlkFront').click()
image_list = []

# Wait for the page to load
time.sleep(5)

while True:
    try:
        # While the right arrow is available for clicking, turn through pages
        driver.find_element_by_id('sitbReaderRightPageTurner').click()
        time.sleep(2)
        # Get any new pages that have loaded (multiple pages can load at once,
        # but duplicates will not be added to a set)
        pages = driver.find_elements_by_xpath(
            '//div[@class=\'pageImage\']/div/img')
        if not pages:
            print('No pages found')
        for page in pages:
            image = page.get_attribute('src')
            print('Found image: {}'.format(image))
            if image not in image_list:
                urlretrieve(image, 'page.jpg')
                image_list.append(image)
                print(pytesseract.image_to_string(Image.open('page.jpg')))
    except:
        break

driver.quit()


  driver = webdriver.Chrome(executable_path='drivers/chromedriver.exe')


AttributeError: 'WebDriver' object has no attribute 'find_element_by_id'

In [8]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps

def cleanImage(imagePath):
    image = Image.open(imagePath)
    image = image.point(lambda x: 0 if x<143 else 255)
    borderImage = ImageOps.expand(image,border=20,fill='white')
    borderImage.save(imagePath)

html = urlopen('http://www.pythonscraping.com/humans-only')
bs = BeautifulSoup(html, 'html.parser')
#Gather prepopulated form values
imageLocation = bs.find('img', {'title': 'Image CAPTCHA'})['src']
formBuildId = bs.find('input', {'name':'form_build_id'})['value']
captchaSid = bs.find('input', {'name':'captcha_sid'})['value']
captchaToken = bs.find('input', {'name':'captcha_token'})['value']

captchaUrl = 'http://pythonscraping.com'+imageLocation
urlretrieve(captchaUrl, 'captcha.jpg')
cleanImage('captcha.jpg')
p = subprocess.Popen(['tesseract', 'captcha.jpg', 'captcha'], stdout=
    subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open('captcha.txt', 'r')

#Clean any whitespace characters
captchaResponse = f.read().replace(' ', '').replace('\n', '')
print('Captcha solution attempt: '+captchaResponse)

if len(captchaResponse) == 5:
    params = {'captcha_token':captchaToken, 'captcha_sid':captchaSid,   
              'form_id':'comment_node_page_form', 'form_build_id': formBuildId, 
              'captcha_response':captchaResponse, 'name':'Ryan Mitchell', 
              'subject': 'I come to seek the Grail', 
              'comment_body[und][0][value]': 
               '...and I am definitely not a bot'}
    r = requests.post('http://www.pythonscraping.com/comment/reply/10', 
                          data=params)
    responseObj = BeautifulSoup(r.text, 'html.parser')
    if responseObj.find('div', {'class':'messages'}) is not None:
        print(responseObj.find('div', {'class':'messages'}).get_text())
else:
    print('There was a problem reading the CAPTCHA correctly!')

HTTPError: HTTP Error 404: Not Found

### Scraping Traps

In [None]:
import requests
from bs4 import BeautifulSoup

session = requests.Session()
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)'\
           'AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
           'Accept':'text/html,application/xhtml+xml,application/xml;'\
           'q=0.9,image/webp,*/*;q=0.8'}
url = 'https://www.whatismybrowser.com/'\
'developers/what-http-headers-is-my-browser-sending'
req = session.get(url, headers=headers)

bs = BeautifulSoup(req.text, 'html.parser')
print(bs.find('table',{'class':'table-striped'}).get_text)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='drivers/chromedriver', 
    chrome_options=chrome_options)
driver.get('http://pythonscraping.com')
driver.implicitly_wait(1)
print(driver.get_cookies())

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(
    executable_path='drivers/chromedriver', 
    chrome_options=chrome_options)
driver.get('http://pythonscraping.com')
driver.implicitly_wait(1)

savedCookies = driver.get_cookies()
print(savedCookies)

driver2 = webdriver.Chrome(
    executable_path='drivers/chromedriver',
    chrome_options=chrome_options)

driver2.get('http://pythonscraping.com')
driver2.delete_all_cookies()
for cookie in savedCookies:
    driver2.add_cookie(cookie)

driver2.get('http://pythonscraping.com')
driver.implicitly_wait(1)
print(driver2.get_cookies())

In [None]:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.chrome.options import Options

driver = webdriver.Chrome(
    executable_path='drivers/chromedriver',
    chrome_options=chrome_options)
driver.get('http://pythonscraping.com/pages/itsatrap.html')
links = driver.find_elements_by_tag_name('a')
for link in links:
    if not link.is_displayed():
        print('The link {} is a trap'.format(link.get_attribute('href')))

fields = driver.find_elements_by_tag_name('input')
for field in fields:
    if not field.is_displayed():
        print('Do not change value of {}'.format(field.get_attribute('name')))

### Testing

In [None]:
import unittest

class TestAddition(unittest.TestCase):
    def setUp(self):
        print('Setting up the test')

    def tearDown(self):
        print('Tearing down the test')

    def test_twoPlusTwo(self):
        total = 2+2
        self.assertEqual(4, total);

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest

class TestWikipedia(unittest.TestCase):
    bs = None
    def setUpClass():
        url = 'http://en.wikipedia.org/wiki/Monty_Python'
        TestWikipedia.bs = BeautifulSoup(urlopen(url), 'html.parser')

    def test_titleText(self):
        pageTitle = TestWikipedia.bs.find('h1').get_text()
        self.assertEqual('Monty Python', pageTitle);

    def test_contentExists(self):
        content = TestWikipedia.bs.find('div',{'id':'mw-content-text'})
        self.assertIsNotNone(content)


if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest
import re
import random
from urllib.parse import unquote

class TestWikipedia(unittest.TestCase):

    def test_PageProperties(self):
        self.url = 'http://en.wikipedia.org/wiki/Monty_Python'
        #Test the first 10 pages we encounter
        for i in range(1, 10):
            self.bs = BeautifulSoup(urlopen(self.url), 'html.parser')
            titles = self.titleMatchesURL()
            self.assertEqual(titles[0], titles[1])
            self.assertTrue(self.contentExists())
            self.url = self.getNextLink()
        print('Done!')

    def titleMatchesURL(self):
        pageTitle = self.bs.find('h1').get_text()
        urlTitle = self.url[(self.url.index('/wiki/')+6):]
        urlTitle = urlTitle.replace('_', ' ')
        urlTitle = unquote(urlTitle)
        return [pageTitle.lower(), urlTitle.lower()]

    def contentExists(self):
        content = self.bs.find('div',{'id':'mw-content-text'})
        if content is not None:
            return True
        return False

    def getNextLink(self):
        # Returns random link on page, using technique from Chapter 3
        links = self.bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
        randomLink = random.SystemRandom().choice(links)
        return 'https://wikipedia.org{}'.format(randomLink.attrs['href'])
    

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(
    executable_path='drivers/chromedriver',
    options=chrome_options)
driver.get('http://pythonscraping.com/pages/files/form.html')

firstnameField = driver.find_element_by_name('firstname')
lastnameField = driver.find_element_by_name('lastname')
submitButton = driver.find_element_by_id('submit')

### METHOD 1 ###
firstnameField.send_keys('Ryan')
lastnameField.send_keys('Mitchell')
submitButton.click()
################

### METHOD 2 ###
#actions = ActionChains(driver).click(firstnameField).send_keys('Ryan').click(lastnameField).send_keys('Mitchell').send_keys(Keys.RETURN)
#actions.perform()
################

print(driver.find_element_by_tag_name('body').text)

driver.close()

In [None]:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
import unittest


class TestAddition(unittest.TestCase):
    driver = None

    def setUp(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(
            executable_path='drivers/chromedriver',
            options=chrome_options)
        url = 'http://pythonscraping.com/pages/javascript/draggableDemo.html'
        self.driver.get(url)

    def tearDown(self):
        self.driver.close()

    def test_drag(self):
        element = self.driver.find_element_by_id("draggable")
        target = self.driver.find_element_by_id("div2")
        actions = ActionChains(self.driver)
        actions.drag_and_drop(element, target).perform()
        self.assertEqual("You are definitely not a bot!",
                         self.driver.find_element_by_id("message").text)

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

In [None]:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
import unittest

class TestDragAndDrop(unittest.TestCase):
    driver = None
    def setUp(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(
            executable_path='drivers/chromedriver',
            options=chrome_options)
        url = 'http://pythonscraping.com/pages/javascript/draggableDemo.html'
        self.driver.get(url)

    def tearDown(self):
        self.driver.close()

    def test_drag(self):
        element = self.driver.find_element_by_id('draggable')
        target = self.driver.find_element_by_id('div2')
        actions = ActionChains(self.driver)
        actions.drag_and_drop(element, target).perform()
        self.assertEqual('You are definitely not a bot!',
            self.driver.find_element_by_id('message').text)

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)
    %reset

### Scraping Remotely

In [None]:
# Must have the TOR service running on port 9150 while running this
import socks
import socket
from urllib.request import urlopen

socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
socket.socket = socks.socksocket
print(urlopen('http://icanhazip.com').read())

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--proxy-server=socks5://127.0.0.1:9150")
driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chrome_options)

driver.get('http://icanhazip.com')
print(driver.page_source)
driver.close()