## Cleaning Dirty data

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(content, n):
    content = content.split(' ')
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

The getNgrams function takes in an input string, splits it into a sequence of words (assuming all words are separated by spaces), and adds the n-gram (in this case, a 2- gram) that each word starts into an array.

In [None]:
import re 

def getNgrams(content, n):
    content = re.sub('\n|[[\d+]]',' ',content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs =  BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams =  getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

Using regular expressions to remove escape characters (such as \n) and filtering to remove any Unicode characters, you can clean up the output somewhat:

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode("ascii", 'ignore')
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = []
    for sentence in content:
        ngrams.extend(getNgramsFromSentence(sentence, n))
    return(ngrams)

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
print(len(getNgrams(content, 2))) #

9535


  content = re.sub('\n|[[\d+\]]', ' ', content)


In [None]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)
    
print(getNgrams(content, 2))

## Natural Language Processing

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)


content = str(
      urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(),
              'utf-8')
ngrams = getNgrams(content, 3)
print(ngrams)

In [None]:
def isCommon(ngram):
    commonWords = ['THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY', 'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'AS', 'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'MORE', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL']
    for word in ngram:
        if word in commonWords:
            return True
    return False

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        if not isCommon(content[i:i+n]):
            output.append(content[i:i+n])
    return output

ngrams = getNgrams(content, 3)
print(ngrams)

In [9]:
def getFirstSentenceContaining(ngram, content):
    #print(ngram)
    sentences = content.upper().split(". ")
    for sentence in sentences: 
        if ngram in sentence:
            return sentence+'\n'
    return ""


print(getFirstSentenceContaining('EXCLUSIVE METALLIC CURRENCY', content))
print(getFirstSentenceContaining('EXECUTIVE DEPARTMENT', content))
print(getFirstSentenceContaining('GENERAL GOVERNMENT', content))
print(getFirstSentenceContaining('CALLED UPON', content))
print(getFirstSentenceContaining('CHIEF MAGISTRATE', content))

IF THERE IS ONE MEASURE BETTER CALCULATED THAN ANOTHER TO PRODUCE THAT STATE OF THINGS SO MUCH DEPRECATED BY ALL TRUE REPUBLICANS, BY WHICH THE RICH ARE DAILY ADDING TO THEIR HOARDS AND THE POOR SINKING DEEPER INTO PENURY, IT IS AN EXCLUSIVE METALLIC CURRENCY

SUCH A ONE WAS AFFORDED BY THE EXECUTIVE DEPARTMENT CONSTITUTED BY THE CONSTITUTION

THE GENERAL GOVERNMENT HAS SEIZED UPON NONE OF THE RESERVED RIGHTS OF THE STATES

CALLED FROM A RETIREMENT WHICH I HAD SUPPOSED WAS TO CONTINUE FOR THE RESIDUE OF MY LIFE TO FILL THE CHIEF EXECUTIVE OFFICE OF THIS GREAT AND FREE NATION, I APPEAR BEFORE YOU, FELLOW-CITIZENS, TO TAKE THE OATHS WHICH THE CONSTITUTION PRESCRIBES AS A NECESSARY QUALIFICATION FOR THE PERFORMANCE OF ITS DUTIES; AND IN OBEDIENCE TO A CUSTOM COEVAL WITH OUR GOVERNMENT AND WHAT I BELIEVE TO BE YOUR EXPECTATIONS I PROCEED TO PRESENT TO YOU A SUMMARY OF THE PRINCIPLES WHICH WILL GOVERN ME IN THE DISCHARGE OF THE DUTIES WHICH I SHALL BE CALLED UPON TO PERFORM.

IT WAS THE REM

In [10]:
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    sum = 0
    for word, value in wordList.items():
        sum += value
    return sum

def retrieveRandomWord(wordList):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    # Remove newlines and quotes
    text = text.replace('\n', ' ');
    text = text.replace('"', '');

    # Make sure punctuation marks are treated as their own "words,"
    # so that they will be included in the Markov chain
    punctuation = [',','.',';',':']
    for symbol in punctuation:
        text = text.replace(symbol, ' {} '.format(symbol));

    words = text.split(' ')
    # Filter out empty words
    words = [word for word in words if word != '']

    wordDict = {}
    for i in range(1, len(words)):
        if words[i-1] not in wordDict:
                # Create a new dictionary for this word
            wordDict[words[i-1]] = {}
        if words[i] not in wordDict[words[i-1]]:
            wordDict[words[i-1]][words[i]] = 0
        wordDict[words[i-1]][words[i]] += 1
    return wordDict

text = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt')
          .read(), 'utf-8')
wordDict = buildWordDict(text)

#Generate a Markov chain of length 100
length = 100
chain = ['I']
for i in range(0, length):
    newWord = retrieveRandomWord(wordDict[chain[-1]])
    chain.append(newWord)

print(' '.join(chain))

I have amply maintained their own sphere , or classed with the head and tolerant and fostering a union that 'most' of things so extensive Confederacy , it will . Men blinded by the remembrance of the acknowledged to the very remote period so distinctly drawn as it is the object . The presses in the influence of the personal liberty . It is the spoils and , to keep down a full of feeling , within the field , suggested by my opinion may be observed . Upward of the means in the character of a vestige of these patriots


In [None]:
import pymysql

conn = pymysql.connect(
    host='127.0.0.1', 
    unix_socket='/tmp/mysql.sock', 
    user='root', 
    passwd=input('Enter your password'), 
    db='mysql', 
    charset='utf8')

cur = conn.cursor()
cur.execute('USE wikipedia')

def getUrl(pageId):
    cur.execute('SELECT url FROM pages WHERE id = %s', (int(pageId)))
    return cur.fetchone()[0]

def getLinks(fromPageId):
    cur.execute('SELECT toPageId FROM links WHERE fromPageId = %s', (int(fromPageId)))
    if cur.rowcount == 0:
        return []
    return [x[0] for x in cur.fetchall()]

def searchBreadth(targetPageId, paths=[[1]]):
    newPaths = []
    for path in paths:
        links = getLinks(path[-1])
        for link in links:
            if link == targetPageId:
                return path + [link]
            else:
                newPaths.append(path+[link])
    return searchBreadth(targetPageId, newPaths)
                
nodes = getLinks(1)
targetPageId = 28624
pageIds = searchBreadth(targetPageId)
for pageId in pageIds:
    print(getUrl(pageId))

## Crawling through Forms and Logins

In [3]:
import requests

params = {'firstname': 'Kilua', 'lastname': 'Gon'}
r = requests.post("http://pythonscraping.com/pages/processing.php", data=params)
print(r.text)

Hello there,  !


In [None]:
import requests
params = {'email_addr': 'ryan.e.mitchell@gmail.com'}
r = requests.post("http://post.oreilly.com/client/o/oreilly/forms/quicksignup.cgi",
                   data=params)
print(r.text)

In [None]:
import requests

files = {'uploadFile': open('files/Python-logo.png', 'rb')}
r = requests.post('http://pythonscraping.com/pages/processing2.php', files=files)
print(r.text)

Uploading a file or an image to a website using the post method

In [9]:
import requests

params = {'username': 'KILUA', 'password': 'password'}
r = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', params)
print('Cookie is set to:')
print(r.cookies.get_dict())
print('Going to profile page...')
r = requests.get('http://pythonscraping.com/pages/cookies/profile.php', 
                 cookies=r.cookies)
print(r.text)

Cookie is set to:
{}
Going to profile page...
You're not logged into the site!<br>Visit <a href="login.html">the login page</a> to log in


Cookies keep track of who is logged in and what they have access to. The requests library has a built-in cookie jar that can be used to store cookies and send them with subsequent requests.

In [8]:
import requests

session = requests.Session()

params = {'username': 'Kilua', 'password': 'password'}
s = session.post('http://pythonscraping.com/pages/cookies/welcome.php', params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print('Going to profile page...')
s = session.get('http://pythonscraping.com/pages/cookies/profile.php')
print(s.text)

Cookie is set to:
{}
Going to profile page...
You're not logged into the site!<br>Visit <a href="login.html">the login page</a> to log in


In [7]:
import requests
from requests.auth import AuthBase
from requests.auth import HTTPBasicAuth

auth = HTTPBasicAuth('ryan', 'password')
r = requests.post(
    url='http://pythonscraping.com/pages/auth/login.php', auth=auth)
print(r.text)

<p>Hello ryan.</p><p>You entered password as your password.</p>


## JavaScript, Selenium, and PhantomJS

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='drivers/chromedriver', 
    options=chrome_options)
driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')
time.sleep(3)
print(driver.find_element(By.ID, 'content').text)
driver.close()

  driver = webdriver.Chrome(


Here is some important text you want to retrieve!
A button to click!


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='drivers/chromedriver',
    options=chrome_options)

driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')
try:
    element = WebDriverWait(driver, 10).until(
                       EC.presence_of_element_located((By.ID, 'loadedButton')))
finally:
    print(driver.find_element(By.ID, 'content').text)
    driver.close()

  driver = webdriver.Chrome(


Here is some important text you want to retrieve!
A button to click!


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
import time

def waitForLoad(driver):
    elem = driver.find_element(By.TAG_NAME,"html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            elem == driver.find_element(By.TAG_NAME ,"html")
        except StaleElementReferenceException:
            return
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='drivers/chromedriver',
    options=chrome_options)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
driver.close()

  driver = webdriver.Chrome(


Timing out after 10 seconds and returning
<html><head>
<title>The Destination Page!</title>

</head>
<body>
This is the page you are looking for!

</body></html>


In [2]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    executable_path='drivers/chromedriver', 
    options=chrome_options)
driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html')
try:
    bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located(
        (By.XPATH, '//body[contains(text(), "This is the page you are looking for!")]')))
    print(bodyElement.text)
except TimeoutException:
    print('Did not find the element')

  driver = webdriver.Chrome(


This is the page you are looking for!


PhantomJS is a headless browser that can be used to render JavaScript and execute AJAX requests. It is a good alternative to Selenium when you don’t need to interact with the page.

In [3]:
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
def waitForLoad(driver):
    elem = driver.find_element(By.TAG_NAME, "html") 
    count = 0
    while True:
        count += 1
        if count > 20:
            print('Timing out after 10 seconds and returning')
            return

        time.sleep(.5) 
        try:
            elem == driver.find_element(By.TAG_NAME,'html') 
        except StaleElementReferenceException:
            return

driver = webdriver.PhantomJS(executable_path='/Users/oscar/desktop/Android-kotlin/DataScience/phantomjs-2.1.1-macosx/bin/phantomjs') 
driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html') 
waitForLoad(driver)
print(driver.page_source)


AttributeError: module 'selenium.webdriver' has no attribute 'PhantomJS'

The error is likely caused by the fact that the PhantomJS project is no longer maintained and the library is not being updated. This means that it is not compatible with the latest version of Selenium or web browsers. As a result, the "PhantomJS" class is no longer available in the webdriver module.
One alternative solution is to use the headless chrome browser by using "Chrome" or "Firefox" webdrivers with the option '--headless'
Another alternative solution is to use other headless browser like "puppeteer"

In [None]:
%pip install pyppeteer

In [None]:
# using puppeteer 
import asyncio
from pyppeteer import launch

async def wait_for_load(page):
    await page.waitForSelector('html')

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://pythonscraping.com/pages/javascript/redirectDemo1.html')
    await wait_for_load(page)
    page_source = await page.content()
    print(page_source)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())


In this code, the pyppeteer library is used to interact with a headless version of Chromium (Chrome's open-source core) via the launch function. The newPage function is used to create a new page in the browser, and goto method is used to navigate to the URL. The waitForSelector function is used to wait for the element with the tag "html" to be loaded. Finally, the page source is obtained with the content() function and printed to the console. Once the script is done, the browser instance is closed using the close() function.

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
    
driver = webdriver.PhantomJS(executable_path=
        '/Users/oscar/desktop/Android-kotlin/DataScience/phantomjs-2.1.1-macosx/bin/phantomjs')

driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html')

try:
    bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located(
            (By.XPATH, '//body[contains(text(),
            "This is the page you are looking for!)]")))
    print(bodyElement.text) 
except TimeoutException:
    print('Did not find the element')

PhantomJs is no longer supported 

In [None]:
import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://pythonscraping.com/pages/javascript/redirectDemo1.html')
    try:
        body_element = await page.waitForXPath('//body[contains(text(),"This is the page you are looking for!)]', {'timeout': 15000})
        body_text = await page.evaluate('(element) => element.textContent', body_element)
        print(body_text)
    except TimeoutError:
        print('Did not find the element')
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())


In this code, the pyppeteer library is used to interact with a headless version of Chromium (Chrome's open-source core) via the launch function. The newPage function is used to create a new page in the browser, and goto method is used to navigate to the URL. The waitForXPath function is used to wait for the element with the xpath '//body[contains(text(),"This is the page you are looking for!)]' to be loaded and the 'evaluate' method is used to extract the text of the body element. Finally, the text is printed to the console. If the element is not found, a 'TimeoutError' will be raised and the message 'Did not find the element' will be printed. Once the script is done, the browser instance is closed using the close() function.