### Data storage


In [None]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the landing page and parse it.
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')

# The logo is the <img> nested in the anchor that links to the site root;
# read its `src` attribute and download that file next to the script.
logoAnchor = bs.find('a', {'href': 'https://pythonscraping.com'})
imageLocation = logoAnchor.find('img')['src']
urlretrieve(imageLocation, 'logo.jpg')

# Output: the image is saved as logo.jpg in the directory the script is run from.

### Code for illustrative purposes only: running it risks damaging your computer through unwanted downloads


In [None]:
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Local folder (relative to the working directory) that receives the files.
downloadDirectory: str = 'downloaded'
# Root of the site being mirrored; URLs not containing it are skipped.
baseUrl: str = 'http://pythonscraping.com'


def getAbsoluteURL(baseUrl: str, source: str):
    """Normalise a `src` value into an absolute ``http://`` URL on the site.

    Args:
        baseUrl: Root URL of the site being scraped, without a trailing
            slash (e.g. ``'http://pythonscraping.com'``).
        source: Raw value of a tag's ``src`` attribute.

    Returns:
        The absolute URL with any leading ``www.`` removed, or ``None``
        when the resulting URL does not contain ``baseUrl`` (i.e. the
        resource is considered external).

    Note:
        Only ``http://`` and bare ``www.`` prefixes are recognised; any
        other value (including ``https://`` URLs) is treated as a path
        relative to ``baseUrl``.
    """
    httpWwwPrefix = 'http://www.'
    wwwPrefix = 'www.'

    if source.startswith(httpWwwPrefix):
        url = 'http://{}'.format(source[len(httpWwwPrefix):])
    elif source.startswith('http://'):
        url = source
    elif source.startswith(wwwPrefix):
        # Format the *stripped* value; formatting the raw source kept the
        # "www." and made the baseUrl check below reject internal links.
        url = 'http://{}'.format(source[len(wwwPrefix):])
    else:
        url = '{}/{}'.format(baseUrl, source)

    if baseUrl not in url:
        return None
    return url


def getDownloadPath(baseUrl: str, absoluteUrl: str, downloadDirectory: str):
    """Map an absolute URL to a local file path and create its directory.

    Args:
        baseUrl: Root URL of the site; removed from ``absoluteUrl``.
        absoluteUrl: URL of the resource to be saved.
        downloadDirectory: Local directory prefix that is prepended to the
            remaining URL path.

    Returns:
        The local path: ``downloadDirectory`` followed by ``absoluteUrl``
        with every ``'www.'`` and ``baseUrl`` occurrence removed.
    """
    path = absoluteUrl.replace('www.', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)

    # exist_ok avoids the check-then-create race; the guard skips the case
    # where the path has no directory part (os.makedirs('') would raise).
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path


# Collect every tag on the landing page that carries a `src` attribute.
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.findAll(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is None:
        # External resource: getAbsoluteURL rejected it.
        continue
    print(fileUrl)
    # Download inside the loop so every internal file is fetched, not only
    # the last one, and a None URL is never passed to urlretrieve.
    urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))

In [6]:
import csv

# newline='' lets the csv module control line endings itself (otherwise
# extra blank rows appear on platforms that translate '\n'); the `with`
# block closes the file even if a write fails.
with open('test.csv', 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=';')
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))

### HTML table to CSV


In [None]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/'
               'Comparison_of_text_editors')

bs = BeautifulSoup(html, 'html.parser')
# The main comparison table is currently the first table on the page
table = bs.findAll('table', {'class': 'wikitable'})[0]
rows = table.findAll('tr')

# newline='' is required by the csv module; explicit UTF-8 keeps non-ASCII
# cell text from failing on platforms whose default encoding is narrower.
with open('editors.csv', 'wt+', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        # Write once per table row, after all of its cells are collected
        # (writing inside the cell loop emitted growing partial rows).
        writer.writerow(csvRow)

### Integration with MySQL


In [None]:
import pymysql

# Connect to the local MySQL server as root with no password.
conn = pymysql.connect(host='127.0.0.1',
                       #    unix_socket='/tmp/mysql.sock',
                       user='root',
                       passwd=None,
                       db='mysql')

# Nested try/finally so the cursor and the connection are released even
# when one of the statements raises.
try:
    cur = conn.cursor()
    try:
        cur.execute('USE scraping')
        cur.execute('SELECT * FROM pages WHERE id=1')
        print(cur.fetchone())
    finally:
        cur.close()
finally:
    conn.close()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re

# Open the connection used by the crawler below. The charset is set
# explicitly because scraped article text is not limited to ASCII.
conn = pymysql.connect(host='127.0.0.1',
                       #    unix_socket='/tmp/mysql.sock',
                       user='root',
                       passwd=None,
                       db='mysql',
                       charset='utf8')

# One shared cursor, switched to the `scraping` schema.
cur = conn.cursor()
cur.execute("USE scraping")

# Seed the random walk from the current wall-clock time.
seedValue = datetime.datetime.now().timestamp()
random.seed(seedValue)


def store(title, content):
    """Insert one (title, content) row into `pages` and commit it.

    Uses the module-level cursor `cur`.

    Args:
        title: Page title to store.
        content: Page text to store.
    """
    # Placeholders must NOT be wrapped in quotes: the driver quotes and
    # escapes each parameter itself, so '"%s"' stored stray quote
    # characters around every value.
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)',
                (title, content))
    # Commit right away so each scraped page is persisted on its own.
    cur.connection.commit()


def getLinks(articleUrl):
    """Fetch a Wikipedia article, store it, and return its article links.

    The page title (first ``<h1>``) and the text of the first ``<p>``
    inside ``div#mw-content-text`` are passed to ``store``.

    Args:
        articleUrl: Path appended to ``http://en.wikipedia.org``
            (e.g. ``'/wiki/Kevin_Bacon'``).

    Returns:
        The ``<a>`` tags inside ``div#bodyContent`` whose ``href`` starts
        with ``/wiki/`` and contains no colon.
    """
    page = urlopen('http://en.wikipedia.org' + articleUrl)
    soup = BeautifulSoup(page, 'html.parser')

    heading = soup.find('h1')
    title = heading.get_text()

    contentDiv = soup.find('div', {'id': 'mw-content-text'})
    firstParagraph = contentDiv.find('p')
    content = firstParagraph.get_text()

    store(title, content)

    # Article links only: "/wiki/..." with no ":" anywhere in the path.
    articleLinkPattern = re.compile('^(/wiki/)((?!:).)*$')
    bodyDiv = soup.find('div', {'id': 'bodyContent'})
    return bodyDiv.findAll('a', href=articleLinkPattern)


# Random walk over Wikipedia: store the current article, then hop to a
# randomly chosen article link until a page with no such links is reached.
try:
    # The first fetch is inside the try so the cursor and connection are
    # still released if it fails.
    links = getLinks('/wiki/Kevin_Bacon')
    while links:
        newArticle = random.choice(links).attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()  # Always release the database connection

### Email


In [None]:
import smtplib
from email.mime.text import MIMEText


def sendMail(subject: str, body: str) -> None:
    """Send a plain-text e-mail through the SMTP server on localhost.

    The sender and recipient addresses are fixed placeholders.

    Args:
        subject: Value of the ``Subject`` header.
        body: Plain-text message body.
    """
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = 'from@email.com'
    msg['To'] = 'to@email.com'

    # The context manager issues QUIT and closes the socket even when
    # send_message raises, so the connection is never leaked.
    with smtplib.SMTP('localhost') as s:
        s.send_message(msg)