# [Webscraping](https://www.datacamp.com/courses/web-scraping-with-python)

In [2]:
# Scrapy - Selector Objects with XPath's
import scrapy

html = '''
            <html>
              <body>
                <div class="class1">
                  <p id="p1"> Hello World!</p>
                  <p id="p2"> Enjoy DataCamp!</p>
                  <p id="p3"> Try <a href="http://www.datacamp.com">DataCamp</a> today!</p>
                </div>
              </body>
            </html>
       '''

selector = scrapy.Selector(text=html)

a = '*' # Returns a list of all elements
b = '//p' # Returns a list of all paragraph elements
c = '/html/body/div/p[1]/text()'# Returns 'Hello World!'
d = '/html/body/div/p[2]/text()'# Returns 'Enjoy DataCamp!'
e = '//div[@class="class1"]' # Returns all div elements with class 'class1'
f = '//*[contains(@id,"p")]' # Returns all elements with a 'p' in their id
g = '/html/body/div/p[@id="p3"]/a/@href' # Returns www.datacamp.com
h = selector.xpath('/html/body').xpath('./div[1]/p').extract() # Same as selector.xpath('/html/body/div[1]/p')

selector.xpath(g).extract()

['http://www.datacamp.com']

In [107]:
# Scrapy - Select Objects with CSS Locator
import scrapy

html = '''
            <html>
              <body>
                <div class="hello datacamp">
                  <p id="p1"> Hello World!</p>
                  <p id="p2"> Enjoy DataCamp!</p>
                  <p id="p3"> Try <a href="http://www.datacamp.com">DataCamp</a> today!</p>
                </div>
              </body>
            </html>
       '''

a = '*' # Returns a list of all elements
b = ' p'# Returns a list of all paragraph elemnts
c = 'html > body > div > p:nth-of-type(1)::text' # Returns Hello World!
d = 'html > body > div > p:nth-of-type(2)::text' # Returns Enjoy DataCamp!
e = ' div.class1' # Returns all div elements with class 'class1'
f = ' p#p3 > a::attr(href)' # Returns www.datacamp.com

selector.css(e).extract()

['<div class="class1">\n                  <p id="p1"> Hello World!</p>\n                  <p id="p2"> Enjoy DataCamp!</p>\n                  <p id="p3"> Try <a href="http://www.datacamp.com">DataCamp</a> today!</p>\n                </div>']

In [3]:
# Scrapy - Create a Spider

import scrapy
from scrapy.crawler import CrawlerProcess

class DC_Chapter_Spider(scrapy.Spider):
    '''This scrapes https://www.datacamp.com/courses/all and returns a dictionary containing each class title and their list of courses'''

    name = "dc_chapter_spider"

    def start_requests(self):
        url = 'https://www.datacamp.com/courses/all'
        yield scrapy.Request(url=url,callback=self.parse_front)

    def parse_front(self,response):
        ## Code to parse the front courses page
        # Navigate to course blocks
        course_blocks = response.css('div.course-block')
        # Get list of course links
        links_to_follow = course_blocks.xpath('./a/@href').extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url,callback=self.parse_pages)

    def parse_pages(self,response):
        # Direct to the course title text
        crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext

dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

list(dc_dict.items())[:3]

2019-10-31 20:44:54 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-10-31 20:44:54 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.7.0, Python 3.7.4 (default, Aug 13 2019, 15:17:50) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-19.0.0-x86_64-i386-64bit
2019-10-31 20:44:54 [scrapy.crawler] INFO: Overridden settings: {}
2019-10-31 20:44:54 [scrapy.extensions.telnet] INFO: Telnet Password: ae4f6ac3ea7b0e63
2019-10-31 20:44:54 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2019-10-31 20:44:54 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.Dow

2019-10-31 20:45:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/joining-data-with-dplyr-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/financial-forecasting-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/creating-robust-python-workflows> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/introduction-to-data-visualization-with-ggplot2> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/time-series-with-datatable-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.da

2019-10-31 20:45:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/analyzing-iot-data-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/conditional-formatting-in-spreadsheets> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/improving-query-performance-in-postgresql> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/introduction-to-spark-sql> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/financial-modeling-in-spreadsheets> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datac

2019-10-31 20:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/analyzing-social-media-data-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/object-oriented-programming-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/advanced-dimensionality-reduction-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/longitudinal-analysis-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/fraud-detection-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp

2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/network-science-in-r-a-tidy-approach> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/analyzing-survey-data-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/ab-testing-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/intro-to-python-for-finance> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/hyperparameter-tuning-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/machine-learning-for

2019-10-31 20:45:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/hierarchical-and-mixed-effects-models> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/communicating-with-data-in-the-tidyverse> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/building-dashboards-with-flexdashboard> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/visualizing-big-data-with-trelliscope> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/intro-to-portfolio-risk-management-in-python> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:28 [scrapy.core.engine] DEBUG: Crawle

2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/data-visualization-in-r-with-lattice> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/extreme-gradient-boosting-with-xgboost> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/forecasting-using-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/writing-efficient-r-code> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/machine-learning-with-tree-based-models-in-r> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.c

2019-10-31 20:45:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/intro-to-python-for-data-science> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/data-visualization-with-ggplot2-2> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/intermediate-python-for-data-science> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/intermediate-r-practice> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/courses/data-visualization-with-ggplot2-1> (referer: https://www.datacamp.com/courses/all)
2019-10-31 20:45:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.datacamp.com/c

[('Introduction to R', []),
 ('Introduction to Data',
  ['Language of data',
   'Sampling strategies and experimental design',
   'Study types and cautionary tales',
   'Case study',
   'Language of data',
   'Study types and cautionary tales',
   'Sampling strategies and experimental design',
   'Case study']),
 ('Introduction to Data Visualization with Python',
  ['Customizing plots',
   'Statistical plots with Seaborn',
   'Plotting 2D arrays',
   'Analyzing time series and images',
   'Customizing plots',
   'Plotting 2D arrays',
   'Statistical plots with Seaborn',
   'Analyzing time series and images'])]

In [6]:
# BeautifulSoup
from bs4 import BeautifulSoup
import requests
import csv
import time

headers = {'User-Agent': 'Mozilla/5.0'}

url = "https://old.reddit.com/r/datascience/"

page = requests.get(url, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

attrs = {'class': 'thing', 'data-domain': 'self.datascience'}

post = soup.find_all('div', attrs=attrs)[0]

title = post.find('p', class_="title").text

author = post.find('a', class_='author').text

comments = post.find('a', class_='comments').text

likes = post.find("div", attrs={"class": "score likes"}).text

type(post)

2019-10-31 20:59:15 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): old.reddit.com:443
2019-10-31 20:59:16 [urllib3.connectionpool] DEBUG: https://old.reddit.com:443 "GET /r/datascience/ HTTP/1.1" 200 31412


bs4.element.Tag

In [None]:
# [Webscraping](https://www.datacamp.com/courses/web-scraping-with-python)

# Scrapy - Selector Objects with XPath's
import scrapy

html = '''
            <html>
              <body>
                <div class="class1">
                  <p id="p1"> Hello World!</p>
                  <p id="p2"> Enjoy DataCamp!</p>
                  <p id="p3"> Try <a href="http://www.datacamp.com">DataCamp</a> today!</p>
                </div>
              </body>
            </html>
       '''

selector = scrapy.Selector(text=html)

a = '*' # Returns a list of all elements
b = '//p' # Returns a list of all paragraph elements
c = '/html/body/div/p[1]/text()'# Returns 'Hello World!'
d = '/html/body/div/p[2]/text()'# Returns 'Enjoy DataCamp!'
e = '//div[@class="class1"]' # Returns all div elements with class 'class1'
f = '//*[contains(@id,"p")]' # Returns all elements with a 'p' in their id
g = '/html/body/div/p[@id="p3"]/a/@href' # Returns www.datacamp.com
h = selector.xpath('/html/body').xpath('./div[1]/p').extract() # Same as selector.xpath('/html/body/div[1]/p')

selector.xpath(g).extract()

# Scrapy - Select Objects with CSS Locator
import scrapy

html = '''
            <html>
              <body>
                <div class="hello datacamp">
                  <p id="p1"> Hello World!</p>
                  <p id="p2"> Enjoy DataCamp!</p>
                  <p id="p3"> Try <a href="http://www.datacamp.com">DataCamp</a> today!</p>
                </div>
              </body>
            </html>
       '''

a = '*' # Returns a list of all elements
b = ' p'# Returns a list of all paragraph elemnts
c = 'html > body > div > p:nth-of-type(1)::text' # Returns Hello World!
d = 'html > body > div > p:nth-of-type(2)::text' # Returns Enjoy DataCamp!
e = ' div.class1' # Returns all div elements with class 'class1'
f = ' p#p3 > a::attr(href)' # Returns www.datacamp.com

selector.css(e).extract()

# Scrapy - Create a Spider

import scrapy
from scrapy.crawler import CrawlerProcess

class DC_Chapter_Spider(scrapy.Spider):
    '''This scrapes https://www.datacamp.com/courses/all and returns a dictionary containing each class title and their list of courses'''

    name = "dc_chapter_spider"

    def start_requests(self):
        url = 'https://www.datacamp.com/courses/all'
        yield scrapy.Request(url=url,callback=self.parse_front)

    def parse_front(self,response):
        ## Code to parse the front courses page
        # Navigate to course blocks
        course_blocks = response.css('div.course-block')
        # Get list of course links
        links_to_follow = course_blocks.xpath('./a/@href').extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url,callback=self.parse_pages)

    def parse_pages(self,response):
        # Direct to the course title text
        crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext

dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

list(dc_dict.items())[:3]

# BeautifulSoup
from bs4 import BeautifulSoup
import requests
import csv
import time

headers = {'User-Agent': 'Mozilla/5.0'}

url = "https://old.reddit.com/r/datascience/"

page = requests.get(url, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

attrs = {'class': 'thing', 'data-domain': 'self.datascience'}

post = soup.find_all('div', attrs=attrs)[0]

title = post.find('p', class_="title").text

author = post.find('a', class_='author').text

comments = post.find('a', class_='comments').text

likes = post.find("div", attrs={"class": "score likes"}).text

type(post)