In [1]:
from scrapy import Selector
html='''
<html>
  <body>
    <div id="div1" class="class-1">
      <p class="class-1 class-2">Hello World!</p>
      <div id="div2">
        <p id="p2" class="class-2">Choose 
            <a href="http://datacamp.com">DataCamp!</a>!
        </p>
      </div>
    </div>
    <div id="div3" class="class-2">
      <p class="class-2">Thanks for Watching!</p>
    </div>
  </body>
</html>
'''

# Wrap the HTML string in a Selector so it can be queried with XPath
sel = Selector(text=html)

# SelectorList of every p element in the document
# (the variable is called `divs`, but the query '//p' selects p elements)
divs = sel.xpath('//p')

# Chained query: all div elements first, then the p elements that are
# direct children of each of those divs, extracted as HTML strings
ps = sel.xpath('//div').xpath('./p').extract()

# '/text()' keeps only the text nodes sitting directly inside each p ...
divsText = sel.xpath('//p/text()').extract()

# ... while '//text()' also picks up text of nested elements (the <a> in #p2)
divsTexts = sel.xpath('//p//text()').extract()

# Show the four results, separated by a row of asterisks
print(divs, ps, divsText, divsTexts, sep='\n****************\n')

[<Selector xpath='//p' data='<p class="class-1 class-2">Hello Worl...'>, <Selector xpath='//p' data='<p id="p2" class="class-2">Choose \n  ...'>, <Selector xpath='//p' data='<p class="class-2">Thanks for Watchin...'>]
****************
['<p class="class-1 class-2">Hello World!</p>', '<p id="p2" class="class-2">Choose \n            <a href="http://datacamp.com">DataCamp!</a>!\n        </p>', '<p class="class-2">Thanks for Watching!</p>']
****************
['Hello World!', 'Choose \n            ', '!\n        ', 'Thanks for Watching!']
****************
['Hello World!', 'Choose \n            ', 'DataCamp!', '!\n        ', 'Thanks for Watching!']


In [2]:
url = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'

# Import requests
import requests

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on a 4xx/5xx answer
# instead of silently counting the elements of an error page.
listing_page = requests.get(url, timeout=30)
listing_page.raise_for_status()

# Raw body (bytes) of the HTML source
html = listing_page.content

# Create the Selector object sel from html
sel = Selector(text=html)

# Print out the number of elements in the HTML document
# ('//*' matches every element node at any depth)
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath('//*')))

There are 1020 elements in the HTML document.
You have found:  1020


## XPath vs CSS Locator

In [3]:
# Pair 1 - XPath form: all a elements at any depth below the first span
# child of body
xpath = '/html/body/span[1]//a'

# Pair 1 - CSS Locator written as the equivalent of the XPath above
css_locator = 'html>body>span:nth-of-type(1) a'

# Pair 2 - XPath form: h4 elements at any depth below a span that is a
# direct child of the div with id "uid"
xpath = '//div[@id="uid"]/span//h4'

# Pair 2 - CSS Locator written as the equivalent of the XPath above
css_locator = 'div#uid > span h4'

# Chaining works with both query styles.
# NOTE: the two lines below are not strictly the same query: './p' only
# matches p elements that are direct children of each div, while css('p')
# is translated to a descendant-or-self:: query and matches p at any depth.
ps = sel.xpath( '//div' ).xpath( './p' )
ps = sel.css( 'div' ).css( 'p' )

# All text inside the p element with id "p3", including the text of its
# descendant elements
xpath = '//p[@id="p3"]//text()'

# CSS form of the same query: the space before ::text includes the text of
# descendant elements
css_locator = 'p#p3 ::text'

# Only the text that sits directly inside the p element (no descendants)
xpath = '//p[@id="p3"]/text()'

# CSS form of the same query: no space before ::text, so no descendants
css_locator = 'p#p3::text'

In [4]:
# href attribute of every a element that is a direct child of an li with
# class mobile-nav__item, written once as a CSS Locator and once as XPath
xpath = '//li[@class="mobile-nav__item"]/a/@href'
css_locator = 'li.mobile-nav__item > a::attr(href)'

# First pass prints the two SelectorLists, second pass the extracted strings
for as_strings in (False, True):
    for query, locator in ((sel.css, css_locator), (sel.xpath, xpath)):
        matches = query(locator)
        print(matches.extract() if as_strings else matches)

[<Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/courses'>, <Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/tracks/skill'>, <Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/tracks/career'>, <Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/instructors'>, <Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/pricing'>, <Selector xpath="descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' mobile-nav__item ')]/a/@href" data='/groups/business'>, <Selector xpath="descendant-or-self::li[@class and contain

In [5]:
# Text nodes directly inside the symbol element with id "command",
# as a CSS Locator and as the matching XPath string
css_locator = 'symbol#command::text'
xpath = '//symbol[@id="command"]/text()'

# Show each SelectorList first, then the strings extracted from it
css_matches = sel.css(css_locator)
print(css_matches)
xpath_matches = sel.xpath(xpath)
print(xpath_matches)
print(css_matches.extract())
print(xpath_matches.extract())

[<Selector xpath="descendant-or-self::symbol[@id = 'command']/text()" data='\n\t\t\t'>, <Selector xpath="descendant-or-self::symbol[@id = 'command']/text()" data='\n\t\t'>]
[<Selector xpath='//symbol[@id="command"]/text()' data='\n\t\t\t'>, <Selector xpath='//symbol[@id="command"]/text()' data='\n\t\t'>]
['\n\t\t\t', '\n\t\t']
['\n\t\t\t', '\n\t\t']


In [6]:
from scrapy.http import HtmlResponse

url_var = 'https://www.datacamp.com/courses/all'

# Download the page. The timeout prevents an indefinite hang, and
# raise_for_status() fails fast on a 4xx/5xx answer so an error page is
# never wrapped in the HtmlResponse below.
courses_page = requests.get(url_var, timeout=30)
courses_page.raise_for_status()
html = courses_page.content

# Wrap the downloaded bytes in a Scrapy response object, so the same
# .url / .xpath() / .css() API a spider callback receives can be used here
response = HtmlResponse(url=url_var, body=html)

# Get the URL to the website loaded in response
this_url = response.url

# Get the title of the website loaded in response
this_title = response.xpath('//title/text()').extract_first()

print( this_url, '\n', this_title )

https://www.datacamp.com/courses/all 
 Data Science Courses: R & Python Analysis Tutorials | DataCamp


In [7]:
# SelectorList of every div carrying the course-block class
divs = response.css('div.course-block')

# Indexing the SelectorList gives back a single Selector
first_div = divs[0]

# Query only inside that first div and keep the first h4 text found
h4_matches = first_div.css('h4::text')
h4_text = h4_matches.extract_first()

# Report the result
print("The text from the h4 element is:", h4_text)

The text from the h4 element is: Introduction to R


#### The variable `divs` is a SelectorList, so its first element (`first_div`) is a Selector object.

In [22]:
# Import scrapy library
import scrapy

# Address of the short course listing requested by DCspider.start_requests
# (the same address as `url` in the earlier requests cell)
url_short = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'

# Create the Spider class
class DCspider( scrapy.Spider ):
    """Spider that collects the course author names from a course listing page."""

    name = 'dcspider'

    def start_requests( self ):
        """Yield one request for the module-level ``url_short``, handled by ``parse``."""
        listing_request = scrapy.Request( url = url_short, callback = self.parse )
        yield listing_request

    def parse( self, response ):
        """Return the text of every ``p.course-block__author-name`` element as a list."""
        author_selectors = response.css( 'p.course-block__author-name::text' )
        # The plain list is returned as-is so the method can be called by hand
        return author_selectors.extract()

# Call parse() by hand on the `response` built in the HtmlResponse cell
# (no crawl is started here), then show the first four author names
a = DCspider().parse(response)
print(a[:4])

['Jonathan Cornelissen', 'Filip Schouwenaars', 'Gilles Inghelbrecht', 'Nick Carchedi']


In [1]:
# Import scrapy
import scrapy

# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DCdescr( scrapy.Spider ):
    """Spider that prints the description of every course linked from the listing page.

    Callback chain: ``start_requests`` -> ``parse`` (listing page) ->
    ``parse_descr`` (once per followed course link).
    """
    name = 'dcdescr'

    # start_requests method
    def start_requests( self ):
        """Yield the single starting request for the module-level ``url_short``."""
        yield scrapy.Request( url = url_short, callback = self.parse )

    # First parse method
    def parse( self, response ):
        """Follow the link found directly inside each ``div.course-block``."""
        links = response.css( 'div.course-block > a::attr(href)' ).extract()
        # Follow each of the extracted links
        for link in links:
            yield response.follow( url = link, callback = self.parse_descr )

    # Second parsing method
    def parse_descr( self, response ):
        """Print the first ``p.course__description`` text found on a course page.

        Pages without such a text node are logged and skipped; indexing an
        empty extracted list here would raise IndexError inside the callback.
        """
        # extract_first() gives None instead of failing when nothing matched
        course_descr = response.css( 'p.course__description::text' ).extract_first()
        if course_descr is None:
            self.logger.warning( 'No course description found on %s', response.url )
            return
        # For now, just print the course description
        print( course_descr )

# Listing page read by DCdescr.start_requests; it only has to exist by the
# time the crawl starts, which is why it can be defined after the class
url_short = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'
# Run the Spider: create the process, register the spider class, start crawling
# NOTE(review): start() presumably runs a Twisted reactor that cannot be
# started twice in one kernel - confirm; re-running this cell likely needs a
# kernel restart first.
process = CrawlerProcess()
process.crawl(DCdescr)
process.start()



2020-05-09 19:23:33 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-05-09 19:23:33 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.7, Platform Windows-10-10.0.18362-SP0
2020-05-09 19:23:34 [scrapy.crawler] INFO: Overridden settings: {}
2020-05-09 19:23:34 [scrapy.extensions.telnet] INFO: Telnet Password: a420ca248b78d0e6
2020-05-09 19:23:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-05-09 19:23:35 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaul

In this introduction to R, you will master the basics of this beautiful open source language, including factors, lists and data frames. With the knowledge gained in this course, you will be ready to undertake your first very own data analysis. With over 2 million users worldwide R is rapidly becoming the leading programming language in statistics and data science. Every year, the number of R users grows by 40% and an increasing number of organizations are using it in their day-to-day activities. Leverage the power of R by completing this free R online course today!


2020-05-09 19:23:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/60f4cecb02a7e8e78c74643f095e3c913348da9b/dplyr-data-manipulation-r-tutorial> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/537eff88062cdcedb77b51d1622aa7675e2baf21/data-table-data-manipulation-r-tutorial> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


It's commonly said that data scientists spend 80% of their time cleaning and manipulating data and only 20% of their time actually analyzing it. For this reason, it is critical to become familiar with the data cleaning process and all of the tools available to you along the way. This course provides a very basic introduction to cleaning data in R using the tidyr, dplyr, and stringr packages. After taking the course you'll be able to go from raw data to awesome insights as quickly and painlessly as possible!


2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/9f9f9be002e7c66df8c1733b8796943fa77b2236/introduction-to-machine-learning-with-r> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


In this interactive tutorial, you will learn how to perform sophisticated dplyr techniques to carry out your data manipulation with R. First you will master the five verbs of R data manipulation with dplyr: select, mutate, filter, arrange and summarise. Next, you will learn how you can chain your dplyr operations using the pipe operator of the magrittr package. In the final section, the focus is on practicing how to subset your data using the group_by function, and how you can access data stored outside of R in a database. All said and done, you will be familiar with data manipulation tools and techniques that will allow you to efficiently manipulate data.
The R data.table package is rapidly making its name as the number one choice for handling large datasets in R. This online data.table tutorial will bring you from data.table novice to expert in no time. Once you are introduced to the general form of a data.table query, you will learn the techniques to subset your data.table, how to u

2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/5938f64803ca4a1275f63bf68ac3d300cd9f1c4a/intro-to-python-for-data-science> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/77b65bcd1c82bf793e2de583c861a077dcec2246/ggvis-data-visualization-r-tutorial> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/6d40286d4abe480763ff8e8ac2246c01861f8c27/intermediate-r-practice> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short

This online machine learning course is perfect for those who have a solid basis in R and statistics, but are complete beginners with machine learning. After a broad overview of the discipline's most common techniques and applications, you'll gain more insight into the assessment and training of different machine learning models. The rest of the course is dedicated to a first reconnaissance with three of the most basic machine learning tasks: classification, regression and clustering.


2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/844f6aa70cafd81cbc92f344baff641173604229/reporting-with-r-markdown> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:23:38 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://assets.datacamp.com/courses/predicting-customer-churn-in-python> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


Python is a general-purpose programming language that is becoming more and more popular for doing data science. Companies worldwide are using Python to harvest insights from their data and get a competitive edge. Unlike any other Python tutorial, this course focuses on Python specifically for data science. In our Intro to Python class, you will learn about powerful ways to store and manipulate data as well as cool data science tools to start your own analyses. Enter DataCamp’s online Python curriculum.


2020-05-09 19:23:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/97b54f43a3d96a7c35defb3c757ecf5471152941/intermediate-r> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


Learn to create interactive graphs to display distributions, relationships, model fits, and more using ggvis.
This follow-up course on Intermediate R does not cover new programming concepts. Instead, you will strengthen your knowledge of the topics in Intermediate R with a bunch of new and fun exercises.


2020-05-09 19:23:39 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://assets.datacamp.com/courses/predicting-customer-churn-in-python>: HTTP status code is not handled or not allowed


Learn how to write a data report quickly and effectively with the R Markdown package, and share your results with your friends, colleagues or the rest of the world.  Learn how you can author your own R Markdown reports, and how to automate the reporting process so that you have your own reproducible reports. By the end of the interactive data analysis reporting tutorial, you will be able to generate reports straight from your R code, documenting your work — and its results — as an HTML, pdf, slideshow or Microsoft Word document.


2020-05-09 19:23:39 [scrapy.core.engine] INFO: Closing spider (finished)
2020-05-09 19:23:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5215,
 'downloader/request_count': 12,
 'downloader/request_method_count/GET': 12,
 'downloader/response_bytes': 2466139,
 'downloader/response_count': 12,
 'downloader/response_status_count/200': 11,
 'downloader/response_status_count/403': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2020, 5, 9, 17, 23, 39, 734539),
 'httperror/response_ignored_count': 1,
 'httperror/response_ignored_status_count/403': 1,
 'log_count/DEBUG': 12,
 'log_count/INFO': 10,
 'request_depth_max': 1,
 'response_received_count': 12,
 'scheduler/dequeued': 12,
 'scheduler/dequeued/memory': 12,
 'scheduler/enqueued': 12,
 'scheduler/enqueued/memory': 12,
 'start_time': datetime.datetime(2020, 5, 9, 17, 23, 36, 97960)}
2020-05-09 19:23:39 [scrapy.core.engine] INFO: Spider closed (finished)


The intermediate R course is the logical next stop on your journey in the R programming language. In this R training you will learn about conditional statements, loops and functions to power your own R scripts. Next, you can make your R code more efficient and readable using the apply functions. Finally, the utilities chapter gets you up to speed with regular expressions in the R programming language, data structure manipulations and times and dates. This R tutorial will allow you to learn R and take the next step in advancing your overall knowledge and capabilities while programming in R.


In [1]:
# Import scrapy
import scrapy

# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DC_Chapter_Spider(scrapy.Spider):
    """Spider that records the chapter titles of every course on the listing page.

    Results are written into the module-level ``dc_dict`` as
    ``{course title: [chapter titles]}``.
    """
    name = "dc_chapter_spider"

    # start_requests method
    def start_requests(self):
        """Yield the single starting request for the module-level ``url_short``."""
        yield scrapy.Request(url = url_short,
                             callback = self.parse_front)

    # First parsing method
    def parse_front(self, response):
        """Follow the link found directly inside each ``div.course-block``."""
        course_blocks = response.css('div.course-block')
        course_links = course_blocks.xpath('./a/@href')
        links_to_follow = course_links.extract()
        for url in links_to_follow:
            yield response.follow(url = url,
                                  callback = self.parse_pages)

    # Second parsing method
    def parse_pages(self, response):
        """Store the chapter titles of one course page under its course title.

        A page without an h1 whose class contains "title" is logged and
        skipped; calling ``.strip()`` on the missing title would raise
        AttributeError inside the callback.
        """
        crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
        crs_title_ext = crs_title.extract_first()
        if crs_title_ext is None:
            self.logger.warning('No course title found on %s', response.url)
            return
        crs_title_ext = crs_title_ext.strip()
        ch_titles = response.css('h4.chapter__title::text')
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # dc_dict is the module-level dictionary shared by all callbacks
        dc_dict[ crs_title_ext ] = ch_titles_ext
        print(dc_dict)

# Initialize the dictionary **outside** of the Spider class:
# DC_Chapter_Spider.parse_pages writes into this module-level dict,
# using the course title as key and the list of chapter titles as value
dc_dict = dict()
# Listing page read by DC_Chapter_Spider.start_requests
url_short = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'

# Run the Spider: create the process, register the spider class, start crawling
# NOTE(review): start() presumably runs a Twisted reactor that cannot be
# started twice in one kernel - confirm; re-running this cell likely needs a
# kernel restart first.
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()


2020-05-09 19:09:44 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-05-09 19:09:44 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.7, Platform Windows-10-10.0.18362-SP0
2020-05-09 19:09:44 [scrapy.crawler] INFO: Overridden settings: {}
2020-05-09 19:09:45 [scrapy.extensions.telnet] INFO: Telnet Password: 6363987dfcaf38b7
2020-05-09 19:09:45 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-05-09 19:09:46 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaul

{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tidying data', 'Putting it all together', 'Introduction and exploring raw data', 'Tidying data', 'Preparing data for analysis', 'Putting it all together']}
{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tidying data', 'Putting it all together', 'Introduction and exploring raw data', 'Tidying data', 'Preparing data for analysis', 'Putting it all together'], 'Data Manipulation in R with dplyr': ['Introduction to dplyr and tbls', 'Filter and arrange', 'Group_by and working with databases', 'Select and mutate', 'Summarize and the pipe operator', 'Introduction to dplyr and tbls', 'Select and mutate', 'Filter and arrange', 'Summarize and the pipe operator', 'Group_by and working with databases']}


2020-05-09 19:09:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/77b65bcd1c82bf793e2de583c861a077dcec2246/ggvis-data-visualization-r-tutorial> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:09:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/ffbc79c0169a150a45a0b503bd19662cb4d44790/free-introduction-to-r> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:09:49 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://assets.datacamp.com/courses/predicting-customer-churn-in-python>: HTTP status code is not handled or not allowed
2020-05-09 19:09:49 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://assets.datacamp.com/production/repositorie

{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tidying data', 'Putting it all together', 'Introduction and exploring raw data', 'Tidying data', 'Preparing data for analysis', 'Putting it all together'], 'Data Manipulation in R with dplyr': ['Introduction to dplyr and tbls', 'Filter and arrange', 'Group_by and working with databases', 'Select and mutate', 'Summarize and the pipe operator', 'Introduction to dplyr and tbls', 'Select and mutate', 'Filter and arrange', 'Summarize and the pipe operator', 'Group_by and working with databases'], 'Data Visualization in R with ggvis': ['The Grammar of Graphics', 'Transformations', 'Customizing Axes, Legends, and Scales', 'Lines and Syntax', 'Interactivity and Layers', 'The Grammar of Graphics', 'Lines and Syntax', 'Transformations', 'Interactivity and Layers', 'Customizing Axes, Legends, and Scales']}
{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tid

2020-05-09 19:09:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/6d40286d4abe480763ff8e8ac2246c01861f8c27/intermediate-r-practice> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)
2020-05-09 19:09:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://assets.datacamp.com/production/repositories/2560/datasets/5938f64803ca4a1275f63bf68ac3d300cd9f1c4a/intro-to-python-for-data-science> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tidying data', 'Putting it all together', 'Introduction and exploring raw data', 'Tidying data', 'Preparing data for analysis', 'Putting it all together'], 'Data Manipulation in R with dplyr': ['Introduction to dplyr and tbls', 'Filter and arrange', 'Group_by and working with databases', 'Select and mutate', 'Summarize and the pipe operator', 'Introduction to dplyr and tbls', 'Select and mutate', 'Filter and arrange', 'Summarize and the pipe operator', 'Group_by and working with databases'], 'Data Visualization in R with ggvis': ['The Grammar of Graphics', 'Transformations', 'Customizing Axes, Legends, and Scales', 'Lines and Syntax', 'Interactivity and Layers', 'The Grammar of Graphics', 'Lines and Syntax', 'Transformations', 'Interactivity and Layers', 'Customizing Axes, Legends, and Scales'], 'Introduction to R': ['Intro to basics', 'Vectors', 'Matrices', 'Factors', 'Data frames', 'Lists']

2020-05-09 19:09:50 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET https://assets.datacamp.com/production/repositories/2560/datasets/844f6aa70cafd81cbc92f344baff641173604229/reporting-with-r-markdown> (failed 3 times): 503 Service Unavailable
2020-05-09 19:09:50 [scrapy.core.engine] DEBUG: Crawled (503) <GET https://assets.datacamp.com/production/repositories/2560/datasets/844f6aa70cafd81cbc92f344baff641173604229/reporting-with-r-markdown> (referer: https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short)


{'Cleaning Data in R': ['Introduction and exploring raw data', 'Preparing data for analysis', 'Tidying data', 'Putting it all together', 'Introduction and exploring raw data', 'Tidying data', 'Preparing data for analysis', 'Putting it all together'], 'Data Manipulation in R with dplyr': ['Introduction to dplyr and tbls', 'Filter and arrange', 'Group_by and working with databases', 'Select and mutate', 'Summarize and the pipe operator', 'Introduction to dplyr and tbls', 'Select and mutate', 'Filter and arrange', 'Summarize and the pipe operator', 'Group_by and working with databases'], 'Data Visualization in R with ggvis': ['The Grammar of Graphics', 'Transformations', 'Customizing Axes, Legends, and Scales', 'Lines and Syntax', 'Interactivity and Layers', 'The Grammar of Graphics', 'Lines and Syntax', 'Transformations', 'Interactivity and Layers', 'Customizing Axes, Legends, and Scales'], 'Introduction to R': ['Intro to basics', 'Vectors', 'Matrices', 'Factors', 'Data frames', 'Lists']

2020-05-09 19:09:50 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <503 https://assets.datacamp.com/production/repositories/2560/datasets/844f6aa70cafd81cbc92f344baff641173604229/reporting-with-r-markdown>: HTTP status code is not handled or not allowed
2020-05-09 19:09:50 [scrapy.core.engine] INFO: Closing spider (finished)
2020-05-09 19:09:50 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 6113,
 'downloader/request_count': 14,
 'downloader/request_method_count/GET': 14,
 'downloader/response_bytes': 2293404,
 'downloader/response_count': 14,
 'downloader/response_status_count/200': 10,
 'downloader/response_status_count/403': 1,
 'downloader/response_status_count/503': 3,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2020, 5, 9, 17, 9, 50, 906832),
 'httperror/response_ignored_count': 2,
 'httperror/response_ignored_status_count/403': 1,
 'httperror/response_ignored_status_count/503': 1,
 'log_count/DEBUG': 15,
 'log_