# A simple web crawler

### Loading dependency packages

In [1]:
## scraper packages
!pip install scrapy


Collecting scrapy
  Downloading Scrapy-2.5.1-py2.py3-none-any.whl (254 kB)
Collecting h2<4.0,>=3.0
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
Collecting service-identity>=16.0.0
  Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)
Collecting parsel>=1.5.0
  Downloading parsel-1.6.0-py2.py3-none-any.whl (13 kB)
Collecting queuelib>=1.4.2
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecting itemloaders>=1.0.1
  Downloading itemloaders-1.0.4-py3-none-any.whl (11 kB)
Collecting Twisted[http2]>=17.9.0
  Downloading Twisted-21.2.0-py3-none-any.whl (3.1 MB)
Collecting w3lib>=1.17.0
  Downloading w3lib-1.22.0-py2.py3-none-any.whl (20 kB)
Collecting cssselect>=0.9.1
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting zope.interface>=4.1.3
  Downloading zope.interface-5.4.0-cp36-cp36m-win_amd64.whl (210 kB)
Collecting PyDispatcher>=2.0.5
  Downloading PyDispatcher-2.0.5.zip (47 kB)
  Preparing metadata (setup.py): started
  Preparing metad

### Scraping

We use the `scrapy` package to scrape data from the web.

In [2]:
import scrapy


class BrickSetSpider(scrapy.Spider):
    """Spider that scrapes the Brickset listing of sets for the year 2016.

    Crawling starts at the URL in ``start_urls``; every response is handled
    by :meth:`parse`, which also schedules the next results page (if any).
    """

    name = 'brick_spider'
    start_urls = ['http://brickset.com/sets/year-2016']

    def parse(self, response):
        """Yield one item per ``.set`` element, then follow the next page.

        Each item is a dict with the keys ``name``, ``pieces``, ``minifigs``
        and ``image``. A value is ``None`` when its selector matches nothing.
        If the page has a ``.next a`` link, a request for that page is
        yielded with this same method as its callback.
        """
        # All four selectors are evaluated relative to a single ``.set`` node.
        name_css = 'h1 ::text'
        pieces_xpath = './/dl[dt/text() = "Pieces"]/dd/a/text()'
        minifigs_xpath = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()'
        image_css = 'img ::attr(src)'

        for set_node in response.css('.set'):
            yield {
                'name': set_node.css(name_css).get(),
                'pieces': set_node.xpath(pieces_xpath).get(),
                'minifigs': set_node.xpath(minifigs_xpath).get(),
                'image': set_node.css(image_css).get(),
            }

        # Pagination: stop when there is no "next" link on the page.
        next_href = response.css('.next a ::attr(href)').get()
        if not next_href:
            return

        # The href may be relative, so resolve it against the response URL.
        next_url = response.urljoin(next_href)
        yield scrapy.Request(next_url, callback=self.parse)

In [8]:
from scrapy.crawler import CrawlerProcess

# Export every scraped item to items.json. "overwrite" keeps the export
# idempotent: without it Scrapy appends to an existing local file, so a second
# run leaves two concatenated JSON arrays, which is not a valid JSON document.
process = CrawlerProcess(settings={
    "FEEDS": {
        "items.json": {"format": "json", "overwrite": True},
    },
})

process.crawl(BrickSetSpider)
# NOTE: start() runs the Twisted reactor, which cannot be restarted within the
# same process -- restart the kernel before re-running this cell.
process.start()  # the script will block here until the crawling is finished

2022-02-06 23:02:58 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2022-02-06 23:02:58 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.19041-SP0
2022-02-06 23:02:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-02-06 23:02:58 [scrapy.crawler] INFO: Overridden settings:
{}
2022-02-06 23:02:59 [scrapy.extensions.telnet] INFO: Telnet Password: 5e18c325b43c54a3
2022-02-06 23:02:59 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-02-06 23:02:59 [scrapy.middleware] INFO: Enabled downloader middlewares:
['