# Scrapy

https://github.com/rennerocha/scrapy-tutorial.git

* Example of all HackerSpaces throughout the World

URL: https://wiki.hackerspaces.org/List_of_Hacker_Spaces

Quotes to scrape

URL: http://quotes.toscrape.com/

URL: toscrape.com

##### SPIDER

In [2]:
import scrapy

In [None]:
class HackerspaceListSpider(scrapy.Spider):
    """Scrape the list of hackerspaces from the hackerspaces.org wiki.

    Yields one dict per table row with the keys ``hackerspace``,
    ``country``, ``status`` and ``url``, then follows the "further"/"next"
    link when the page has one.
    """

    name = 'hackerspace-list'
    # Scrapy builds the initial requests from ``start_urls``; a differently
    # named attribute is ignored and the spider ends without crawling.
    start_urls = [
        'https://wiki.hackerspaces.org/List_of_Hacker_Spaces',
    ]

    def parse(self, response):
        """Extract every table row and queue the next results page.

        Fields a row does not contain come back as ``None`` because
        ``.get()`` returns ``None`` when nothing matches.
        """
        for row in response.css('table tr'):
            yield {
                'hackerspace': row.css('.Hackerspace *::text').get(),
                'country': row.css('.Country *::text').get(),
                'status': row.css('.Hackerspace-status *::text').get(),
                'url': row.css('.Website a::attr(href)').get()
            }

        # Pagination: the link text contains either "further" or "next".
        further_results = response.xpath(
            '//a[contains(text(), "further")]//@href|'
            '//a[contains(text(), "next")]//@href')
        if further_results:
            # The href may be relative, so resolve it against the page URL.
            yield scrapy.Request(response.urljoin(further_results.get()))
            

In [None]:
class MySpider(scrapy.Spider):
    """Minimal spider showing how to issue the first request by hand."""

    name = 'spider_name'

    def start_requests(self):
        """Yield the initial request(s) for the crawl.

        Each yielded value must be a ``scrapy.Request``; yielding a list of
        requests would hand Scrapy a ``list`` where it expects a request.
        """
        yield scrapy.Request(
            'http://example.com',
            callback=self.parse)

    def parse(self, response):
        """Log a message each time a response reaches this callback."""
        self.logger.info('Passei por aqui!')

Extraindo dados a partir de:

Seletores CSS: https://www.w3.org/TR/selectors/

Seletores XPath: https://www.w3.org/TR/xpath/all/


## HANDSON LOCAL

* With Parsel, build a `Selector` from the HTML text like this:

In [8]:
from parsel import Selector

# Sample HTML page queried by the cells that follow.
product_list = 'scrapy-tutorial/code/product_list.html'
# Explicit encoding keeps the result independent of the platform default.
with open(product_list, encoding='utf-8') as file:
    response = Selector(text=file.read())

In [9]:
response

<Selector xpath=None data='<html>\n  <body>\n    <h1>Last Offers</h1>'>

In [10]:
response.get()  # serialize the selector's content to a string

'<html>\n  <body>\n    <h1>Last Offers</h1>\n    <ul id="offers">\n      <li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>\n      <li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>\n      <li class="ad">\n        <a href="http://otherstore.com/product-2">Ad Product 2</a>\n        <p>I am an ad product! I paid to be here!</p>\n      </li>\n      <li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>\n    </ul>\n\n    <h1>You may like</h1>\n    <ul id="recommendations">\n      <li class="product">\n        <a href="http://recommentation.com/recommendations-product-1">Recommended Product 1</a>\n        <p>Probably you will like me too.</p>\n      </li>\n      <li class="product">\n        <a href="http://recommentati

* Using CSS

In [12]:
response.css("h1").getall()

['<h1>Last Offers</h1>', '<h1>You may like</h1>']

In [13]:
response.css("ul#offers").getall() # '#' means ID

['<ul id="offers">\n      <li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>\n      <li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>\n      <li class="ad">\n        <a href="http://otherstore.com/product-2">Ad Product 2</a>\n        <p>I am an ad product! I paid to be here!</p>\n      </li>\n      <li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>\n    </ul>']

In [15]:
response.css(".product").getall() # '.' means Class

['<li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>',
 '<li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>',
 '<li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>',
 '<li class="product">\n        <a href="http://recommentation.com/recommendations-product-1">Recommended Product 1</a>\n        <p>Probably you will like me too.</p>\n      </li>',
 '<li class="product">\n        <a href="http://recommentation.com/recommendations-product-2">Recommended Product 2</a>\n        <p>Probably you will like me too (2).</p>\n      </li>']

In [17]:
response.css("ul#offers .product").getall() # elements with class "product" inside the <ul> whose id is "offers"

['<li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>',
 '<li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>',
 '<li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>']

In [18]:
response.css("ul#offers .product a::attr(href)").getall() # href attribute of every <a> inside .product elements of ul#offers

['http://mystore.com/product-1',
 'http://mystore.com/product-2',
 'http://mystore.com/product-2']

In [19]:
response.css('ul#offers .product *::text').getall() # every text node of any element inside .product elements of ul#offers

['\n        ',
 'Product 1',
 '\n        ',
 'I am a great product! Buy me!',
 '\n      ',
 '\n        ',
 'Product 2',
 '\n        ',
 'I am a better! Buy me!',
 '\n      ',
 '\n        ',
 'Product 3',
 '\n        ',
 "Ok, you won't buy me anyway :-(",
 '\n      ']

In [21]:
response.css('ul#offers .product p::text').getall() # text of the <p> tags inside .product elements of ul#offers

['I am a great product! Buy me!',
 'I am a better! Buy me!',
 "Ok, you won't buy me anyway :-("]

In [20]:
response.css('.bestseller').getall()

['<li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>']

* Using Xpath

In [22]:
response.xpath('//h1').getall()

['<h1>Last Offers</h1>', '<h1>You may like</h1>']

In [23]:
response.xpath('//h1[2]').getall() # gets each <h1> that is the second <h1> child of its parent

['<h1>You may like</h1>']

In [25]:
response.xpath('//ul[@id="offers"]').getall()

['<ul id="offers">\n      <li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>\n      <li class="product bestseller">\n        <a href="http://mystore.com/product-2">Product 2</a>\n        <p>I am a better! Buy me!</p>\n      </li>\n      <li class="ad">\n        <a href="http://otherstore.com/product-2">Ad Product 2</a>\n        <p>I am an ad product! I paid to be here!</p>\n      </li>\n      <li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>\n    </ul>']

In [27]:
response.xpath('//ul[@id="offers"]//li[@class="product"]') # matches only a class attribute exactly equal to "product", so "product bestseller" is not returned

[<Selector xpath='//ul[@id="offers"]//li[@class="product"]' data='<li class="product">\n        <a href="ht'>,
 <Selector xpath='//ul[@id="offers"]//li[@class="product"]' data='<li class="product">\n        <a href="ht'>]

In [28]:
response.xpath('//li/a/@href')

[<Selector xpath='//li/a/@href' data='http://mystore.com/product-1'>,
 <Selector xpath='//li/a/@href' data='http://mystore.com/product-2'>,
 <Selector xpath='//li/a/@href' data='http://otherstore.com/product-2'>,
 <Selector xpath='//li/a/@href' data='http://mystore.com/product-2'>,
 <Selector xpath='//li/a/@href' data='http://recommentation.com/recommendation'>,
 <Selector xpath='//li/a/@href' data='http://recommentation.com/recommendation'>]

In [30]:
response.xpath('//li/text()').getall() # text nodes that are direct children of <li> (only whitespace here)

['\n        ',
 '\n        ',
 '\n      ',
 '\n        ',
 '\n        ',
 '\n      ',
 '\n        ',
 '\n        ',
 '\n      ',
 '\n        ',
 '\n        ',
 '\n      ',
 '\n        ',
 '\n        ',
 '\n      ',
 '\n        ',
 '\n        ',
 '\n      ']

In [32]:
response.xpath('//p/text()').getall() # all text inside tag p

['I am a great product! Buy me!',
 'I am a better! Buy me!',
 'I am an ad product! I paid to be here!',
 "Ok, you won't buy me anyway :-(",
 'Probably you will like me too.',
 'Probably you will like me too (2).']

In [34]:
response.xpath('//ul[@id="offers"]//li[@class="product"]').getall() # gets the matching <li> elements as HTML strings

['<li class="product">\n        <a href="http://mystore.com/product-1">Product 1</a>\n        <p>I am a great product! Buy me!</p>\n      </li>',
 '<li class="product">\n        <a href="http://mystore.com/product-2">Product 3</a>\n        <p>Ok, you won\'t buy me anyway :-(</p>\n      </li>']

## Another Example with 
* http://quotes.toscrape.com

In [38]:
cd scrapy-tutorial/code/quotes

/home/phillipe/Working/Web Scraping/scrapy-tutorial


#### Using Scrapy Shell

response - information about the fetched page
response.headers - the headers of the response
response.body - the whole HTML body


**If you run into problems with auto-completion in the Scrapy shell:**

```python

import logging

# Silence the loggers that get in the way of tab completion in the shell.
# The attribute is ``disabled``; assigning ``disable`` has no effect.
logging.getLogger('parso.python.diff').disabled = True
logging.getLogger('parso.cache').disabled = True
logging.getLogger('parso.cache.pickle').disabled = True
logging.getLogger('py.warnings').disabled = True
# Drop every record at DEBUG level or below, for all loggers.
logging.disable(logging.DEBUG)
# Finally, switch the root logger off as well.
logger = logging.getLogger()
logger.disabled = True

  ```

to run a spider
scrapy crawl ...

In [46]:
# -*- coding: utf-8 -*-
import scrapy
class DefaultQuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com, emitting each quote and walking the pager."""

    name = 'default_quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one item per quote, then a request for the "Next" page."""
        for quote_box in response.css('.quote'):
            citation = quote_box.css('.text::text').get()
            author = quote_box.css(".author::text").get()
            tags = quote_box.css(".tag").css("a::text").getall()
            yield {
                'citation': citation,
                'author': author,
                'tags': tags,
            }

        # ``.get()`` gives None when the pager has no "Next" link.
        next_href = response.xpath(
            '//ul[@class="pager"]//a[contains(text(),"Next")]//@href').get()
        if next_href is None:
            return
        yield scrapy.Request(response.urljoin(next_href))

            
        


In [None]:
import json
class ScrollQuotesSpider(scrapy.Spider):
(...)
def parse(self, response):
data = json.loads(response.body)