In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
class ESSpider(scrapy.Spider):
    """Spider that saves the raw HTML of the start page to disk."""

    # Name is important, need different ones for each spider
    # of the same class
    name = 'ESS'

    start_urls = ['http://www.everydaysexism.com']

    # Defining the scraping process
    def parse(self, response):
        """Write the response body to ./scraper_results/mainpage.html.

        Args:
            response: The downloaded page; only its ``body`` attribute is
                read, and it is written out in binary mode.
        """
        import os

        # open() does not create missing parent directories, so make sure
        # the output folder exists before writing into it.
        os.makedirs('./scraper_results', exist_ok=True)
        with open('./scraper_results/mainpage.html', 'wb') as f:
            f.write(response.body)

# Build a crawler process using the default settings
process = CrawlerProcess()

# Register the spider with the process, then run the crawl
process.crawl(ESSpider)
process.start()

That spider downloaded the entire HTML source of the www.everydaysexism.com front page to the specified directory. However, to get more useful, parsed data, we must give the spider more specific instructions.

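**Remember that we need to restart the notebook kernel before the next spider can run: `process.start()` starts Scrapy's Twisted reactor, which cannot be restarted within the same process.**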

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class ESSpider(scrapy.Spider):
    """Spider that extracts one record per <article> on the start page."""

    name = 'ESS'
    start_urls = ['http://www.everydaysexism.com']

    def parse(self, response):
        """Yield a dict with name, date, text and tags for each article."""
        # Walk through all <article> nodes found in the response
        for post in response.xpath('//article'):
            title = post.xpath('header/h2/a/@title').extract_first()
            posted = post.xpath('header/section/span[@class="entry-date"]/text()').extract_first()
            paragraphs = post.xpath('section[@class="entry-content"]/p/text()').extract()
            tag_names = post.xpath('*/span[@class="tag-links"]/a/text()').extract()

            yield {
                'name': title,
                'date': posted,
                'text': paragraphs,
                'tags': tag_names,
            }
            
            
# Crawler settings: export scraped items to a JSON file and silence logging
feed_settings = {
    'FEED_FORMAT': 'json',
    'FEED_URI': './scraper_results/firstpage.json',
    'LOG_ENABLED': False,
}
process = CrawlerProcess(feed_settings)

# Register the spider and run the crawl
process.crawl(ESSpider)
process.start()
print('Success')

In [None]:
import pandas as pd 

# Load the exported feed into a DataFrame
firstpage = pd.read_json(
    './scraper_results/firstpage.json',
    orient='records',
)

# Report (rows, columns), then preview the first rows
print(firstpage.shape)
firstpage.head()

So we successfully scraped the first page of the site.  In order to scrape more pages, we need to recursively call the scraper as it's running.  We'll need to find the link to the next page and also institute some scraping etiquette rules to conform to internet norms.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re

class ESSpider(scrapy.Spider):
    """Spider that scrapes articles and follows the pagination link."""

    name = 'ESS'
    start_urls = ['http://www.everydaysexism.com']

    def parse(self, response):
        """Yield one dict per <article>, then a request for the next page.

        Args:
            response: The downloaded page to extract articles and the
                next-page link from.

        Yields:
            A dict with the keys 'name', 'date', 'text' and 'tags' for every
            <article> element, followed by at most one ``scrapy.Request``
            for the next page while its page number is below 10.
        """
        # Iterate over every <article> element on the page
        for article in response.xpath('//article'):
            yield {
                'name': article.xpath('header/h2/a/@title').extract_first(),
                'date': article.xpath('header/section/span[@class="entry-date"]/text()').extract_first(),
                'text': article.xpath('section[@class="entry-content"]/p/text()').extract(),
                'tags': article.xpath('*/span[@class="tag-links"]/a/text()').extract()
            }

        # Pagination is handled once per page, after all articles, rather
        # than once per article: the link is the same for the whole page.
        next_page = response.xpath('//div[@class="nav-previous"]/a/@href').extract_first()
        if next_page is None:
            # No next-page link was found, so there is nothing to follow.
            return

        # Grabbing the next page number (first run of digits in the URL)
        page_digits = re.findall(r'\d+', next_page)
        if not page_digits:
            # The link carries no page number to check against the limit.
            return

        # Recursively call the spider until page 9
        if int(page_digits[0]) < 10:
            # Request next page with same parsing as above
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

            
# Crawler settings: JSON export plus scraping-etiquette options
# (obey robots.txt, identify ourselves, throttle and cache requests)
polite_settings = {
    'FEED_FORMAT': 'json',
    'FEED_URI': './scraper_results/data.json',
    'LOG_ENABLED': False,
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
}
process = CrawlerProcess(polite_settings)

# Register the spider and run the crawl
process.crawl(ESSpider)
process.start()
print('Success')



In [None]:
import pandas as pd

# Load the exported feed into a DataFrame
df = pd.read_json(
    './scraper_results/data.json',
    orient='records',
)

# Report (rows, columns), then preview the first rows
print(df.shape)
df.head()

Cool. We successfully used Scrapy to scrape specific info from multiple pages of the site and store it.

Let's try a less canned version.  We'll try to pull Nate Silver's articles from [FiveThirtyEight](https://fivethirtyeight.com/).

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re 

class FiveThirtyEight(scrapy.Spider):
    """Spider that collects article metadata from a contributor page."""

    name = "NS"

    start_urls = ['https://fivethirtyeight.com/contributors/nate-silver/']

    def parse(self, response):
        """Yield one dict per content-area item, then follow the next page.

        Args:
            response: The downloaded page to extract items and the
                next-page link from.

        Yields:
            A dict with the keys 'date', 'title', 'article_link' and
            'author' for every matched item (values are None when the
            XPath finds nothing), followed by at most one
            ``scrapy.Request`` for the next page while its page number
            is below 4.
        """
        for item in response.xpath("//div[@class='content-area']/div"):
            yield {
                'date': item.xpath(".//div[@class='post-info']/p/time/text()").extract_first(),
                'title': item.xpath(".//div[@class='post-info']/div/div/h2/a/text()").extract_first(),
                'article_link': item.xpath(".//div[@class='post-info']/div/div/h2/a/@href").extract_first(),
                'author': item.xpath(".//div[@class='post-info']/div/div/p[@class='single-metadata card vcard']/a/text()").extract_first()
            }

        nextpage = response.xpath("//div[@class='links']/a/@href").extract_first()
        if nextpage is None:
            # No pagination link was found, so there is nothing to follow.
            return

        # The page number is taken as the first run of digits in the link
        page_digits = re.findall(r'\d+', nextpage)
        if not page_digits:
            # The link carries no page number to check against the limit.
            return

        # Recursively call next page
        if int(page_digits[0]) < 4:
            yield scrapy.Request(response.urljoin(nextpage), callback=self.parse)
            
            
# Crawler settings: JSON export plus scraping-etiquette options
# (obey robots.txt, identify ourselves, throttle and cache requests)
polite_settings = {
    'FEED_FORMAT': 'json',
    'FEED_URI': './scraper_results/NS538.json',
    'LOG_ENABLED': False,
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'Matt Francsis (mkfrancsis@gmail.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
}
process = CrawlerProcess(polite_settings)

# Register the spider and run the crawl
process.crawl(FiveThirtyEight)
process.start()
print('Finished')

Finished


In [2]:
import pandas as pd

# Load the exported feed into a DataFrame
df = pd.read_json(
    './scraper_results/NS538.json',
    orient='records',
)

# Report (rows, columns), then preview the first rows
print(df.shape)
df.head()

(26, 4)


Unnamed: 0,article_link,author,date,title
0,,,NaT,
1,,,NaT,
2,https://fivethirtyeight.com/features/silver-bu...,Nate Silver,2019-05-30,\n\t\t\t\tSilver Bulletpoints: Who’s In Danger...
3,https://fivethirtyeight.com/features/are-the-r...,Nate Silver,2019-05-29,\n\t\t\t\tAre The Raptors Really Favorites Aga...
4,https://fivethirtyeight.com/features/is-there-...,,2019-05-29,\n\t\t\t\tIs There An Anti-Biden Lane In The D...


Great, so the scraper works. The `None` rows are *article-like* areas of the web page that are not actually articles, while the articles with `None` as the author are podcasts or group chats.