In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess


class WikiSpider(scrapy.Spider):
    name = "WS"
    
    # Here is where we insert our API call.
    start_urls = [
        'https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect'
        ]

    # Identifying the information we want from the query response and extracting it using xpath.
    def parse(self, response):
        for item in response.xpath('//lh'):
            # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
            # Other codes indicate links from 'Talk' pages, etc.  Since we are only interested in entries, we filter:
            if item.xpath('@ns').extract_first() == '0':
                yield {
                    'title': item.xpath('@title').extract_first() 
                    }
        # Getting the information needed to continue to the next ten entries.
        next_page = response.xpath('continue/@lhcontinue').extract_first()
        
        # Recursively calling the spider to process the next ten entries, if they exist.
        if next_page is not None:
            next_page = '{}&lhcontinue={}'.format(self.start_urls[0],next_page)
            yield scrapy.Request(next_page, callback=self.parse)
            
    
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'PythonLinks.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
    # We use CLOSESPIDER_PAGECOUNT to limit our scraper to the first 100 links.    
    'CLOSESPIDER_PAGECOUNT' : 10
})
                                         

# Starting the crawler with our spider.
process.crawl(WikiSpider)
process.start()
print('First 100 links extracted!')

First 100 links extracted!


In [3]:
import pandas as pd

# Checking whether we got data 

Monty=pd.read_json('PythonLinks.json', orient='records')
print(Monty.shape)
print(Monty.tail())

(94, 1)
                    title
89  Surrealist automatism
90        Raymond Queneau
91           Andr√© Breton
92      Tim Brooke-Taylor
93           Fifth Beatle


## Challenge
Do a little scraping or API-calling of your own. Pick a new website and see what you can get out of it. Expect that you'll run into bugs and blind alleys, and rely on your mentor to help you get through.

Formally, your goal is to write a scraper that will:

1) Return specific pieces of information (rather than just downloading a whole page)<br>
2) Iterate over multiple pages/queries<br>
3) Save the data to your computer<br>

Once you have your data, compute some statistical summaries and/or visualizations that give you some new insights into your scraping topic of interest. Write up a report from scraping code to summary and share it with your mentor.

In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    name = "TS"
    
    # URL(s) to start with.
    start_urls = [
        'https://data.inewsource.org/interactives/california-sat-scores-2016-17',
        ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        
        # Iterate over every <article> element on the page.
        for row in response.xpath('//*[@id="mySelection"]/tbody/tr[1]'):
        
            # Yield a dictionary with the values we want.
            yield {
                'school': article.xpath('/td[1]').extract_first(),
                'district': article.xpath('/td[2]').extract_first(),
                'test_takers': article.xpath('/td[3]').extract_first(),
                'met_benchmark': article.xpath('/td[4]').extract_first(),
                'benchmark_percent': article.xpath('/td[5]').extract_first(),
                'school_year': article.xpath('/td[6]').extract()
            }
        # Get the URL of the previous page.
        next_page = response.xpath('//*[@id="mySelection_next"]/a/@href').extract_first()  
    
        # Identify page number to limit data collected
        pagenum = int(re.findall(r'\d+',next_page)[0])
        
        # Recursively call the spider to run on the next page, if it exists, stop at page 9
        if next_page is not None and pagenum < 10:
            next_page = response.urljoin(next_page)
            # Request the next page and recursively parse it the same way we did above
            yield scrapy.Request(next_page, callback=self.parse)

# Tell the script how to run the crawler by passing in settings.
# The new settings have to do with scraping etiquette.          
process = CrawlerProcess({
    'FEED_FORMAT': 'json',              # Store data in JSON format.
    'FEED_URI': 'test_data.json',       # Name our storage file.
    'LOG_ENABLED': False,               # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True
})

# Start the crawler with our spider.
process.crawl(TestSpider)
process.start()
print('Success!')

Success!


In [3]:
import pandas as pd

# Checking whether we got data from all 9 pages
#TSdf=pd.read_json('test_data.json', orient='records')
TSdf=pd.read_json('test_data.json')
print(TSdf.shape)
print(TSdf.head())

ValueError: Expected object or value