In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess


class WikiSpider(scrapy.Spider):
    """Collect the titles of Wikipedia entries that link to the 'Monty_Python' page.

    The spider queries the MediaWiki API (``prop=linkshere``) with an XML
    response format and follows the API's continuation token from one batch
    of results to the next.
    """

    name = "WS"

    # The API query that seeds the crawl; every follow-up request is built from it.
    start_urls = [
        'https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect'
        ]

    def parse(self, response):
        """Yield one ``{'title': ...}`` item per linking entry, then request the next batch.

        Each ``<lh>`` element in the response describes one linking page. Its
        ``ns`` attribute identifies the kind of page: '0' is a Wikipedia
        entry, other codes are 'Talk' pages and the like, which are skipped.
        """
        for link in response.xpath('//lh'):
            namespace = link.xpath('@ns').extract_first()
            if namespace != '0':
                # Not an entry page, so it is of no interest here.
                continue
            yield {'title': link.xpath('@title').extract_first()}

        # The <continue> element carries the token needed to fetch the next batch.
        continue_token = response.xpath('continue/@lhcontinue').extract_first()
        if continue_token is None:
            # No token means the API has no further results for this query.
            return

        # Re-issue the original query with the token appended and parse the
        # response with this same callback.
        follow_up_url = f'{self.start_urls[0]}&lhcontinue={continue_token}'
        yield scrapy.Request(follow_up_url, callback=self.parse)
            
    
# Crawl configuration, named so the values are easy to find and tune.
FEED_PATH = 'PythonLinks.json'
PAGE_LIMIT = 10

# Scrapy's local-file feed storage appends to an existing file. Without this,
# re-running the cell would leave several JSON arrays back to back in the feed
# and the file could no longer be parsed as JSON. Truncating it first makes
# every run start from an empty feed.
open(FEED_PATH, 'w').close()

process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': FEED_PATH,
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': True,
    # CLOSESPIDER_PAGECOUNT counts crawled responses, not items: the spider is
    # closed once PAGE_LIMIT API pages have been fetched. The spider also drops
    # links whose namespace is not '0', so the number of titles written to the
    # feed can be lower than the number of links examined.
    'CLOSESPIDER_PAGECOUNT': PAGE_LIMIT
})


# Starting the crawler with our spider.
# NOTE: process.start() runs the Twisted reactor and blocks until the crawl is
# done; the reactor cannot be started twice in one Python process, so restart
# the kernel before running this cell again.
process.crawl(WikiSpider)
process.start()
# The "100" below is nominal (page budget times the API batch size); see the
# note on CLOSESPIDER_PAGECOUNT above for why the feed may hold fewer titles.
print('First 100 links extracted!')

2018-11-26 16:49:13 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-11-26 16:49:13 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 3.7.0 (default, Jun 29 2018, 20:13:13) - [Clang 9.1.0 (clang-902.0.39.2)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0j  20 Nov 2018), cryptography 2.4.2, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-11-26 16:49:13 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'CLOSESPIDER_PAGECOUNT': 10, 'FEED_FORMAT': 'json', 'FEED_URI': 'PythonLinks.json', 'HTTPCACHE_ENABLED': True, 'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)'}
2018-11-26 16:49:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.closespider.CloseSpider',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.lo

2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=14626>
{'title': 'J. K. Rowling'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=14626>
{'title': 'John Peel'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=14626>
{'title': 'Joke'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=14626>
{'title': 'Kate Bush'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api

2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=28341>
{'title': 'Surrealism'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=28341>
{'title': 'The Goon Show'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=28341>
{'title': 'Terry Gilliam'}
2018-11-26 16:49:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=28341>
{'title': 'The Stranglers'}
2018-11-26 16:49:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wiki

2018-11-26 16:49:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=51610> (referer: https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=45185) ['cached']
2018-11-26 16:49:15 [scrapy.core.engine] INFO: Closing spider (closespider_pagecount)
2018-11-26 16:49:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=51610>
{'title': 'Sketch comedy'}
2018-11-26 16:49:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect&lhcontinue=51610>
{'title': 'Terry Jones'}
2018-11-26 16:49:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/w/ap

First 100 links extracted!


In [None]:
import pandas as pd
# Sanity check: load the JSON feed written by the crawler and show its
# dimensions followed by its last few rows.
Monty = pd.read_json('PythonLinks.json', orient='records')
for summary in (Monty.shape, Monty.tail()):
    print(summary)