In [1]:
import os 
import logging
# Import scrapy and scrapy.crawler 
import scrapy
from scrapy.crawler import CrawlerProcess


In [2]:
class YelpCrawler(scrapy.Spider):
    """Spider that submits a search on yelp.fr and scrapes the result listing.

    The spider loads the home page, fills the search form with a keyword and a
    location, then walks the result pages by following the "next" link.

    Args:
        search_keyword: Text submitted in the ``find_desc`` form field.
        location_keyword: Text submitted in the ``find_loc`` form field.
        *args, **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
    """
    name = "yelp-spy"
    start_urls = ['https://www.yelp.fr/']

    def __init__(self, search_keyword="restaurant japonais", location_keyword="Toulouse", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.search_keyword = search_keyword
        self.location_keyword = location_keyword

    def parse(self, response):
        """Fill and submit the search form found on the start page.

        Returns:
            A ``scrapy.FormRequest`` whose response is handled by ``after_search``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'find_desc': self.search_keyword,
                'find_loc': self.location_keyword,
            },
            callback=self.after_search,
        )

    def after_search(self, response):
        """Extract one item per result heading, then follow the next page.

        Yields:
            dict: ``name`` and ``url`` of each result plus the ``next-link`` of
            the current page (``None`` on the last page), followed by at most
            one request for the next result page.
        """
        # Resolve the pagination link once per page. `.attrib` of an empty
        # selection is an empty dict, so `.get` returns None instead of raising
        # KeyError when the page has no next link.
        next_page = response.css('a.next-link').attrib.get('href')

        # NOTE(review): 'css-1l5lt1i' looks like a generated class name and is
        # presumably fragile across site redesigns -- confirm it still matches.
        for tag in response.css('h4.css-1l5lt1i'):
            yield {
                'name': tag.css('span a::text').get(),
                # None (rather than KeyError) when the heading has no anchor.
                'url': tag.css('span a').attrib.get('href'),
                'next-link': next_page,
            }

        # Pagination is handled after the loop so that exactly one follow-up
        # request is issued per page, even when the page lists no results.
        if next_page is None:
            logging.info('There is no next page. The crawling process will terminate')
        else:
            yield response.follow(next_page, callback=self.after_search)
                

In [3]:
def remove_file(filename):
    """Delete ``data/<filename>`` if it exists; do nothing otherwise.

    Args:
        filename: Name of the file, relative to the ``data`` directory.

    A missing ``data`` directory or a missing file is not an error, so the
    function can be called unconditionally before writing a fresh export.
    """
    path = os.path.join("data", filename)
    # isfile() is False for a missing directory, a missing file, and for a
    # directory of that name, so os.remove() is only called when it can succeed.
    if os.path.isfile(path):
        os.remove(path)

In [4]:
def get_filename(search_keyword, search_location):
    """Build the name of the JSON export for a keyword/location pair.

    Returns:
        str: ``"<search_keyword>-<search_location>.json"``.
    """
    return "{}-{}.json".format(search_keyword, search_location)

In [5]:
# Sanity check: the export filename has the form "<keyword>-<location>.json".
get_filename("spa", "Toulouse")

'spa-Toulouse.json'

In [6]:
def launch_crawler(search_input, location_input):
    """Run ``YelpCrawler`` for one search and export the items as JSON.

    Args:
        search_input: Search keywords forwarded to the spider.
        location_input: City name forwarded to the spider.

    The items are written to ``data/<search_input>-<location_input>.json``;
    any previous export with the same name is removed first. The call blocks
    until the crawl is over and then prints ``FINISHED``.

    NOTE(review): ``process.start()`` runs the Twisted reactor, which
    presumably cannot be started a second time in the same kernel -- confirm
    before calling this function twice without restarting.
    """
    filename = get_filename(search_input, location_input)
    # Start from a clean file so results of an earlier run are not mixed in.
    remove_file(filename)
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_LEVEL': logging.INFO,
        "FEEDS": {
            'data/' + filename: {"format": "json"},
        },
        "AUTOTHROTTLE_ENABLED": True,  # AutoThrottle Here!
    })
    # The keyword names must match YelpCrawler.__init__; passing the location
    # under any other name leaves the spider on its default location.
    process.crawl(YelpCrawler, search_keyword=search_input, location_keyword=location_input)
    process.start()
    print("FINISHED")


In [7]:
# Strip surrounding whitespace so stray spaces typed at the prompt do not end
# up in the submitted search form or in the export filename.
search_keyword = input("Please enter your search keywords:").strip()
location_keyword = input("Please enter the name of the city").strip()

Please enter your search keywords: spa
Please enter the name of the city toulouse


In [8]:
# Run the crawl with the values entered above; the items are exported to
# data/<search_keyword>-<location_keyword>.json.
launch_crawler(search_keyword, location_keyword)


2021-07-04 15:37:41 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-07-04 15:37:41 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.6 | packaged by conda-forge | (default, Oct  7 2020, 19:08:05) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Linux-5.4.89+-x86_64-with-glibc2.10
2021-07-04 15:37:41 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2021-07-04 15:37:42 [scrapy.extensions.telnet] INFO: Telnet Password: 140396b7cb738c45
2021-07-04 15:37:42 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.

FINISHED
