In [None]:
import json
import re
from ast import literal_eval
from urllib.parse import quote_plus, urljoin

import scrapy
from bs4 import BeautifulSoup
from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter
from selenium import webdriver

Documentation for the `re` module is available [here](https://docs.python.org/3/library/re.html).

In [None]:
def clean_string(st):
    r'''
    Receives a value (usually a string) and normalizes it for CSV export.
    Removes every \n, \t, comma and semicolon, then strips leading/trailing
    whitespace. Stripping happens after the removal so that whitespace exposed
    by a deleted separator (e.g. 'a ;') does not survive in the result.
    Non-string values are converted with str() before cleaning.
    Returns the clean string, or None when the input is None.
    '''
    if st is None:
        return None
    return re.sub(r'[\n\t,;]', '', str(st)).strip()

**Search info**:

In [None]:
# Root address of the site; the spider starts crawling from it.
base_url = 'https://www.doctoralia.com.br'
# Search URL template, filled by the spider as:
# {0} = specialty id, {1} = specialty name, {2} = place.
search_url = ''.join((base_url, '/pesquisa?filters%5Bspecializations%5D%5B%5D={0}&q={1}&loc={2}'))
# Location used in every search.
place = 'Bahia'
# Upper bound on the number of result pages visited per specialty.
max_pages = 500

# Keys read from each entry of the specialty list (id and display name).
spec_key = 'specialization.id'
spec_value = 'specialization.name'

**I/O info**:

In [None]:
# Destination CSV written by the pipeline, and path of the ChromeDriver binary
# handed to selenium.
output_file, driver_file = '../data/doctoralia_raw.csv', '../chromedriver.exe'

**`scrapy` info**:

In [None]:
# Names of the item fields; reused below as scrapy's FEED_EXPORT_FIELDS.
fields_list = [
    'nome',
    'endereco',
    'telefone',
    'especialidade',
    'nota',
    'registro',
]

# Settings handed to the crawler process.
settings = {}
# The pipeline class lives in this notebook, hence the `__main__` module path.
settings['ITEM_PIPELINES'] = {'__main__.DoctoraliaPipeline': 0}
settings['FEED_EXPORT_FIELDS'] = fields_list

***

More on `scrapy.Item`s [here](https://docs.scrapy.org/en/latest/topics/items.html).

In [None]:
class DoctoraliaItem(scrapy.Item):
    '''
    Scraped record holding one `scrapy.Field` per name in `fields_list`.
    '''

    # Declared as a ready-made mapping (instead of one attribute per field) so
    # the set of fields follows `fields_list`.
    fields = dict((field_name, scrapy.Field()) for field_name in fields_list)

More on `scrapy` item pipelines [here](https://docs.scrapy.org/en/latest/topics/item-pipeline.html).
- Documentation for `pydispatch` module [here](https://pypi.org/project/PyDispatcher/).

In [None]:
class DoctoraliaPipeline(object):
    '''
    Item pipeline that cleans every scraped item and writes it to `output_file`
    as a semicolon-delimited CSV (multi-valued fields are joined with commas).
    '''

    def __init__(self):
        # Ties the output file / exporter lifecycle to the spider lifecycle.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @staticmethod
    def _unique(values):
        '''
        Returns the distinct values as a list, keeping first-seen order.
        (A plain `set` would make the order of multi-valued CSV cells vary
        from run to run.)
        '''
        return list(dict.fromkeys(values))

    def spider_opened(self, spider):
        '''Opens the output file and starts the CSV export.'''
        self.file = open(output_file, 'w+b')
        self.exporter = CsvItemExporter(file=self.file, join_multivalued=',', **{'delimiter': ';'})
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        '''Finishes the CSV export and closes the output file.'''
        try:
            self.exporter.finish_exporting()
        finally:
            # The file must be released even if the exporter fails to finish.
            self.file.close()

    def process_item(self, item, spider):
        '''
        Cleans the fields of `item`, exports it and returns it as a
        `DoctoraliaItem`.
        Raises `DropItem` when the item has no name.
        '''
        if item['nome'] is None:
            raise DropItem('Empty-named item.')

        item['nome'] = clean_string(item['nome'])
        item['registro'] = clean_string(item['registro'])

        item['endereco'] = self._unique(clean_string(end) for end in item['endereco'])
        item['telefone'] = self._unique(clean_string(tel) for tel in item['telefone'])

        # Specific case: removing "(descricao)" markers from specialties. The
        # result is stripped so that "X (descricao)" and "X" collapse into one.
        especialidades = (
            re.sub(r'\(desc.+\)', '', str(clean_string(spec))).strip()
            for spec in item['especialidade']
        )
        # Filtering blank specialties and duplicates
        item['especialidade'] = self._unique(esp for esp in especialidades if esp)

        item = DoctoraliaItem(item)
        self.exporter.export_item(item)

        return item

More on `scrapy.Spider`s [here](https://docs.scrapy.org/en/latest/topics/spiders.html).
- Documentation for `ast` module [here](https://docs.python.org/3/library/ast.html).
- (Unofficial) Documentation for `selenium` package [here](https://selenium-python.readthedocs.io/).
- Documentation for `bs4` (Beautiful Soup) library [here](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).

In [None]:
class DoctoraliaSpider(scrapy.Spider):
    '''
    Spider that collects professional profiles from Doctoralia.

    Flow: the home page provides the list of specialties; for each specialty a
    headless Chrome (selenium) walks the paginated search results for `place`;
    every result link is then requested through scrapy and handled by
    `parse_result`, which yields one dict per profile.
    '''

    name = 'doctoralia'
    start_urls = [base_url]

    def __init__(self, *args, **kwargs):
        # Lets scrapy.Spider run its own initialization (and accept spider
        # arguments) before the selenium options are prepared.
        super().__init__(*args, **kwargs)
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.browser = None

    def start_requests(self):
        '''Requests the home page, which carries the list of specialties.'''
        yield scrapy.http.Request(url=self.start_urls[0],
                                  callback=self.parse)

    @staticmethod
    def _load_specializations(raw):
        '''
        Parses the `data-json` payload into a Python list.
        Tries JSON first; falls back to a Python literal for payloads that are
        not strict JSON.
        '''
        try:
            return json.loads(raw)
        except ValueError:
            return literal_eval(raw)

    def parse(self, response):
        '''
        Reads the list of specialties from the home page and yields one request
        per search result found for each specialty.
        '''
        # Retrieves (string) list of specialties and parses it to a Python list
        str_list = response.xpath('//template[@data-template="search-autocomplete-specializations"]/@data-json').get()
        if str_list is None:
            self.logger.error('Specialty list not found at %s', response.url)
            return
        spec_list = self._load_specializations(str_list)

        # Opens browser
        # NOTE(review): passing the driver path positionally is the Selenium 3
        # calling convention - confirm the installed selenium accepts it.
        self.browser = webdriver.Chrome(driver_file, options=self.options)

        try:
            # Iterates over list of specialties
            for spec in spec_list:
                # quote_plus encodes spaces as '+' and also escapes characters
                # such as '&' that would otherwise break the query string.
                url = search_url.format(spec[spec_key], quote_plus(str(spec[spec_value])), place)

                # (Dynamically) Iterates over pages of results
                for _ in range(max_pages):
                    self.browser.get(url)
                    page_url = self.browser.current_url
                    # 'lxml' is the parser bs4 picks by itself when lxml is
                    # installed (scrapy depends on it); naming it keeps the
                    # parse stable and silences the "no parser" warning.
                    soup = BeautifulSoup(self.browser.page_source, 'lxml')

                    # Visits each result's particular page
                    for link in soup.select('a[itemprop="name"]'):
                        href = link.get('href')
                        if href:
                            yield scrapy.http.Request(url=urljoin(page_url, href),
                                                      callback=self.parse_result)

                    # Updates URL with next page to visit (if any)
                    next_link = soup.select_one('li.next > a')
                    if next_link is None or not next_link.get('href'):
                        break
                    url = urljoin(page_url, next_link['href'])
        finally:
            # Closes browser: quit() also ends the driver process, and the
            # finally clause runs even when the crawl is interrupted.
            self.browser.quit()

    def parse_result(self, response):
        '''Extracts the fields of one profile page and yields them as a dict.'''
        item = {}

        # Structuring item...
        nome = response.xpath('//div[@class="unified-doctor-header-info__name"]/span[@itemprop="name"]/text()') \
                       .extract_first()
        item['nome'] = nome

        # Street and locality are read from the same address block so they can
        # never be paired with a neighbour's data; blocks without a street are
        # skipped.
        enderecos = []
        for tag in response.xpath('//span[@itemprop="streetAddress"]'):
            street = tag.xpath('./span[@class="street"]/text()').extract_first()
            if street is None:
                continue
            locality = ', '.join(tag.xpath('./a/text()').extract())
            enderecos.append(', '.join((street, locality)))
        item['endereco'] = enderecos

        telefone = response.xpath('//div[@class="display-flex"]/a[@class="text-muted padding-left-2"]/b/text()') \
                   .extract()
        item['telefone'] = telefone

        especialidade = response.xpath('//span[@class="text-ellipsis"]/a/text()').extract()
        item['especialidade'] = especialidade

        registro = response.xpath('//div[@class="text-muted small"]/div/text()').extract_first()
        item['registro'] = registro

        nota = response.xpath('//span[@class="rating rating--lg"]/@data-score').extract_first()
        item['nota'] = nota

        yield item


Running the spider:

In [None]:
# Creates the crawler process from `settings`, schedules `DoctoraliaSpider`
# and starts the crawl.
process = CrawlerProcess(settings)
process.crawl(DoctoraliaSpider)
process.start()