Source: https://www.worldpresidentsdb.com/list/countries/

In [1]:
import logging
from urllib.parse import urljoin

import scrapy

In [2]:
class WorldPresidents(scrapy.Spider):
    """Spider for worldpresidentsdb.com.

    Crawl order: country index -> one page per country -> one page per
    president. One dict is yielded per president page, containing the page
    URL plus every "Label: value" pair found in the page's detail paragraphs.
    """
    name = 'worldpresidentsdb'
    BASE_URL = "https://www.worldpresidentsdb.com/"
    start_urls = [
        # BASE_URL already ends with "/", so the path must not start with one
        # (otherwise the request goes to ".com//list/countries/").
        BASE_URL + 'list/countries/',
    ]

    # Selector for the link list used on both the index and the country pages.
    _LINK_SELECTOR = ".container div.list-group a"
    # Label whose values are parsed into {"start", "end"} dicts and always
    # kept as a list, even when there is only one entry.
    _TERMS_KEY = "Terms"
    # Lower-cased prefix that marks a term which is still running.
    _SINCE_PREFIX = "in office since "

    def _follow_links(self, response, callback):
        """Yield a request to `callback` for every list-group link in `response`."""
        for link in response.css(self._LINK_SELECTOR):
            href = link.css('::attr("href")').get()
            if not href:
                # Anchor without an href: nothing to follow.
                continue
            # urljoin copes with "/path", "path" and absolute hrefs alike and
            # never produces a doubled slash after the host.
            yield response.follow(urljoin(self.BASE_URL, href), callback)

    def parse(self, response):
        """Country index page: follow every country link."""
        yield from self._follow_links(response, self.parse_country)

    def parse_country(self, response):
        """Country page: follow every president link."""
        yield from self._follow_links(response, self.parse_president)

    def parse_president(self, response):
        """President page: yield one dict with the URL and all extracted fields.

        Fields from later paragraphs overwrite same-named fields from earlier
        ones, because the per-paragraph dicts are merged with `dict.update`.
        """
        info = {
            "url": response.url
        }
        for p in response.css(".container div.row div.col-md-8 p"):
            p_info = self.extract_info(p)
            info.update(p_info)
        yield info

    def extract_info(self, p):
        """Turn the text nodes of one paragraph selector into a dict.

        A text node ending in ":" starts a new field (the colon is dropped);
        the non-empty text nodes that follow are that field's values. Fields
        with exactly one value are collapsed to that value; fields with zero
        or several values stay lists. The "Terms" field is always a list of
        {"start": ..., "end": ...} dicts.

        Args:
            p: a selector exposing `.css("::text").getall()`.

        Returns:
            dict mapping label -> value, list of values, or list of term dicts.
            Empty when the paragraph contains no label.
        """
        collected = {}
        key = None
        for text in p.css("::text").getall():
            text = text.strip()
            if not text:
                continue
            if text.endswith(":"):
                key = text[:-1]
                collected[key] = []
            elif key is None:
                # Value text before any label: there is no field to attach it to.
                continue
            elif key == self._TERMS_KEY:
                term = self._parse_term(text)
                if term is not None:
                    collected[key].append(term)
            else:
                collected[key].append(text)

        info = {}
        for field, values in collected.items():
            if field != self._TERMS_KEY and len(values) == 1:
                # Collapse to the single value itself (not its first character).
                info[field] = values[0]
            else:
                info[field] = values
        return info

    @classmethod
    def _parse_term(cls, text):
        """Parse one text node of the "Terms" field.

        Only nodes starting with ") " carry the dates; anything else yields
        None so the caller can skip it.

        Returns:
            {"start": str, "end": str or None}, or None for non-date nodes.
            "end" is None for "in office since ..." entries and for entries
            without a " to " separator.
        """
        if not text.startswith(") "):
            return None
        term = text[2:]
        if term.lower().startswith(cls._SINCE_PREFIX):
            # Slice by prefix length so the match stays case-insensitive.
            return {"start": term[len(cls._SINCE_PREFIX):], "end": None}
        # partition never raises, whatever the number of " to " occurrences.
        start, sep, end = term.partition(" to ")
        return {"start": start, "end": end if sep else None}

In [3]:
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import JsonLinesItemExporter

In [4]:
# Feed export options: JSON Lines output, UTF-8 encoded, replacing any
# previous file of the same name.
feed_options = {
    "format": "jsonlines",
    "encoding": "utf8",
    "overwrite": True,
}

# Settings handed to the crawler process; the feed is keyed by its output path.
crawler_settings = {
    "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "FEEDS": {"worldpresidentsdb.json": feed_options},
    "LOG_LEVEL": "INFO",
}

process = CrawlerProcess(settings=crawler_settings)

# Schedule the spider; the crawl itself only runs once process.start() is called.
process.crawl(WorldPresidents)

2021-01-04 18:18:56 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-01-04 18:18:56 [scrapy.utils.log] INFO: Versions: lxml 4.6.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.7.3 (default, Mar 27 2019, 16:54:48) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1i  8 Dec 2020), cryptography 2.8, Platform Darwin-19.6.0-x86_64-i386-64bit
2021-01-04 18:18:56 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 'INFO',
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2021-01-04 18:18:56 [scrapy.extensions.telnet] INFO: Telnet Password: da4496763a7930d3
2021-01-04 18:18:56 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2021-01-04 18:18:56 [scrapy.middleware] INFO: Enabled d

<Deferred at 0x7fd938c68e10>

In [5]:
%%time
# Run the crawl scheduled on `process`. Per the Scrapy documentation, start()
# runs the Twisted reactor and blocks until all scheduled crawls have finished.
# NOTE(review): a Twisted reactor generally cannot be restarted, so re-running
# this cell in the same kernel probably requires a kernel restart — confirm.
process.start()

2021-01-04 18:19:03 [scrapy.core.engine] INFO: Closing spider (finished)
2021-01-04 18:19:04 [scrapy.extensions.feedexport] INFO: Stored jsonlines feed (544 items) in: worldpresidentsdb.json
2021-01-04 18:19:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 241323,
 'downloader/request_count': 618,
 'downloader/request_method_count/GET': 618,
 'downloader/response_bytes': 1933204,
 'downloader/response_count': 618,
 'downloader/response_status_count/200': 618,
 'dupefilter/filtered': 36,
 'elapsed_time_seconds': 7.151829,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2021, 1, 4, 23, 19, 4, 838),
 'item_scraped_count': 544,
 'log_count/INFO': 11,
 'memusage/max': 69541888,
 'memusage/startup': 69541888,
 'request_depth_max': 2,
 'response_received_count': 618,
 'scheduler/dequeued': 618,
 'scheduler/dequeued/memory': 618,
 'scheduler/enqueued': 618,
 'scheduler/enqueued/memory': 618,
 'start_time': datetime.datetime(2021, 1, 4, 23, 18

CPU times: user 6.24 s, sys: 65.4 ms, total: 6.3 s
Wall time: 7.16 s
