# Use XML sitemaps to scrape news articles & blog posts from a website

In [1]:
import trafilatura
from trafilatura import sitemaps
# Starting URL of the site whose sitemaps are searched.
homepage = "https://www.sitemaps.org/"
# Search the site's sitemaps and keep the resulting list of URLs.
links = sitemaps.sitemap_search(homepage)
# print() writes the whole list; the bare len() is the cell's displayed result.
print(links)
len(links)

['https://www.sitemaps.org', 'https://www.sitemaps.org/da/', 'https://www.sitemaps.org/da/faq.html', 'https://www.sitemaps.org/da/protocol.html', 'https://www.sitemaps.org/da/terms.html', 'https://www.sitemaps.org/de/', 'https://www.sitemaps.org/de/faq.html', 'https://www.sitemaps.org/de/protocol.html', 'https://www.sitemaps.org/de/terms.html', 'https://www.sitemaps.org/en_GB/', 'https://www.sitemaps.org/en_GB/faq.html', 'https://www.sitemaps.org/en_GB/protocol.html', 'https://www.sitemaps.org/en_GB/terms.html', 'https://www.sitemaps.org/es/', 'https://www.sitemaps.org/es/faq.html', 'https://www.sitemaps.org/es/protocol.html', 'https://www.sitemaps.org/es/terms.html', 'https://www.sitemaps.org/faq.html', 'https://www.sitemaps.org/fi/', 'https://www.sitemaps.org/fi/faq.html', 'https://www.sitemaps.org/fi/protocol.html', 'https://www.sitemaps.org/fi/terms.html', 'https://www.sitemaps.org/fr/', 'https://www.sitemaps.org/fr/faq.html', 'https://www.sitemaps.org/fr/protocol.html', 'https://w

84

In [2]:
# Keep only the URLs whose text contains the substring "protocol".
[url for url in links if "protocol" in url]

['https://www.sitemaps.org/da/protocol.html',
 'https://www.sitemaps.org/de/protocol.html',
 'https://www.sitemaps.org/en_GB/protocol.html',
 'https://www.sitemaps.org/es/protocol.html',
 'https://www.sitemaps.org/fi/protocol.html',
 'https://www.sitemaps.org/fr/protocol.html',
 'https://www.sitemaps.org/it/protocol.html',
 'https://www.sitemaps.org/ja/protocol.html',
 'https://www.sitemaps.org/ko/protocol.html',
 'https://www.sitemaps.org/nl/protocol.html',
 'https://www.sitemaps.org/no/protocol.html',
 'https://www.sitemaps.org/pl/protocol.html',
 'https://www.sitemaps.org/protocol.html',
 'https://www.sitemaps.org/pt_BR/protocol.html',
 'https://www.sitemaps.org/ro/protocol.html',
 'https://www.sitemaps.org/ru/protocol.html',
 'https://www.sitemaps.org/sv/protocol.html',
 'https://www.sitemaps.org/tr/protocol.html',
 'https://www.sitemaps.org/zh_CN/protocol.html',
 'https://www.sitemaps.org/zh_HK/protocol.html',
 'https://www.sitemaps.org/zh_TW/protocol.html']

In [3]:
# Repeat the sitemap search with target_lang="en". This rebinds `links`,
# so every cell below works with the shorter list displayed here.
links = sitemaps.sitemap_search(homepage, target_lang="en")
links

['https://www.sitemaps.org',
 'https://www.sitemaps.org/en_GB/',
 'https://www.sitemaps.org/en_GB/faq.html',
 'https://www.sitemaps.org/en_GB/protocol.html',
 'https://www.sitemaps.org/en_GB/terms.html',
 'https://www.sitemaps.org/faq.html',
 'https://www.sitemaps.org/protocol.html',
 'https://www.sitemaps.org/terms.html']

In [4]:
# Number of URLs in `links` after the target_lang="en" search.
len(links)

8

In [5]:
from trafilatura import fetch_url, extract

# Download every URL in `links`. fetch_url() returns None when a download
# fails, so failed fetches are dropped here: the extraction cells below
# index into their results and would break on a None entry.
# Note: after filtering, positions in `downloaded` are no longer guaranteed
# to line up one-to-one with positions in `links`.
downloaded = [
    html
    for html in (fetch_url(url) for url in links)
    if html is not None
]
downloaded

['\ufeff<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\r\n\r\n<head>\r\n\r\n    <title>sitemaps.org - Home</title>\r\n\r\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\r\n    <script type="text/javascript">  var appInsights=window.appInsights||function(config){function i(config){t[config]=function(){var i=arguments;t.queue.push(function(){t[config].apply(t,i)})}}var t={config:config},u=document,e=window,o="script",s="AuthenticatedUserContext",h="start",c="stop",l="Track",a=l+"Event",v=l+"Page",y=u.createElement(o),r,f;y.src=config.url||"https://az416426.vo.msecnd.net/scripts/a/ai.0.js";u.getElementsByTagName(o)[0].parentNode.appendChild(y);try{t.cookie=u.cookie}catch(p){}for(t.queue=[],t.version="1.0",r=["Event","Exception","Metric","PageView","Trace","Dependency"];r.length;)i("track"+r.pop());return i("set"+s),i("clear"+s),i(h+a),i(c

In [6]:
# Run extract() with its default settings on each downloaded page,
# then preview the first 500 characters of the first result.
texts = list(map(extract, downloaded))
texts[0][:500]

'What are Sitemaps?\nSitemaps are an easy way for webmasters to inform search engines about pages on their sites that are available for crawling. In its simplest form, a Sitemap is an XML file that lists URLs for a site along with additional metadata about each URL (when it was last updated, how often it usually changes, and how important it is, relative to other URLs in the site) so that search engines can more intelligently crawl the site.\nWeb crawlers usually discover pages from links within th'

In [7]:
# Redo the extraction with output_format="xml" (this rebinds `texts`),
# then preview the first 500 characters of the first document.
texts = [
    extract(page, output_format="xml")
    for page in downloaded
]
texts[0][:500]

'<doc sitename="sitemaps.org" title="Home" date="2020-04-17" description="The Sitemaps protocol enables webmasters to information earch engine about pages on their site that are available for crawling." categories="" tags="" fingerprint="4448db0242e4adac">\n  <main>\n    <head rend="h1">What are Sitemaps?</head>\n    <p>Sitemaps are an easy way for webmasters to inform search engines about pages on their sites that are available for crawling. In its simplest form, a Sitemap is an XML file that lists'

In [8]:
# Redo the extraction with output_format="csv" (this rebinds `texts`).
# The preview below shows the fields separated by tab characters.
texts = [
    extract(page, output_format="csv")
    for page in downloaded
]
texts[0][:500]

'None\t4448db0242e4adac\tNone\tHome\tNone\t2020-04-17\tWhat are Sitemaps? Sitemaps are an easy way for webmasters to inform search engines about pages on their sites that are available for crawling. In its simplest form, a Sitemap is an XML file that lists URLs for a site along with additional metadata about each URL (when it was last updated, how often it usually changes, and how important it is, relative to other URLs in the site) so that search engines can more intelligently crawl the site. Web craw'

In [9]:
# Same sitemap lookup from the shell: the leading "!" makes Jupyter run the
# trafilatura command-line tool. With --sitemap and --list the URLs are
# printed one per line, as the output below shows.
!trafilatura --sitemap "https://www.sitemaps.org/" --list

https://www.sitemaps.org/
https://www.sitemaps.org/da/
https://www.sitemaps.org/da/faq.html
https://www.sitemaps.org/da/protocol.html
https://www.sitemaps.org/da/terms.html
https://www.sitemaps.org/de/
https://www.sitemaps.org/de/faq.html
https://www.sitemaps.org/de/protocol.html
https://www.sitemaps.org/de/terms.html
https://www.sitemaps.org/en_GB/
https://www.sitemaps.org/en_GB/faq.html
https://www.sitemaps.org/en_GB/protocol.html
https://www.sitemaps.org/en_GB/terms.html
https://www.sitemaps.org/es/
https://www.sitemaps.org/es/faq.html
https://www.sitemaps.org/es/protocol.html
https://www.sitemaps.org/es/terms.html
https://www.sitemaps.org/faq.html
https://www.sitemaps.org/fi/
https://www.sitemaps.org/fi/faq.html
https://www.sitemaps.org/fi/protocol.html
https://www.sitemaps.org/fi/terms.html
https://www.sitemaps.org/fr/
https://www.sitemaps.org/fr/faq.html
https://www.sitemaps.org/fr/protocol.html
https://www.sitemaps.org/fr/terms.html
https://www.sitemaps.org/it/
https://www.sitem