# Automatic scraping of research publications for Elicit search tasks

This notebook contains code to automate the retrieval of lists of research papers, typically from the websites of third-party think tanks.

Between websites, this task differs mainly in the classes or paths of the HTML elements that contain each research paper's title, authors, date, etc. Therefore, this notebook contains one function that is (hopefully) powerful and generic enough to scrape any think tank website, given a small Python dict that specifies how to do so for the website in question.
A number of such specification dicts are then provided for think tanks covering the development and societal effects of technology.

The goal of this effort is to assist users in creating such specifications for new websites. Evolving attempts can be found at the bottom of the notebook.

First, we install any non-standard Python libraries:

In [None]:
!pip install jsonpath_ng
!pip install pyppeteer
!pyppeteer-install
!pip install nest_asyncio
!apt install chromium  # pyppeteer downloads its own chromium, but gotta pull in the gazillion libraries

Collecting jsonpath_ng
  Downloading jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)
Collecting ply
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 4.5 MB/s 
[?25hInstalling collected packages: ply, jsonpath-ng
Successfully installed jsonpath-ng-1.5.3 ply-3.11
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting pyppeteer
  Downloading pyppeteer-0.2.5-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 11.0 MB/s 
[?25hCollecting pyee<9.0.0,>=8.1.0
  Downloading pyee-8.1.0-py2.py3-none-any.whl (12 kB)
Collecting importlib-metadata<3.0.0,>=2.1.1
  Downloading importlib_metadata-2.1.1-py2.py3-none-any.whl (10 kB)
Collecting websockets<9.0,>=8.1
  Downloading websockets-8.1-cp37-cp37m-manylinux2010_x86_64.whl (79 kB)
[K     |████████████████████████████████| 79 kB 16.8 MB/s 
[?25hCollecting appdirs<2.0.0,>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any

The following `Extractor` class implements the generic scraping procedure:

In [None]:
import requests
from io import StringIO
from datetime import datetime
import time
from lxml import etree
from lxml.etree import XML, ElementTree
import lxml.html
import sys

# asyncio is part of the standard library and is needed by Extractor.run() even
# when only the plain "requests" strategy is used, so import it unconditionally
# instead of tying it to the optional pyppeteer import.
import asyncio

# pyppeteer is optional: without it only the "chrome" request strategy is unavailable.
try:
    import pyppeteer
    HAS_PYPPETEER = True
except Exception as err:
    print("Could not import Pyppeteer", err)
    HAS_PYPPETEER = False

import jsonpath_ng
import json
import traceback




import nest_asyncio
nest_asyncio.apply()



# Delimiter written between individual scraped items in the output files.
SEPARATOR = "\n\n\n"

import logging

# Logger used throughout this notebook: DEBUG verbosity on the named logger,
# with records rendered as "LEVEL:message" by the root configuration.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s')

class Extractor:
    """Spec-driven scraper for paginated publication listings.

    A *spec* is a dict describing one website. ``run`` reads these keys:

    - ``name``: label used in log messages.
    - ``url_funcs``: a callable, or a list of callables, each building a page
      URL from the arguments supplied by the matching ``url_funcs_args`` entry.
    - ``url_funcs_args``: per URL function, either an iterable yielding one
      argument list per page, or a callable that receives the previous page's
      raw content and returns the next argument list (``None`` to stop).
    - ``items_path``: XPath (HTML) or JSONPath (JSON) selecting one node per item.
    - ``fields``: list of dicts with a ``key`` plus one of ``xpath``,
      ``jsonpath`` or ``extractor`` (a callable applied to the item).
    - optional ``content_type`` ("html" by default, or "json"),
      ``request_strategy`` ("requests" by default, or "chrome"), ``headless``,
      ``delay`` and ``preprocessor`` (callable applied to the raw page text).

    ``browser_times`` and ``processing_times`` are kept on the instance, so the
    totals logged by ``run`` accumulate when one Extractor is reused.
    """

    def __init__(self):
        self.browser = None
        self.browser_times = []
        self.processing_times = []
        self.headless = True
        # Set by run(); initialised here so the extract_* helpers also work standalone.
        self.debug = False

    async def start_browser(self, headless):
        """Launch the pyppeteer browser, restarting it if the headless mode changed."""
        if self.browser is None or headless != self.headless:
            await self.close_browser()
            # Record the requested mode before logging so the message is accurate.
            self.headless = headless
            log.info(f"Starting browser (headless={self.headless})...")
            self.browser = await pyppeteer.launch(headless=self.headless, args=["--no-sandbox"])
            logging.getLogger("pyppeteer").setLevel(logging.WARNING)
            log.info("Browser started.")

    async def close_browser(self):
        """Close the browser if one is running; a no-op otherwise."""
        if self.browser is not None:
            if HAS_PYPPETEER:
                await self.browser.close()
            self.browser = None
            log.info("\nBrowser closed.")

    async def get_raw_content(self, url, request_strategy="requests", headless=True, delay=0):
        """
        Runs a HTTP(S) request to the url and returns the page's source code.

        For the vast majority of pages, a simple GET request via the requests library is sufficient (request_strategy="requests").

        However, some websites require additional interaction, e.g. JS or scrolling,
        either due to overzealous DDOS protections like Cloudflare or due to dynamic loading strategies (Medium blogs are one example).
        For these pages, a Chromium browser driven by pyppeteer can be used (request_strategy="chrome").

        ``delay`` is the number of seconds to wait before navigating (chrome strategy only).

        Raises LookupError on a non-200 response or when the strategy is not available.
        """

        start = time.time()
        if request_strategy == "requests":
            # A timeout keeps one unresponsive server from hanging the whole run.
            r = requests.get(url, timeout=60)
            if r.status_code != 200:
                log.warning(f"Could not access {url}. HTTP status {r.status_code}")
                raise LookupError("HTTP non-200 response")
            raw_content = r.text

        elif request_strategy == "chrome" and HAS_PYPPETEER:
            await self.start_browser(headless)
            page = await self.browser.newPage()
            try:
                await asyncio.sleep(delay)
                response = await page.goto(url)
                # Use the response's status property: a "status" header is not
                # guaranteed to be present in the header dict.
                status = None if response is None else response.status
                if status != 200:
                    log.error(f"Status code: {status}")
                    if response is not None:
                        log.error(f"Full headers: {response.headers}")
                    raise LookupError(f"HTTP status {status} for {url}")
                raw_content = await page.content()
            finally:
                # Always release the tab, including on error paths.
                await page.close()

        else:
            log.warning(f"No viable implementation found for request strategy '{request_strategy}'")
            raise LookupError(f"Unsupported request strategy '{request_strategy}'")

        end = time.time()
        self.browser_times.append(end - start)
        return raw_content

    def normalize_space_in_string(self, s):
        """Collapse whitespace in ``s`` via XPath normalize-space().

        Returns "" for empty input, for input starting with a byte-order mark
        (U+FEFF), and for input lxml cannot parse.
        """
        if not s or s[0] == "\ufeff":
            return ""
        try:
            return lxml.html.fromstring(s).xpath("normalize-space(.)")
        except lxml.etree.ParserError:
            return ""

    def _xpath_result_to_text(self, result):
        """Turn the value returned by ``element.xpath(...)`` into display text."""
        if isinstance(result, str):
            return self.normalize_space_in_string(result.strip())
        if isinstance(result, list):
            texts = []
            for node in result:
                if isinstance(node, str):
                    raw = node
                elif hasattr(node, "text_content"):
                    # The path selected elements rather than text()/attributes.
                    raw = node.text_content()
                else:
                    raw = str(node)
                cleaned = self.normalize_space_in_string(raw.strip())
                if cleaned:
                    texts.append(cleaned)
            return ", ".join(texts)
        # Numbers/booleans (e.g. from count()) are passed through unchanged.
        return result

    def extract_items_by_xpath(self, item_els, fields):
        """Build one "Key: value" text record per HTML item element.

        Each field is evaluated either with its ``extractor`` callable (which
        receives the item element) or with its ``xpath`` relative to the item.
        Fields that yield nothing are omitted. Returns a list with one string
        per element of ``item_els``.
        """
        if self.debug and item_els:
            print("Item:", etree.tostring(item_els[0], pretty_print=True, method="html", encoding="unicode"))

        item_strs_in_tree = []
        for item_el in item_els:
            item_str = []
            for field in fields:
                if "extractor" in field:
                    # lxml elements have no ``.value``; hand the element itself to the callable.
                    result = field["extractor"](item_el)
                else:
                    result = self._xpath_result_to_text(item_el.xpath(field["xpath"]))

                if result:
                    item_str.append(f"{field['key']}: {result}")
            item_strs_in_tree.append("\n".join(item_str))

        return item_strs_in_tree

    def extract_items_by_jsonpath(self, item_els, fields):
        """Build one "Key: value" text record per JSONPath match.

        ``item_els`` are jsonpath_ng matches; each field is evaluated against
        ``match.value`` either with its ``extractor`` callable or with its
        ``jsonpath``. Fields that yield nothing are omitted.
        """
        # Compile every JSONPath once instead of once per item.
        parsers = [None if "extractor" in field else jsonpath_ng.parse(field["jsonpath"])
                   for field in fields]

        item_strs_in_tree = []
        for item_el in item_els:
            item_str = []
            for field, parser in zip(fields, parsers):
                if parser is None:
                    result = field["extractor"](item_el.value)
                else:
                    # str() so numeric or nested values cannot break the join.
                    values = [str(match.value) for match in parser.find(item_el.value)
                              if match.value is not None and match.value != ""]
                    result = ", ".join(values)

                if result:
                    item_str.append(f"{field['key']}: {result}")
            item_strs_in_tree.append("\n".join(item_str))

        return item_strs_in_tree

    def _extract_items(self, spec, url, raw_content):
        """Parse one page and return its item records ([] when the page has no items)."""
        content_type = spec.get("content_type", "html")
        if content_type == "html":
            tree = lxml.html.fromstring(raw_content)
            tree.make_links_absolute(url)
            item_els = tree.xpath(spec["items_path"])
            if not item_els:
                return []
            return self.extract_items_by_xpath(item_els, spec["fields"])
        if content_type == "json":
            item_els = jsonpath_ng.parse(spec["items_path"]).find(json.loads(raw_content))
            if not item_els:
                return []
            return self.extract_items_by_jsonpath(item_els, spec["fields"])
        raise ValueError(f"Unsupported content_type {content_type!r}")

    def _scrape_pages(self, event_loop, spec, url_func, url_func_args, all_item_strs, max_consecutive_errors):
        """Walk the pages of one URL function, appending new records to ``all_item_strs`` in place."""
        args_are_iterable = hasattr(url_func_args, '__iter__')
        args_iter = iter(url_func_args) if args_are_iterable else None
        # pyppeteer may be missing; an empty tuple makes the except clause match nothing.
        browser_errors = (pyppeteer.errors.BrowserError,) if HAS_PYPPETEER else ()

        previous_result = None
        consecutive_errors = 0
        i = -1
        while True:
            i += 1
            if args_are_iterable:
                args = next(args_iter, None)
            else:
                args = url_func_args(previous_result)

            if args is None:  # this isn't the same as [None]
                break

            # In debug mode only the first two pages are fetched.
            if self.debug and i > 1:
                break

            url = url_func(*args)
            if not url:
                break

            print(f"[{i}] Items already found: {len(all_item_strs)} (current url: {url})", end="\r", flush=True)

            try:
                start = time.time()

                raw_content = event_loop.run_until_complete(
                    self.get_raw_content(url,
                                         spec.get("request_strategy", "requests"),
                                         spec.get("headless", True),
                                         spec.get("delay", 0)))

                if "preprocessor" in spec:
                    raw_content = spec["preprocessor"](raw_content)

                previous_result = raw_content

                item_strs = self._extract_items(spec, url, raw_content)
                if not item_strs:
                    log.info(f"\nNo {'more ' if all_item_strs else ''}articles found, stopping.")
                    break

                count_before = len(all_item_strs)
                merged = all_item_strs + item_strs
                # dict.fromkeys de-duplicates while keeping first-seen order, so
                # the output follows the order of the listing.
                deduped = list(dict.fromkeys(merged))
                removed = len(merged) - len(deduped)
                all_item_strs[:] = deduped

                if len(deduped) == count_before:
                    log.info("\nNo more new items available, stopping.")
                    break
                if removed > 0:
                    log.info(f"\nRemoved {removed} duplicated item(s)")

                self.processing_times.append(time.time() - start)
                consecutive_errors = 0

            except LookupError as err:
                log.info(f"\nLookup Error: {err!r}")
                break
            except (AttributeError, ValueError):
                traceback.print_exc()
                break
            except KeyboardInterrupt:
                # Stops the current URL function only; results gathered so far are kept.
                log.info("\nReceived exit, exiting")
                break
            except RuntimeError as err:
                log.warning(f"\nRuntime error while scraping {url}: {err}")
                break
            except browser_errors as err:
                log.warning(f"\nBrowser Error: {err}")
                traceback.print_exc()
                break
            except Exception as err:
                log.info(f"\nError while trying to scrape {url}: {err}")
                traceback.print_exc()
                consecutive_errors += 1
                # Page arguments often come from an endless counter; without a
                # cap a persistent failure would never terminate.
                if consecutive_errors >= max_consecutive_errors:
                    log.warning(f"\nGiving up after {consecutive_errors} consecutive errors.")
                    break
                continue

    def run(self, spec, debug=False, outfile="/tmp/scraping_results", max_consecutive_errors=5):
        """Scrape every page described by ``spec`` and write the records to ``outfile``.

        Pages are fetched until a page yields no items, yields only items that
        were already seen, or an error stops the walk. Records are separated by
        SEPARATOR in the output.

        Args:
            spec: site specification dict (see the class docstring).
            debug: if true, fetch at most two pages per URL function, print the
                first item's HTML and the first five records, and write no file.
            outfile: path of the text file to write when ``debug`` is false.
            max_consecutive_errors: abandon a URL function after this many
                unexpected errors in a row.
        """
        self.debug = debug
        all_item_strs = []

        log.info(f"Attempting to scrape {spec['name']} ...")

        # Accept a single callable / argument source as well as lists of them,
        # without modifying the caller's spec.
        url_funcs = spec["url_funcs"]
        if not isinstance(url_funcs, list):
            url_funcs = [url_funcs]
        url_funcs_args = spec["url_funcs_args"]
        if not isinstance(url_funcs_args, list):
            url_funcs_args = [url_funcs_args]

        event_loop = asyncio.get_event_loop()
        try:
            for url_func, url_func_args in zip(url_funcs, url_funcs_args):
                self._scrape_pages(event_loop, spec, url_func, url_func_args,
                                   all_item_strs, max_consecutive_errors)

            log.info(f"\nProcessing complete. Items found: {len(all_item_strs)}.")
            log.info(f"\nRequest time: {sum(self.browser_times)}. Processing time: {sum(self.processing_times)}")

            if self.debug:
                print(SEPARATOR.join(all_item_strs[0:5]))
            else:
                with open(outfile, "w") as f:
                    f.write(SEPARATOR.join(all_item_strs))

        finally:
            event_loop.run_until_complete(self.close_browser())

Existing specifications:

In [None]:

import itertools
from cssselect import GenericTranslator, SelectorError
def css(css_selector):
    """Translate a CSS selector into an equivalent XPath expression.

    The specs below concatenate the returned string with further XPath steps,
    so an invalid selector is reported immediately rather than returning None
    and failing later with an unrelated TypeError.

    Raises:
        ValueError: if ``css_selector`` cannot be parsed by cssselect.
    """
    try:
        return GenericTranslator().css_to_xpath(css_selector)
    except SelectorError as err:
        raise ValueError(f"Invalid CSS selector: {css_selector!r}") from err

from datetime import datetime

# Scrape spec for CSET publications; the page number (starting at 1) goes into
# the fwp_paged query parameter.
# Note: "url_funcs" is a bare callable here rather than a list; Extractor.run
# accepts both forms.
cset = {
    "name": "CSET",
    "url_funcs": (lambda page_index: f"https://cset.georgetown.edu/publications/?fwp_paged={page_index}"),
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".teaser"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h4[1]/a/text()"
    }, {
        "key": "Abstract",
        "xpath": ".//p[1]/text()"
    }, {
        "key": "Authors",
        "xpath": ".//div[@class='teaser__authors']/span/text()"
    }, {
        "key": "Date",
        "xpath": ".//div[@class='teaser__top']/span[last()]/text()"
    }, {
        "key": "URL",
        "xpath": ".//h4[1]//a[1]/@href"
    }]
}

# Scrape spec for Brookings: two listings (the AI & emerging technology
# initiative and the technology-innovation topic), each paginated from page 1.
# Both listings share the same item and field selectors.
brookings = {
    "name": "Brookings",
    "url_funcs": [(lambda page_index: f"https://www.brookings.edu/project/artificial-intelligence-and-emerging-technology-initiative/page/{page_index}/?type=research"),
                 (lambda page_index: f"https://www.brookings.edu/topic/technology-innovation/page/{page_index}/?type=research")],
    "url_funcs_args": [([i] for i in itertools.count(1)),
                      ([i] for i in itertools.count(1))],
    "items_path": css(".article-info"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h4/a[1]/text()"
    }, {
        "key": "Authors",
        "xpath": ".//div[@class='authors']//span/text()",
    }, {
        "key": "Date",
        "xpath": ".//time/text()"
    }, {
        "key": "URL",
        "xpath": ".//h4/a[1]/@href"
    }]
}

# Scrape spec for CNAS reports; pages are addressed as /reports/p1, /reports/p2, ...
# Items are the <li> children of the first ".entry-listing" match.
cnas = {
    "name": "CNAS",
    "url_funcs": [(lambda page_index: f"https://www.cnas.org/reports/p{page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".entry-listing") + "[1]/li",
    "fields": [{
        "key": "Title",
        "xpath": css(".fz16") + "[1]/text()",
    }, {
        "key": "Authors",
        "xpath": css(".fz15") + "[1]/text()[2]"
    }, {
        "key": "Date",
        "xpath": ".//ul[1]/li[last()]/text()"
    }, {
        "key": "URL",
        "xpath": css(".fz16") + "[1]/@href"
    }]
}

# Scrape spec for the Center for Data Innovation report category; paginated
# from page 1 via the /page/<n> path segment.
cdi = {
    "name": "Center for Data Innovation",
    "url_funcs": [(lambda page_index: f"https://datainnovation.org/category/publications/reports/page/{page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".header-list-style"),
    "fields": [{
        "key": "Title",
        "xpath": css(".grid-title") + "[1]/a/text()",
    }, {
        "key": "Authors",
        "xpath": css(".grid-post-box-meta") + "[1]/a/text()"
    }, {
        "key": "Date",
        "xpath": css(".published") + "[1]/text()"
    }, {
        "key": "URL",
        "xpath": css(".grid-title") + "[1]/a/@href"
    }]
}

# Scrape spec for ITIF reports & briefings; the ?page= parameter starts at 0.
itif = {
    "name": "Information Technology & Innovation Foundation",
    "url_funcs": [(lambda page_index: f"https://itif.org/publications/reports-briefings?page={page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".views-row"),
    "fields": [{
        "key": "Title",
        "xpath": css(".views-field-title") + "[1]//a/text()",
    }, {
        "key": "Summary",
        "xpath": css(".views-field-body") + "[1]//" + css(".field-content") + "[1]/text()"
    }, {
        "key": "Date",
        "xpath": css(".views-field-field-date") + "[1]//" + css(".date-display-single") + "[1]/text()"
    }, {
        "key": "URL",
        "xpath": css(".views-field-title") + "[1]//a/@href",
    }]
}

# Scrape spec for RAND's science-and-technology topic page; the ?page=
# parameter starts at 1. Items are the <li> children of the organic teaser list.
# (This spec was previously defined twice with identical content; one
# definition is sufficient.)
rand = {
    "name": "RAND",
    "url_funcs": [(lambda page_index: f"https://www.rand.org/topics/science-and-technology.html?page={page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css("ul.teasers.list.organic") + "/li",
    "fields": [{
        "key": "Title",
        "xpath": css(".title") + "[1]/a/text()",
    }, {
        "key": "Summary",
        "xpath": css("p") + "[1]/text()",
    }, {
        "key": "Date",
        "xpath": css(".date") + "/text()"
    }, {
        "key": "URL",
        "xpath": css(".title") + "[1]/a/@href",
    }]
}


# Scrape spec for Belfer Center reports & papers, filtered by the
# "Science & Technology" topic facet in the URL; the page parameter starts at 0.
belfer = {
    "name": "Belfer Center",
    "url_funcs": [(lambda page_index: f"https://www.belfercenter.org/research/publication-type/reports-papers?f%5B0%5D=topic%3AScience%20%26%20Technology&page={page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".teaser-body"),
    "fields": [{
        "key": "Title",
        "xpath": css(".title") + "[1]/a/span/text()",
    }, {
        "key": "Summary",
        "xpath": css(".field--name-field-summary") + "[1]/p/text()",
    }, {
        "key": "Authors",
        "xpath": css(".author") + "/text()",
    }, {
        "key": "Date",
        "xpath": css(".pub-date") + "/text()"
    }, {
        "key": "URL",
        "xpath": css(".title") + "[1]/a/@href",
    }]
}


# Scrape spec for CSIS analysis; the publication-type and topic filters are
# encoded as numeric IDs in the query string. The page parameter starts at 0.
csis = {
    "name": "Center for Strategic and International Studies",
    "url_funcs": [(lambda page_index:
                   f"https://www.csis.org/analysis?&type=publication&field_publication_type%5B1%5D=781&field_categories_field_topics%5B2%5D=822&page={page_index}"
                   )],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".ds-right"),
    "fields": [{
        "key": "Title",
        "xpath": css(".teaser__title") + "[1]/a/text()",
    }, {
        "key": "Summary",
        "xpath": css(".teaser__text") + "[1]/text()",
    }, {
        "key": "Authors",
        "xpath": css(".teaser__expert") + "/a/text()",
    }, {
        "key": "Date",
        "xpath": css(".date-display-single") + "/text()"
    }, {
        "key": "URL",
        "xpath": css(".teaser__title") + "[1]/a/@href",
    }]
}


# Scrape spec for MITRE: two keyword listings (artificial-intelligence and
# computer-security), each paginated from page 0 with shared selectors.
mitre = {
    "name": "MITRE",
    "url_funcs": [(lambda page_index:
                    f"https://www.mitre.org/publication-keywords/artificial-intelligence?page={page_index}"
                   ),
                  (lambda page_index:
                    f"https://www.mitre.org/publication-keywords/computer-security?page={page_index}"
                   ),
                  ],
    "url_funcs_args": [([i] for i in itertools.count(0)), ([i] for i in itertools.count(0))],
    "items_path": css(".list-main.list-item"),
    "fields": [{
        "key": "Title",
        "xpath": css(".title") + "[1]/text()",
    }, {
        "key": "Summary",
        "xpath": css(".teaser") + "[1]/text()",
    }, {
        "key": "Date",
        "xpath": css(".date-display-single") + "/text()"
    }, {
        "key": "URL",
        # No leading ".//": only <a> elements that are direct children of the item match.
        "xpath": "a[1]/@href",
    }]
}


# Scrape spec for CSBA publications restricted to two category IDs.
# The URL takes an item offset (P0, P6, P12, ...) rather than a page number,
# hence page_index*6 -- presumably six items per page; verify against the site.
csba = {
    "name": "Center for Strategic and Budgetary Assessments",
    "url_funcs": [(lambda page_index:
                    f"https://csbaonline.org/research/publications/P{page_index*6}?categories%5B%5D=132&categories%5B%5D=131"
                   ),
                  ],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".research-publications-articles .article"),
    "fields": [{
        "key": "Title",
        "xpath": css(".article-title") + "[1]/a/text()",
    }, {
        "key": "Summary",
        "xpath": ".//p/text()",
    }, {
        "key": "Authors",
        "xpath": css(".article-meta") + "/a/text()", #"*[self::a or self::span[not(@class='sep')]]/text()",
    }, {
        "key": "Date",
        "xpath": ".//time/text()"
    }, {
        "key": "URL",
        "xpath": css(".article-title") + "[1]/a/@href",
    }]
}


# Scrape spec for Wilson Center publications (10 per page per the _limit
# parameter, filtered by program IDs); the _page parameter starts at 1.
# Only title and URL are extracted.
wilson = {
    "name": "Wilson Center",
    "url_funcs": [(lambda page_index:
                   f"https://www.wilsoncenter.org/insight-analysis?_page={page_index}&keywords=&_limit=10&programs=116,400,526&types=publication"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".teaser"),
    "fields": [{
        "key": "Title",
        "xpath": css(".teaser-title-text") + "[1]/text()",
    }, {
        "key": "URL",
        # The href is read from the item element itself.
        "xpath": "./@href",
    }]
}

# Scrape spec for Atlantic Council research; the page number (starting at 1)
# is passed through a block-specific query parameter.
# Title and URL paths have no leading ".//", so they match only an <h4> that is
# a direct child of the item element.
ac = {
    "name": "Atlantic Council",
    "url_funcs": [(lambda page_index:
                   f"https://www.atlanticcouncil.org/insights-impact/research/?ac-page-collection-block_5d7284e0aa316={page_index}"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".gta-post-embed--content"),
    "fields": [{
        "key": "Title",
        "xpath": "h4/a/text()",
    }, {
        "key": "Summary",
        "xpath": css(".gta-post-embed--excerpt") + "/text()",
    }, {
        "key": "Authors",
        "xpath": css(".gta-post-site-banner--tax--expert") + "/text()",
    }, {
        "key": "Date",
        "xpath": css(".gta-post-embed--heading") + "/text()",
    }, {
        "key": "URL",
        "xpath": "h4/a/@href",
    }]
}

def _newamerica_authors(item):
    """Return the item's authors as "First Last, First Last".

    Missing or null name parts are skipped, so a partial name does not raise
    or leave stray spaces/commas behind.
    """
    names = []
    for author in (item.get("authors") or []):
        parts = [part for part in (author.get("first_name"), author.get("last_name")) if part]
        if parts:
            names.append(" ".join(parts))
    return ", ".join(names)


# Scrape spec for New America reports, read from the site's JSON API.
# Pagination follows the API's own cursor: the first request uses a fixed URL,
# and every later request uses the "next" URL found in the previous response.
# The args entry is therefore a callable (fed the previous raw response by
# Extractor.run) instead of a page counter.
newamerica = {
    "name": "New America",
    "url_funcs": [(lambda previous_result:
                        "https://www.newamerica.org/api/post/?page_size=1000&content_type=report" if previous_result is None else json.loads(previous_result)["next"])],
    "url_funcs_args": [lambda previous_result: [previous_result]],
    "items_path": "results.[*]",
    "content_type": "json",
    "fields": [{
        "key": "Title",
        "jsonpath": "title"
    }, {
        "key": "Excerpt",
        "jsonpath": "story_excerpt"
    }, {
        "key": "Authors",
        "extractor": _newamerica_authors,
    }, {
        "key": "Date",
        "jsonpath": "date"
    }, {
        "key": "URL",
        # The API returns site-relative URLs.
        "extractor": lambda i: f"https://newamerica.org{i['url']}"
    }],
}


# Scrape spec for the Alliance for Securing Democracy policy-paper category;
# paginated from page 1. Every <article> element on the page is one item.
afsd = {
    "name": "Alliance for Securing Democracy",
    "url_funcs": [(lambda page_index:
                   f"https://securingdemocracy.gmfus.org/category/policy-paper/page/{page_index}"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": ".//article",
    "fields": [{
        "key": "Title",
        "xpath": css(".entry-title") + "/a/text()",
    }, {
        "key": "Summary",
        "xpath": css(".fusion-post-content-container") + "/p[1]/text()",
    }, {
        "key": "Authors",
        "xpath": css(".vcard .fn") + "[1]/a/text()",
    }, {
        "key": "Date",
        "xpath": css(".asd-meta__date") + "/text()",
    }, {
        "key": "URL",
        "xpath": css(".entry-title") + "/a/@href"
    }]
}


# Scrape spec for R Street research commentary on technology & innovation;
# paginated from page 1.
# Uses the "chrome" request strategy, i.e. pages are loaded through the
# pyppeteer-driven browser instead of a plain HTTP GET.
rstreet = {
    "name": "R Street",
    "url_funcs": [(lambda page_index:
                   f"https://www.rstreet.org/publications/?_category=research-commentary&_issue=technology-innovation&_paged={page_index}"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "request_strategy": "chrome",
    "items_path": css(".w-full.pt-4"),
    "fields": [{
        "key": "Title",
        "xpath": css("a.mb-4") + "/text()",
    }, {
        "key": "Summary",
        "xpath": css(".pt-4") + "/p[1]/text()",
    }, {
        "key": "Authors",
        "xpath": css(".tracking-wider.text-sm") + "/text()",
    }, {
        "key": "Date",
        "xpath": ".//time/text()"
    }, {
        "key": "URL",
        "xpath": css("a.mb-4") + "/@href",
    }]
}


# Scrape spec for the Heritage Foundation: report listings under /technology
# and /cybersecurity, each paginated from page 0 with shared selectors.
# NOTE(review): both URLs carry the same taxonomy_term_tid=143 -- confirm that
# is intended for the cybersecurity listing.
heritage = {
    "name": "Heritage Foundation",
    "url_funcs": [(lambda page_index:
                    f"https://www.heritage.org/technology?taxonomy_term_tid=143&f[0]=content_type%3Areport&page={page_index}"
                   ),
                  (lambda page_index:
                    f"https://www.heritage.org/cybersecurity?taxonomy_term_tid=143&f[0]=content_type%3Areport&page={page_index}"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(0)), ([i] for i in itertools.count(0))],
    "items_path": css(".result-card"),
    "fields": [{
        "key": "Title",
        "xpath": css(".result-card__title") + "/text()",
    }, {
        "key": "Authors",
        "xpath": css(".result-card__link") + "/span/text()"
    }, {
        "key": "Date",
        "xpath": css(".result-card__date") + "/span/text()"
    }, {
        "key": "URL",
        "xpath": css(".result-card__title") + "/@href",
    }]
}



# Scrape spec for PIIE policy briefs; the ?page= parameter starts at 0.
peterson = {
    "name": "Peterson Institute for International Economics",
    "url_funcs": [(lambda page_index:
                   f"https://www.piie.com/research/publications/policy-briefs?page={page_index}"
                   ),],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".node--publication"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h3/a/text()",
    }, {
        "key": "Authors",
        "xpath": css(".field--contributor") + "/p/a/text()"
    }, {
        "key": "Date",
        "xpath": css(".date-display-single") + "/text()"
    }, {
        "key": "URL",
        "xpath": ".//h3/a/@href"
    }]
}


# -----------------  NEW ADDITIONS TO THE LIST -----------------------


# Scrape spec for the Stanford Internet Observatory publication list.
# The URL does not depend on a page number, so exactly one argument list is
# supplied: the page is fetched once instead of being re-requested until the
# duplicate check stops the loop.
fsi = {
    "name": "Stanford Freeman Spogli Institute, Internet Observatory",
    "url_funcs": [(lambda _: "https://cyber.fsi.stanford.edu/io/publications")],
    "url_funcs_args": [[[None]]],
    "items_path": css(".views-row"),
    "fields": [{
        "key": "Title",
        "xpath": css(".views-field-title") + "[1]//a/text()",
    }, {
        "key": "Authors",
        "xpath": css(".views-field-field-related-authors") + "[1]//li/text()",
    }, {
        "key": "URL",
        "xpath": css(".views-field-title") + "[1]//a/@href",
    }]
}


# Scrape spec for the Digital Forensic Research Lab, read from Medium's JSON
# API. One URL function per topic of the collection; each takes a page number
# (starting at 1) and a millisecond timestamp for the "to" parameter.
# Note: the generator expressions evaluate datetime.now() each time a page's
# arguments are drawn, so the timestamp differs slightly from page to page.
dfrlab = {
    "name": "Digital Forensic Research Lab",
    "url_funcs": [
        (lambda page_index, cur_timestamp: f"https://medium.com/_/api/collections/df0d49d8c59b/topics/f6dfe506de26?to={cur_timestamp}&page={page_index}"),  # technology
        (lambda page_index, cur_timestamp: f"https://medium.com/_/api/collections/df0d49d8c59b/topics/f361c4ce391a?to={cur_timestamp}&page={page_index}"),  # democracy
        (lambda page_index, cur_timestamp: f"https://medium.com/_/api/collections/df0d49d8c59b/topics/5046526b459a?to={cur_timestamp}&page={page_index}"),  # security
        (lambda page_index, cur_timestamp: f"https://medium.com/_/api/collections/df0d49d8c59b/topics/8893b0f7d055?to={cur_timestamp}&page={page_index}"),  # contributors
    ],
    "url_funcs_args": [
        ([i, int(datetime.now().timestamp() * 1000)] for i in itertools.count(1)),
        ([i, int(datetime.now().timestamp() * 1000)] for i in itertools.count(1)),
        ([i, int(datetime.now().timestamp() * 1000)] for i in itertools.count(1)),
        ([i, int(datetime.now().timestamp() * 1000)] for i in itertools.count(1)),
    ],
    # The response body is not valid JSON until its first 16 characters are removed.
    "preprocessor": lambda r: r[16:], # cut off first 16 letters of non-json gibberish
    "items_path": "payload.references.Post.*",
    "content_type": "json",
    "fields": [{
        "key": "Title",
        "jsonpath": "title"
    }, {
        "key": "Summary",
        "jsonpath": "content.subtitle"
    }, {
        "key": "Date",
        # firstPublishedAt is divided by 1000 (milliseconds -> seconds) and
        # formatted in the local time zone.
        "extractor": lambda i: datetime.fromtimestamp(float(i['firstPublishedAt']) / 1000).strftime("%d %B, %Y")
    }, {
        "key": "URL",
        "extractor": lambda i: f"https://medium.com/dfrlab/{i['uniqueSlug']}"
    }],
}

# Scrape spec for the Australian Strategic Policy Institute search listing,
# sorted by publication date; the page parameter starts at 0.
# Loaded through the browser with a visible (non-headless) window and a pause
# before each page. Currently commented out of source_specs below.
aspi = {
    "name": "Australian Strategic Policy Institute",
    "url_funcs": [(lambda page_index: f"https://www.aspi.org.au/search?sort_by=field_publication_date_common&page={page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "request_strategy": "chrome",
    "headless": False,
    "delay": 30,  # 30 second delay between pages, to avoid triggering the rate limiter
    "items_path": ".//div[@role='article']",
    "fields": [{
        "key": "Title",
        "xpath": ".//h4/a/span/text()"
    }, {
        "key": "Authors",
        "xpath": css(".group-right") + "[1]/p[last()]//a/text()",
    }, {
        "key": "Date",
        "xpath": css(".field--name-field-publication-date-common") + "[1]/text()",
    }, {
        "key": "URL",
        "xpath": ".//h4/a/@href"
    }]
}

# Scrape spec for StopFake.org's English research category; paginated from page 1.
stopfake = {
    "name": "StopFake.org",
    "url_funcs": [(lambda page_index: f"https://www.stopfake.org/en/category/research/page/{page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".item-details"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h3/a/text()"
    }, {
        "key": "Excerpt",
        "xpath": css(".td-excerpt") + "[1]/text()",
    }, {
        "key": "Date",
        "xpath": ".//time/text()"
    }, {
        "key": "URL",
        "xpath": ".//h3/a/@href"
    }]
}


# Scrape spec for NATO StratCom COE publications; the ?page= parameter starts at 1.
stratcom = {
    "name": "NATO Strategic Communications Centre of Excellence",
    "url_funcs": [(lambda page_index: f"https://stratcomcoe.org/publications?page={page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(1))],
    "items_path": css(".item-content"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h2/text()"
    }, {
        "key": "Date",
        "xpath": css(".date") + "[1]/text()"
    }, {
        "key": "URL",
        # No leading ".//": only <a> elements that are direct children of the item match.
        "xpath": "a/@href"
    }]
}


# Scrape spec for Lowy Institute publications; the /page/<n> segment starts at 0.
lowyinstitute = {
    "name": "Lowy Institute",
    "url_funcs": [(lambda page_index: f"https://www.lowyinstitute.org/all/publications/page/{page_index}")],
    "url_funcs_args": [([i] for i in itertools.count(0))],
    "items_path": css(".node-content"),
    "fields": [{
        "key": "Title",
        "xpath": ".//h2/a/text()"
    }, {
        "key": "Summary",
        "xpath": css(".summary_content") + "[1]/text()"
    }, {
        "key": "Authors",
        "xpath": css(".submitted") + "[1]/a/text()"
    }, {
        "key": "Date",
        "xpath": css(".date") + "[1]/text()"
    }, {
        "key": "URL",
        "xpath": ".//h2/a/@href"
    }]
}


# Scrape spec for FireEye threat intelligence reports: a single, unpaginated
# page. The one argument list [None] makes Extractor.run fetch it exactly once;
# the URL function ignores its argument.
fireeye = {
    "name": "FireEye Threat Intelligence Reports",
    "url_funcs": [(lambda _: "https://www.fireeye.com/current-threats/threat-intelligence-reports.html")],
    "url_funcs_args": [[[None]]],
    "items_path": css(".c01_item"),
    "fields": [{
        "key": "Title",
        "xpath": ".//a/text()"
    }, {
        "key": "Summary",
        "xpath": ".//span/text()"
    }, {
        "key": "URL",
        "xpath": ".//a/@href"
    }]
}

# Scrape spec for Graphika reports: a single, unpaginated page. The one
# argument list [None] makes Extractor.run fetch it exactly once; the URL
# function ignores its argument.
graphika = {
    "name": "Graphika Social Network Reports",
    "url_funcs": [(lambda _: "https://graphika.com/reports")],
    "url_funcs_args": [[[None]]],
    "items_path": css(".report-item"),
    "fields": [{
        "key": "Title",
        "xpath": ".//div//a/h2/text()"
    }, {
        "key": "Summary",
        # Matches only <p> elements carrying an empty class attribute.
        "xpath": ".//div/p[@class='']/text()"
    }, {
        "key": "Date",
        "xpath": ".//small/text()"
    }, {
        "key": "URL",
        "xpath": ".//div//a/h2/parent::a/@href"
    }]
}


In [None]:

# Specs scraped by this cell, in execution order.
source_specs = [
    cset,
    brookings,
    cnas,
    cdi,
    itif,
    rand,
    belfer,
    csis,
    mitre,
    csba,
    wilson,
    ac,
    newamerica,
    afsd,
    rstreet,  # warning -- requires headless chrome
    heritage,
    peterson,
    fsi,
    dfrlab,
    # aspi,  # exclude for now, due to cloudflare issues
    stopfake,
    stratcom,
    lowyinstitute,
    fireeye,
    graphika,
]

# A single Extractor is reused for every spec, so the request/processing
# totals it logs accumulate across sites. Each site's records are written to
# "./<name>.scrape.txt".
e = Extractor()
for s in source_specs:
    e.run(s, debug=False, outfile=f"./{s['name']}.scrape.txt")


























INFO:
Lookup Error
INFO:
Processing complete. Items found: 227.
INFO:
Request time: 8.654510021209717. Processing time: 8.752454042434692
INFO:
Lookup Error
INFO:
Lookup Error
INFO:
Processing complete. Items found: 526.
INFO:
Request time: 37.826335430145264. Processing time: 39.02401375770569
INFO:
Lookup Error
INFO:
Processing complete. Items found: 539.
INFO:
Request time: 50.2677845954895. Processing time: 51.764790058135986
INFO:
Lookup Error
INFO:
Processing complete. Items found: 36.
INFO:
Request time: 51.392231702804565. Processing time: 52.94012475013733
INFO:
Processing complete. Items found: 652.
INFO:
Request time: 98.71362638473511. Processing time: 100.4991204738617
INFO:
Lookup Error
INFO:
Processing complete. Items found: 1132.
INFO:
Request time: 219.91992831230164. Processing time: 220.94064593315125
INFO:
Processing complete. Items found: 348.
INFO:
Request time: 287.39141392707825. Processing time: 288.9628851413727
INFO:
Processing complete. Items found: 341.
INF

In [None]:
from glob import glob
import os


def concat_files(file_list, outfilename):
    """Write the contents of every file in ``file_list`` into ``outfilename``.

    The inputs are processed in sorted name order, and the module-level
    ``SEPARATOR`` is written after each one. Each joined file name is printed
    as progress output.
    """
    with open(outfilename, "w") as merged:
        for name in sorted(file_list):
            print("joining", name)
            with open(name, "r") as part:
                contents = part.read()
            merged.write(contents)
            merged.write(SEPARATOR)

# Merge every "*.scrape.txt" file in the working directory into one combined file.
concat_files(glob('./*.scrape.txt'), "./merged_scrape.txt")
# Alternative (disabled): merge only a hand-picked subset of sources into its own file.
#concat_files([(ss["name"] + ".scrape.txt") for ss in [fsi, dfrlab, stopfake, stratcom, lowyinstitute, fireeye, graphika]], "./cybersecurity.scrape.txt")


joining ./Alliance for Securing Democracy.scrape.txt
joining ./Atlantic Council.scrape.txt
joining ./Belfer Center.scrape.txt
joining ./Brookings.scrape.txt
joining ./CNAS.scrape.txt
joining ./CSET.scrape.txt
joining ./Center for Data Innovation.scrape.txt
joining ./Center for Strategic and Budgetary Assessments.scrape.txt
joining ./Center for Strategic and International Studies.scrape.txt
joining ./Digital Forensic Research Lab.scrape.txt
joining ./FireEye Threat Intelligence Reports.scrape.txt
joining ./Graphika Social Network Reports.scrape.txt
joining ./Heritage Foundation.scrape.txt
joining ./Information Technology & Innovation Foundation.scrape.txt
joining ./Lowy Institute.scrape.txt
joining ./MITRE.scrape.txt
joining ./NATO Strategic Communications Centre of Excellence.scrape.txt
joining ./New America.scrape.txt
joining ./Peterson Institute for International Economics.scrape.txt
joining ./R Street.scrape.txt
joining ./RAND.scrape.txt
joining ./Stanford Freeman Spogli Institute, 

# WORK IN PROGRESS BELOW

In [None]:
from pyppeteer.launcher import Launcher
# Debugging aid: show the launcher's command (a sequence of arguments) as one space-joined string.
' '.join(Launcher().cmd)

'/root/.local/share/pyppeteer/local-chromium/588429/chrome-linux/chrome --disable-background-networking --disable-background-timer-throttling --disable-breakpad --disable-browser-side-navigation --disable-client-side-phishing-detection --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=site-per-process --disable-hang-monitor --disable-popup-blocking --disable-prompt-on-repost --disable-sync --disable-translate --metrics-recording-only --no-first-run --safebrowsing-disable-auto-update --enable-automation --password-store=basic --use-mock-keychain --headless --hide-scrollbars --mute-audio about:blank --remote-debugging-port=35859 --user-data-dir=/root/.local/share/pyppeteer/.dev_profile/tmpsus9muup'

In [None]:
#!/usr/bin/env python3


from pprint import pprint
from lxml import etree
from lxml.etree import XML, ElementTree
import lxml.html
from cssselect import GenericTranslator, SelectorError
from itertools import combinations
def css(css_selector):
    """Translate a CSS selector string into an XPath expression.

    Prints 'Invalid selector.' and returns None when cssselect rejects the
    selector.
    """
    try:
        xpath = GenericTranslator().css_to_xpath(css_selector)
    except SelectorError:
        print('Invalid selector.')
        return None
    else:
        return xpath

import re
from collections import defaultdict

# Miniature test page used by the functions below: a header div, an empty div,
# and two "article" divs that each hold an author heading and a title paragraph.
tree = lxml.html.fromstring("""<html>
  <div class="header"><h1>Welcome</h1></div>
  <div/>
  <div class="article">
    <h2><a>John Doe</a></h2>
    <p>Title: <span class="title small">The great article</span><span class="annotation">(out of print)</span></p>
    <div />
  </div>
  <div class="article">
    <h2><a>Jane Smith</a></h2>
    <p class="annotation">Guest contributor</p>
    <p>Title: <span>A small book</span>(out of stock)</p>
    <div />
  </div>
</html>""")
# ElementTree wrapper around the parsed document.
roottree = tree.getroottree()

def parse_template(template):
    """Split a template string into ``[key, value]`` pairs, one per non-blank line.

    Each line is expected to look like ``"Key: value"``. Only the first
    ``": "`` separates key from value, so a value may itself contain ``": "``
    (e.g. a title with a subtitle). Blank lines are ignored.

    Returns:
        A list of two-element lists ``[key, value]`` in line order. A
        non-blank line without ``": "`` yields a one-element list.
    """
    return [line.split(": ", 1) for line in template.split("\n") if line.strip()]

# Example records in "Key: value" form, one field per line; each value is
# looked up verbatim among the text nodes of the test page.
templates = ["""Author: John Doe
Title: The great article""",
"""Author: Jane Smith
Title: (out of stock)"""]

# Cache of results, keyed by (element, allow_nonunique_classes). The element object itself is
# the key (not id(element)) so that it stays alive and its identity cannot be reused.
potential_selectors = {}
def get_potential_selectors(e, allow_nonunique_classes=False):
    """
    Returns (tag, [<a list of strategies>]) for the element, where each strategy can be one of the following:
      * a set() of class names that no same-tag sibling carries (they single this element out)
      * a set() of class names shared with at least one same-tag sibling
        (only when allow_nonunique_classes is true; these can identify the element *and* similar siblings)
      * an integer, the 1-based position of the element among its same-tag siblings
      * the empty string "", if the element has no same-tag siblings

    Results are cached in the module-level ``potential_selectors`` dict.
    """
    cache_key = (e, allow_nonunique_classes)
    if cache_key in potential_selectors:
        return potential_selectors[cache_key]

    # only siblings with the same tag compete with this element in an xpath step
    preceding = [s for s in e.itersiblings(preceding=True) if s.tag == e.tag]
    following = [s for s in e.itersiblings(preceding=False) if s.tag == e.tag]
    siblings = following + preceding

    selectors = []
    if siblings:
        e_classes = set(e.attrib.get("class", "").split())
        some_siblings_classes = set()
        for s in siblings:
            # update() with the list adds whole class names (not their characters)
            some_siblings_classes.update(s.attrib.get("class", "").split())

        # does this one have a class that distinguishes it from the others?
        unique_identifying_classes = e_classes - some_siblings_classes
        if unique_identifying_classes:
            selectors.append(unique_identifying_classes)

        if allow_nonunique_classes:
            nonunique_identifying_classes = e_classes & some_siblings_classes
            if nonunique_identifying_classes:
                selectors.append(nonunique_identifying_classes)

        # positional fallback: xpath indices are 1-based and count same-tag siblings only
        selectors.append(len(preceding) + 1)

    result = (e.tag, selectors or [""])  # when no particular strategies have been identified, just use the tag.
    potential_selectors[cache_key] = result
    return result

def product(*sequences):
    '''Breadth First Search Cartesian Product

    Yields every tuple that takes one item from each input, ordered by the sum
    of the chosen indices: all-first-items comes first, all-last-items last.
    Early options of every input are therefore tried before late ones.

    Inputs may be any iterables; they are materialized up front. Like
    itertools.product, no inputs yields a single empty tuple and any empty
    input yields nothing.
    '''
    sequences = tuple(tuple(seq) for seq in sequences)
    if not sequences:
        yield ()
        return
    if any(not seq for seq in sequences):
        return

    def partitions(n, k):
        # all ways of writing n as an ordered sum of k non-negative integers
        for c in combinations(range(n + k - 1), k - 1):
            yield tuple(b - a - 1 for a, b in zip((-1,) + c, c + (n + k - 1,)))

    max_position = tuple(len(seq) - 1 for seq in sequences)
    for index_sum in range(sum(max_position)):
        for positions in partitions(index_sum, len(sequences)):
            # skip index combinations that run past the end of some sequence
            if all(pos <= top for pos, top in zip(positions, max_position)):
                yield tuple(seq[pos] for seq, pos in zip(sequences, positions))
    # the largest index sum has exactly one valid combination: the last item of every sequence
    yield tuple(seq[pos] for seq, pos in zip(sequences, max_position))

def get_xpath_part(selector):
    """Render one ``(tag, strategy)`` selector as a single xpath step.

    * a set/frozenset of class names -> ``tag[...]`` matching elements that
      carry any of the classes (classes are emitted in sorted order so the
      result is deterministic)
    * an int -> ``tag[n]``
    * the empty string -> the bare ``tag``

    Raises:
        TypeError: if the strategy is none of the above.
    """
    (tag, strategy) = selector
    if isinstance(strategy, (set, frozenset)):  # classes
        class_tests = [
            f"contains(concat(' ', normalize-space(@class), ' '), ' {c} ')"
            for c in sorted(strategy)
        ]
        return tag + "[@class and (" + " or ".join(class_tests) + ")]"
    if isinstance(strategy, int):  # index
        return tag + "[" + str(strategy) + "]"
    if strategy == "":  # no strategies, just use the tag itself
        return tag
    raise TypeError(f"unsupported selector strategy: {strategy!r}")

def compile_xpath(selectors):
    """Join selectors into a relative xpath.

    ``selectors`` is ordered innermost element first, so it is reversed to
    produce a path that descends from the outermost step.
    """
    steps = [get_xpath_part(step) for step in reversed(selectors)]
    return "./" + "/".join(steps)

def expand_selector_list(selector_list):
    """Yield one compiled xpath per combination of strategies.

    ``selector_list`` holds ``(tag, [strategies])`` pairs; one strategy is
    picked per element, in the order produced by ``product``.
    """
    tags, strategy_lists = zip(*selector_list)
    for chosen_strategies in product(*strategy_lists):
        tagged_steps = list(zip(tags, chosen_strategies))
        yield compile_xpath(tagged_steps)

def check_has_child(parent_node, selector_list):
    """
    Tell whether at least one of the xpaths that can be built from selector_list
    matches something underneath parent_node. Stops at the first xpath that matches.
    """
    for candidate_xpath in expand_selector_list(selector_list):
        if parent_node.xpath(candidate_xpath):
            return True
    return False

def find_likely_parent_item(template):
    """Guess which element of the module-level ``tree`` is the repeating "item" (e.g. an article div).

    Steps:
      1. find every element that has a text node equal to one of the template's values,
         and record its ancestor chain (element first, document root last);
      2. determine the ancestors shared by all of those chains;
      3. score each shared ancestor by how many of its siblings contain something that
         matches one of the selector paths leading to a found value;
      4. return the best-scoring shared ancestor (ties go to the deepest one).

    Raises:
        ValueError: if none of the template's values occurs in the document.
    """
    wanted_values = [value for (_key, value) in parse_template(template)]

    parents_lists = []
    for e in tree.iter():
        for et in e.xpath("text()"):
            for value in wanted_values:
                if et == value:
                    parents_lists.append([e] + list(e.iterancestors()))

    if not parents_lists:
        raise ValueError("none of the template values were found in the document")

    # All chains end with the document root, so walk backwards from the end and count how many
    # trailing entries are identical across all chains. The walk is bounded by the shortest chain
    # so that chains which never diverge (e.g. a single match) cannot run off the end.
    shortest = min(len(parents) for parents in parents_lists)
    shared = 0
    while shared < shortest and len(set(parents[-1 - shared] for parents in parents_lists)) == 1:
        shared += 1
    if shared == 0:
        raise ValueError("the found elements do not share any ancestor")
    i = -shared  # negative index of the deepest shared ancestor, i.e. the first candidate item

    # build up all of the possible strategies of constructing xpaths for each found element
    potential_selector_lists = [
        [get_potential_selectors(pe, allow_nonunique_classes=True) for pe in parents]
        for parents in parents_lists
    ]

    # go up the tree, and see at which level we come across the most siblings for which (at least some of) the
    # target-finding paths match, i.e. siblings which also have the target elements, i.e. siblings that are also article divs.
    common_ancestors = parents_lists[0]  # entries at indices >= i are the same in every chain
    parent_scores = defaultdict(int)
    for j in range(i, 0):  # starting with the first potential *article* element ...
        p = common_ancestors[j]
        parent_siblings = list(p.itersiblings(preceding=True)) + list(p.itersiblings(preceding=False))
        matching_siblings = 0
        for ps in parent_siblings:
            # ... go through all of its siblings, and for each of them,
            for potential_selector_list in potential_selector_lists:
                # ... check if we can find one of the targets' equivalents underneath this sibling
                below_candidate = potential_selector_list[:j]
                # an empty path means the candidate *is* the found element; there is nothing to look for
                if below_candidate and check_has_child(ps, below_candidate):
                    matching_siblings += 1
                    break
        parent_scores[j] = matching_siblings

    parent_item_index = max(parent_scores, key=parent_scores.get)
    return common_ancestors[parent_item_index]  # the most likely article div

# Smoke test: print the item element inferred for the first example template.
print(find_likely_parent_item(templates[0]))


<Element div at 0x7fd854d17e90>


# automagic scraping: the great adventure (below -- work in progress)

In [None]:
#!/usr/bin/env python3

from pprint import pprint
from lxml import etree
from lxml.etree import XML, ElementTree
import lxml.html
from cssselect import GenericTranslator, SelectorError
import itertools
from functools import lru_cache
import copy
import numpy as np

def css(css_selector):
    """Convert a CSS selector into its XPath equivalent.

    When cssselect raises SelectorError, 'Invalid selector.' is printed and
    None is returned.
    """
    try:
        xpath = GenericTranslator().css_to_xpath(css_selector)
    except SelectorError:
        print('Invalid selector.')
        return None
    else:
        return xpath

import re
from collections import defaultdict, namedtuple

# Synthetic page for experimenting with the inductor: a header, a "feature" div and two
# "article" divs in which the example values ("John Doe", "John", ...) occur several times.
# NOTE(review): the <h2>/<span> inside the "feature" div are never closed -- presumably on
# purpose, to exercise lenient HTML parsing; confirm.
sample = """<html>
  <div class="header">
    <h1>Welcome</h1>
  </div>
  <div class="feature">
    <h2><span class="bold">
  </div>
  <div class="article">
    <h2><a>John Doe</a></h2>
    <h2><a>John Doe</a></h2>
    <h2><a>John Doe</a></h2>
    <h2><a>John Doe</a></h2>
    <h2>John Doe</h2>
    <p>Title: <span class="title small">The great article</span><span class="annotation">(out of print)</span></p>
    <div><span>John</span> and <span>Doe</span> and <p>John</p> <p>Doe</p></div>
  </div>
  <div class="article">
    <h2><a>Jane Smith</a></h2>
    <p class="annotation">Guest contributor</p>
    <p>Title: <span>A small book</span>(out of stock)</p>
    <div />
    <span>John</span>
  </div>
</html>"""

#with open("./tests/websites/fsi.html") as f:
#    sample = f.read()


# Example records for the sample page, one "Field: value" pair per line.
templates = ["""Author: John Doe
Title: The great article""",
"""Author: Jane Smith
Title: A small book"""]

#templates = ["""Title: Contours and Controversies of Parler
#Authors: David Thiel, Renee DiResta, Shelby Grossman, Elena Cryst""",
#"""Title: DeZurdaTeam: A Twitter network promotes pro-Cuba hashtags (TAKEDOWN)
#Authors: Elena Cryst, Shelby Perkins"""]


def template_prob_match(prob1, prob2):
    """Return the dot product of two per-template probability sequences.

    Pairs are formed with zip, so any surplus entries of the longer sequence
    are ignored; two empty sequences give 0.
    """
    accumulated = 0
    for first, second in zip(prob1, prob2):
        accumulated = accumulated + first * second
    return accumulated


# Parse the sample page; this module-level tree is what the code below operates on.
tree = lxml.html.fromstring(sample)

class ContentElement():
    """A leaf piece of content (e.g. a text node or an ``@href``) attached to an element.

    The object offers the tree-navigation methods that are called on real
    elements (``getparent``, ``iterancestors``, ``iterchildren``) and stores,
    per field, one probability per template.
    """

    def __init__(self, selector, parent, content, n_fields, n_templates):
        super().__init__()
        self.selector = selector
        self.parent = parent
        self.content = content
        self.n_fields = n_fields
        self.n_templates = n_templates

        # keyed by field; each value is a list holding one probability per template index
        self.template_field_prob = defaultdict(lambda: [0] * n_templates)
        self.template_other_prob = 0.0

    def getparent(self):
        return self.parent

    def iterancestors(self):
        # the owning element first, then that element's own ancestors
        return itertools.chain([self.parent], self.parent.iterancestors())

    def iterchildren(self):
        # a content leaf never has children
        return iter(())

    def get_xored_template_field_prob(self, ):
        """
        Collapse self.template_field_prob along the fields: for every template, return
        the probability that this content is exactly one of the fields (XOR).
        """

        def xor_values(probs):
            count = len(probs)
            tiled = np.tile(np.array(probs)[None, :], [count, 1])
            # row i holds probs[i] on the diagonal and (1 - probs[j]) everywhere else
            m = (1 - tiled) + 2 * np.eye(count) * (tiled - .5)
            return np.sum(np.prod(m, 1))

        # for each template index, the probabilities of all fields seen so far
        per_template = [[] for _ in range(self.n_templates)]
        for field_probs in self.template_field_prob.values():
            for ti, prob in enumerate(field_probs):
                per_template[ti].append(prob)

        return [xor_values(probs) for probs in per_template]

    def update_template_other_prob(self):
        """Set template_other_prob to 1 minus the sum of all stored field/template probabilities."""
        claimed = 0
        for prob in itertools.chain.from_iterable(self.template_field_prob.values()):
            claimed += prob

        self.template_other_prob = 1 - claimed

    def __hash__(self):
        return hash((self.parent, self.selector))

    def __repr__(self):
        return f"<{self.parent.tag} {self.selector}={self.content}>"


# we first want to find all of the instances of the values.
class PathInductor():
    def __init__(self, tree, templates):
        self.tree = tree
        self.templates = templates
        self.parsed_templates = [self.parse_template(t) for t in templates]
        self.fields = set().union(*[list(tp.keys()) for tp in self.parsed_templates])
        self.n_fields = len(self.fields)
        self.n_templates = len(self.templates)

        self.field_prob = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))  # by (field, level)  # TODO competition between items, such that a field knows to belong somewhere

        self.itemness_prob = defaultdict(float) # by (element) -- this is computed by the model
        self.item_field_level_prob = defaultdict(float)  # by (field, level) -- this is after checking for presence of children

        self.template_prob = defaultdict(lambda: [1] * (self.n_templates)) # by (element)
        self.template_other_prob = defaultdict(lambda: float) # by (element)
        self.template_field_prob = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        self.field_selectors = defaultdict(lambda: defaultdict(float)) # by (field, level)
        self.item_selectors = []

        self.content_elements_by_parent = defaultdict(set)
        self.content_elements_by_content = defaultdict(set)
        self.template_fields_by_value = defaultdict(set)

    def parse_template(self, template):
        return {l.split(": ", 1)[0]: l.split(": ", 1)[1] for l in template.split("\n")}

    @lru_cache(None)  # elements (e) are hashable!
    def get_element_selector_options(self, e, allow_nonunique_classes=False):
        """Return the ``(tag, strategy)`` options that could select ``e`` in one xpath step.

        For a ContentElement the only option is ``(None, e.selector)``. For a regular element
        with same-tag siblings the options are, in this order:
          * a frozenset of classes that none of those siblings carries,
          * (only if allow_nonunique_classes) a frozenset of classes shared with at least one sibling,
          * the 1-based position of ``e`` among its same-tag siblings.
        An element without same-tag siblings gets the single option ``(tag, "")``.

        Results are memoized per (self, e, allow_nonunique_classes).
        """
        if isinstance(e, ContentElement):
            return [(None, e.selector)]

        # only siblings with the same tag compete with this element in an xpath step
        preceding = list(e.itersiblings(e.tag, preceding=True))
        siblings = list(e.itersiblings(e.tag, preceding=False)) + preceding

        selectors = []
        if siblings:
            e_classes = set(e.attrib.get("class", "").split())
            some_siblings_classes = set()
            for s in siblings:
                # update() with the list adds whole class names (not their characters)
                some_siblings_classes.update(s.attrib.get("class", "").split())

            # does this one have a class that distinguishes it from the others?
            unique_identifying_classes = frozenset(e_classes - some_siblings_classes)
            if unique_identifying_classes:
                selectors.append(unique_identifying_classes)

            if allow_nonunique_classes:
                nonunique_identifying_classes = frozenset(e_classes & some_siblings_classes)
                if nonunique_identifying_classes:
                    selectors.append(nonunique_identifying_classes)

            # positional fallback: xpath indices are 1-based and count same-tag siblings only
            selectors.append(len(preceding) + 1)

        # when no particular strategies have been identified, just use the bare tag
        return [(e.tag, s) for s in (selectors or [""])]

    def get_xpath_part(self, selector):
        (tag, strategy) = selector
        if tag is None:
            return strategy
        elif isinstance(strategy, set) or isinstance(strategy, frozenset): # classes
            return tag + "[@class and (" + " or ".join([f"contains(concat(' ', normalize-space(@class), ' '), ' {c} ')" for c in list(strategy)]) + ")]"
        elif isinstance(strategy, int): # index
            return tag + "[" + str(strategy) + "]"
        elif strategy == "": # no strategies, just use the tag itself
            return tag

    def compile_xpath(self, selectors, global_path=False):
        return ("//" if global_path else "./") + "/".join(self.get_xpath_part(s) for s in selectors[::-1])

    def find_candidate_content_elements(self, value):
        """Create a ContentElement for every text node or ``@href`` in the tree equal to ``value``.

        Each new leaf is also registered in ``content_elements_by_content`` (under the value)
        and in ``content_elements_by_parent`` (under its owning element). The leaves are returned
        in document order; within one element, text matches come before href matches.
        """
        found = []

        def register(owner, selector):
            leaf = ContentElement(selector, owner, value, self.n_fields, self.n_templates)
            found.append(leaf)
            self.content_elements_by_content[value].add(leaf)
            self.content_elements_by_parent[owner].add(leaf)

        for e in self.tree.iter():
            for position, text in enumerate(e.xpath("text()"), start=1):
                if text == value:
                    # the first text node is addressed without an index, later ones as text()[n]
                    register(e, "text()" if position == 1 else f"text()[{position}]")

            for href in e.xpath("@href"):
                if href == value:
                    register(e, "@href")

        return found

    def initialize_leaf_probs(self):
        for ti, pt in enumerate(self.parsed_templates):
            for field, value in pt.items():
                candidate_content_elements = self.find_candidate_content_elements(value)
                template_prob = 1. / len(candidate_content_elements)
                for ce in candidate_content_elements:
                    self.field_prob[field][ce][0] = 1.0
                    ce.template_field_prob[field][ti] = template_prob
                    self.template_fields_by_value[value].add((ti, field))

        for ce in set.union(*self.content_elements_by_parent.values()):
            ce.update_template_other_prob()

    def _update_template_prob(self, element):
        """
        Recursively compute, bottom-up, the per-template probabilities of ``element``.

        The probability of a parent being e.g. template 1 is the probability that:
        - all children are *either* template1 fields, or other, AND
        - all children are *not* just other fields.

        Furthermore, we assume that it can never happen that e.g. a template1 child and a template2 child are under the same parent.
        Because those cases aren't captured in our calculation at all, the resulting probabilities won't add to 1, and we have to explicitly renormalize them.

        In this case, we look through all children, and ask all children to report their template probs to us.

        Side effects: stores the result in ``self.template_prob[element]`` and the normalized
        "other" probability in ``self.template_other_prob[element]``; prints a debug line for
        every ContentElement child.

        Returns the list of per-template probabilities, or False when no child reported any.
        """
        template_prob = None
        template_other_prob = 1
        # children are the element's real child nodes plus the content leaves registered under it
        for child in itertools.chain(element.iterchildren(), self.content_elements_by_parent[element]):  # if this is a regular element with children
            child_template_prob = None
            if isinstance(child, lxml.etree.ElementBase):
                # regular element: recurse first, then read the "other" probability the recursion stored
                child_template_prob = self._update_template_prob(child)
                child_other_prob = self.template_other_prob[child]

            elif isinstance(child, ContentElement):
                # leaf: probability of being exactly one field, per template
                child_template_prob = child.get_xored_template_field_prob()
                child_other_prob = child.template_other_prob
                print("child template prob", child, child_template_prob)

            # children of any other type, or without a result, leave child_template_prob falsy and are skipped
            if child_template_prob:
                if template_prob is None:
                    template_prob = [1] * self.n_templates
                for ti, l in enumerate(child_template_prob):
                    # the child is either a field of template ti or "other"
                    template_prob[ti] *= (l + child_other_prob)
                # probability that every child seen so far is "other"
                template_other_prob *= child_other_prob

        if template_prob is not None:
            # subtract template_other_prob from everything
            for ti, l in enumerate(template_prob):
                template_prob[ti] -= template_other_prob

            # then renormalize
            total = sum(template_prob) + template_other_prob
            for ti, l in enumerate(template_prob):
                if total > 0:
                    template_prob[ti] /= total

            self.template_prob[element] = template_prob
            self.template_other_prob[element] = (template_other_prob / total) if total > 0 else 0.0
            return template_prob
        else:
            return False

    def _update_field_prob(self, element, field):
        """
        Recursively compute, for ``element``, a mapping ``level -> probability`` for ``field``,
        where level is the number of steps between the element and a leaf below it
        (leaves themselves are stored at level 0).

        Each child's level probabilities are shifted up by one level and weighted by ``match``,
        the dot product of the child's and this element's per-template probabilities.

        Side effect: a non-empty result is stored in ``self.field_prob[field][element]``.
        Returns the mapping, or False when nothing below the element carries the field.
        """
        element_field_prob = defaultdict(float)
        # children are the element's real child nodes plus the content leaves registered under it
        for child in itertools.chain(element.iterchildren(), self.content_elements_by_parent[element]):
            child_field_probs = None
            match = 0
            if child in self.field_prob[field]:
                child_field_probs = self.field_prob[field][child]  # these are directly from below, e.g. [0]: 1
                if isinstance(child, lxml.etree.ElementBase):
                    match = sum(c * e for c, e in zip(self.template_prob[child], self.template_prob[element]))  # simple dot product
                elif isinstance(child, ContentElement):
                    # leaves keep their own per-field template probabilities
                    match = sum(c * e for c, e in zip(child.template_field_prob[field], self.template_prob[element]))
            else:
                # or it doesn't, then we want to go one level below
                child_field_probs = self._update_field_prob(child, field)
                match = sum(c * e for c, e in zip(self.template_prob[child], self.template_prob[element]))  # simple dot product

            # a False/empty result from below contributes nothing
            if child_field_probs:
                for level, level_prob in child_field_probs.items():
                    element_field_prob[level + 1] += (level_prob * match)

        if element_field_prob:
            self.field_prob[field][element] = element_field_prob
            return element_field_prob
        else:
            return False

    def propagate_coverage_up(self):
        self._update_template_prob(tree)
        for field in self.fields:
            self._update_field_prob(tree, field)

        print("template prob")
        pprint(self.template_prob)
        print("field prob")
        pprint(self.field_prob)

    def distribute_belonging_down(self):
        # first, we have the parent element's new field_probs, and also its template_probs. These together
        # form a multiplicative table.
        # first, we go down once and copy down all of the parent nodes' template_probs, unless they're all zero.

        def set_parent_template_prob_recursively(element):
            for child in element.iterchildren():
                if isinstance(child, lxml.etree.ElementBase):
                    if sum(self.template_prob[element]):
                        self.template_prob[child] = self.template_prob[element][:] # deep copy just in case
                    set_parent_template_prob_recursively(child)

        set_parent_template_prob_recursively(tree)

        # then, we go down once more and compute a new template_field_probs table for each leaf parent, to be normalized later.
        parent_template_field_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        def set_parent_template_field_probs_recursively(element):
            for child in element.iterchildren():
                if isinstance(child, lxml.etree.ElementBase):
                    for field, elementlevelprobs in self.field_prob.items():
                        for element, levelprobs in elementlevelprobs.items():
                            if 0 in levelprobs:
                                # this could include fields that some selector thought were a good idea, but which don't actually match the example's value
                                for ti, template_prob in enumerate(self.template_prob[element]):
                                    parent_template_field_probs[element][field] = levelprobs[0] * np.array(self.template_prob[element])
                    set_parent_template_field_probs_recursively(child)

        set_parent_template_field_probs_recursively(tree)

        # then, we go down once more and, for each one of the competing children, we collect all of the parents and normalize them
        totals = defaultdict(lambda: defaultdict(lambda: np.array([0] * self.n_templates)))
        for value, children in self.content_elements_by_content.items():
            for child in children:
                ptfb = parent_template_field_probs[child.getparent()]
                for field, template_probs in ptfb.items():
                    for ti, tprob in enumerate(template_probs):
                        if (ti, field) in self.template_fields_by_value[child.content]:
                            totals[value][field] += template_probs

        # then, go through the children and normalize their respective parent's beliefs
        child_totals = defaultdict(float)
        for value, children in self.content_elements_by_content.items():
            for child in children:
                ptfb = parent_template_field_probs[child.getparent()]
                for field, template_probs in ptfb.items():
                    self.template_field_prob[child][field] = ptfb[field] / totals[value][field]
                    child_totals[child] += np.sum(ptfb[field] / totals[value][field])
                self.template_other_prob[child] = 1 - child_totals[child]



    def update_item_model(self, full_items_only=False):
        """
        Go through all of the elements for which we have field_prob
        *or* through all which qualify as items, and count the occurence (or absence) of covered fields.

        TODO if not full_items_only, consider missing fields too
        """
        element_has_all_fields = defaultdict(lambda: False)
        if full_items_only:
            element_has_fields = defaultdict(set)
            for field, elementlevelprobs in self.field_prob.items():
                for element, levelprobs in elementlevelprobs.items():
                    for level, prob in levelprobs.items():
                        element_has_fields[element].add(field)
                    for element, fields in element_has_fields.items():
                        element_has_all_fields[element] = len(fields) == self.n_fields

        field_level_combos_found = defaultdict(int)
        levels_found_by_field = defaultdict(int)
        for field, elementlevelprobs in self.field_prob.items():
            for element, levelprobs in elementlevelprobs.items():
                for level, prob in levelprobs.items():
                    if (not full_items_only) or element_has_all_fields[element]:
                        field_level_combos_found[field, level] += prob
                        levels_found_by_field[field] += prob

        # an item: for each field, has a certain distribution what the levels are:
        for (field, level), combos_found in field_level_combos_found.items():
            self.item_field_level_prob[field, level] = (combos_found / levels_found_by_field[field]) if levels_found_by_field[field] else 0.0

        # when an element has a high probability of being a parent of multiple fields, its itemness goes up.
        # however, we then infer the selectors for items, and next time, only the items with high selector prob become items.
        # TODO also generate what kinds of items these things are
        element_itemness_estimate = defaultdict(lambda: defaultdict(float))

        for field, elementlevelprobs in self.field_prob.items():
            for element, levelprobs in elementlevelprobs.items():
                for level, prob in levelprobs.items():
                    if (not full_items_only) or element_has_all_fields[element]:
                        element_itemness_estimate[element][field, level] += prob * self.item_field_level_prob[field, level]

        # clear out previous itemness_probs, then fill them in if the item_prob is large enough.
        self.itemness_prob = defaultdict(float)
        for element, fieldprobs in element_itemness_estimate.items():
            element_item_prob = sum(fieldprobs.values()) / len(self.fields)  # itemprob is the average prob over the fields.
            if element_item_prob > 0.01:
                self.itemness_prob[element] = element_item_prob
            else:
                self.itemness_prob.pop(element, None)


    def generate_field_selectors(self):
        """Rebuild ``self.field_selectors`` from ``self.field_prob``.

        ``self.field_selectors`` maps ``(field, level)`` to a dict of
        ``selector option -> accumulated probability``. For every element that
        has a level-0 entry in ``self.field_prob[field]``, the element itself
        (level 0) and each of its ancestors (level = position in
        ``iterancestors()``, starting at 1) contribute
        ``self.field_prob[field][element][level]`` to every selector option
        returned by ``self.get_element_selector_options(element)``.
        Options whose accumulated probability ends up below 0.01 are removed.
        """
        # Start from scratch on every call; keys are (field, level) tuples.
        self.field_selectors = defaultdict(lambda: defaultdict(float))

        for field in self.fields:
            # Collect (element, level) pairs in a set so that an ancestor shared
            # by several leaves is only counted once per level.
            pending = set()
            for leaf, probs_by_level in self.field_prob[field].items():
                # Only elements seen at level 0 (i.e. leaves) seed the walk upwards.
                if 0 in probs_by_level:
                    pending.add((leaf, 0))
                    pending.update(
                        (ancestor, distance)
                        for distance, ancestor in enumerate(leaf.iterancestors(), 1)
                    )

            for node, distance in pending:
                candidates = self.get_element_selector_options(node)
                weight = self.field_prob[field][node][distance]
                for candidate in candidates:
                    self.field_selectors[field, distance][candidate] += weight

        # Drop selector options that gathered hardly any probability mass.
        for scored_options in self.field_selectors.values():
            too_weak = [option for option, score in scored_options.items() if score < 0.01]
            for option in too_weak:
                del scored_options[option]

        # Known weakness: for e.g. (field: URL, generation: 2) the correct selector is
        # /div[@class='details']/, but because the URL is just a span, a lot of unrelated
        # "grandparents of spans" show up as well.
        # TODO clean up and prune the list of selectors for each kind of element;
        #      for each level, look at the probabilities and only keep the top 10 selectors.
        # TODO especially if there are highly common intersections of classes, find the most likely set of classes.
        # Debugging aid:
        #print("field selectors are:")
        #pprint(dict(self.field_selectors))


    def generate_item_selectors(self):
        """Derive one item selector from the elements in ``self.itemness_prob``.

        Candidates are grouped by tag; within each tag only the sibling group
        (elements sharing the same ``getparent()``) with the highest summed
        itemness is kept. The tag whose kept group has the highest summed
        itemness wins, and its elements become the "best items".

        The resulting selector is a tuple of the first field of the first
        item's first selector option and the intersection of the second fields
        of every best item's first selector option (presumably tag and class
        set -- confirm against ``get_element_selector_options``).

        Appends ``(selector, 1, coverage)`` to ``self.item_selectors``, where
        ``coverage`` maps each field to the set of levels at which any best
        item has a ``self.field_prob`` entry above 0.01.

        Does nothing when ``self.itemness_prob`` is empty; previously that case
        raised ``ValueError`` from ``max()`` on an empty sequence.
        """
        # TODO choose items that have enough itemness, pruning some if necessary, and generate selectors for those.
        if not self.itemness_prob:
            # No element qualified as an item, so there is nothing to build a selector from.
            return

        def total_prob(elementprobs):
            # Plain left-to-right accumulation keeps the scores (and thus tie-breaking) stable.
            total = 0.0
            for _, prob in elementprobs:
                total += prob
            return total

        # tag -> parent -> [(element, itemness), ...], in the order elements appear in itemness_prob.
        siblings_by_tag = defaultdict(lambda: defaultdict(list))
        for element, itemness in self.itemness_prob.items():
            siblings_by_tag[element.tag][element.getparent()].append((element, itemness))

        # For every tag keep only the sibling group under the best-scoring parent.
        element_by_tag = {}
        for tag, element_by_parent in siblings_by_tag.items():
            parent_scores = {
                parent: total_prob(elementprobs)
                for parent, elementprobs in element_by_parent.items()
            }
            best_parent = max(parent_scores, key=parent_scores.get)
            element_by_tag[tag] = element_by_parent[best_parent]

        # Pick the tag whose surviving sibling group carries the most itemness.
        tag_scores = {tag: total_prob(elementprobs) for tag, elementprobs in element_by_tag.items()}
        best_tag = max(tag_scores, key=tag_scores.get)
        best_items = [element for element, _ in element_by_tag[best_tag]]

        # now generate some selector that captures all of the items but nothing else
        best_item_selectors = [self.get_element_selector_options(item) for item in best_items]
        # first, try merging the selectors' class sets: keep only what every item's first option shares
        shared_classes = set.intersection(*[set(options[0][1]) for options in best_item_selectors])
        selector = (best_item_selectors[0][0][0], frozenset(shared_classes))

        # Record, per field, at which levels the chosen items plausibly contain that field.
        item_field_level_coverage = defaultdict(set)
        for item in best_items:
            for field, elementprobs in self.field_prob.items():
                for level, prob in elementprobs[item].items():
                    if prob > 0.01:
                        item_field_level_coverage[field].add(level)
        self.item_selectors.append((selector, 1, item_field_level_coverage))


    def select_and_check_item_candidates(self):
        """Find item candidates via ``self.item_selectors`` and probe their field coverage.

        Every element matched by an item selector is checked, for each field
        and each level listed in that selector's coverage mapping, for a
        descendant chain matching the learned ``self.field_selectors``.
        The resulting ``coverage[field][element][level]`` flags are only
        printed for now; nothing is stored on the instance and nothing is
        returned.

        NOTE(review): the lookup runs against the module-level name ``tree``,
        not an attribute of ``self`` -- presumably it should use the tree the
        inductor was constructed with; confirm.
        NOTE(review): levels 0 and 1 are not handled by the coverage check
        (it yields ``None`` for them), so they never register as covered.
        """

        def contains_field(node, field, level):
            # True if `node` has a descendant chain for `field` that is `level` steps deep.
            if level == 2:
                # Base case: any (level-1, level-0) selector pair matching below `node` is enough.
                # the text() has been found in p's and in spans. could be both.
                leaf_selectors = self.field_selectors[field, 1].keys()
                content_element_selectors = self.field_selectors[field, 0].keys()
                return any(
                    len(node.xpath(self.compile_xpath([content_selector, leaf_selector]))) > 0
                    for leaf_selector, content_selector
                    in itertools.product(leaf_selectors, content_element_selectors)
                )
            if level > 2:
                # Step one level down through every selector known for (field, level - 1),
                # then ask each distinct child the same question one level lower.
                children = {
                    child
                    for selector in self.field_selectors[field, level - 1]
                    for child in node.xpath(self.compile_xpath([selector]))
                }
                # Materialise the list first so that every child is examined, not just the first hit.
                outcomes = [contains_field(child, field, level - 1) for child in children]
                return any(outcomes)
            # Levels below 2 are not handled yet.
            return None

        # TODO update the itemness probs based on the item selectors
        coverage = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        for selector, _prob, levels_by_field in self.item_selectors:
            item_xpath = self.compile_xpath([selector], global_path=True)
            for el in tree.xpath(item_xpath):
                # These matches are most likely not examples we have already discovered.
                # For each (field, level) the item type might contain, check recursively whether
                # such an element is actually present; a hit is recorded as True (coverage will
                # most likely just be "1") and is meant to be normalized later.
                for field in self.fields:
                    for level in levels_by_field[field]:
                        if contains_field(el, field, level):
                            coverage[field][el][level] = True

        # TODO: don't update coverage for items that we already have a prob for
        # TODO: normalize
        # TODO check if close enough to item model, kick out if not
        # TODO update item model accordingly.
        print("coverage", coverage)

    # now we want to run the loop where we continuously update the probabilities
    def approximate_probabilities(self):
        """Run one pass of the induction pipeline, step by step, in a fixed order.

        Each step is a method of this object and is called exactly once;
        nothing is returned.
        """
        # 1. Search for the leaves and propagate the field_probs upwards through the tree.
        self.initialize_leaf_probs()

        # 2. Push coverage up the tree.
        # TODO this is clearly not correct at the moment; shouldn't be using logit addition
        #      but rather something smarter.
        self.propagate_coverage_up()

        # 3. Hand belonging back down to the descendants.
        self.distribute_belonging_down()

        # 4. Build the initial item model, based on full-coverage items only.
        self.update_item_model(full_items_only=True)

        # 5. Derive selectors for the fields from the probabilities gathered so far.
        self.generate_field_selectors()

        # 6. Find item selectors for the top n items.
        self.generate_item_selectors()

        # 7. Select a bunch of other candidates for items, and also their item types
        #    (i.e. likely level combos).
        self.select_and_check_item_candidates()


# Driver cell: build a PathInductor and run a single approximation pass.
# `tree` and `templates` are not defined in this cell -- presumably they were
# created earlier in the notebook; confirm before running this cell on its own.
pi = PathInductor(tree, templates)
pi.approximate_probabilities()


child template prob <a text()=John Doe> [0.20000000000000007, 0.0]
child template prob <a text()=John Doe> [0.20000000000000007, 0.0]
child template prob <a text()=John Doe> [0.20000000000000007, 0.0]
child template prob <a text()=John Doe> [0.20000000000000007, 0.0]
child template prob <h2 text()=John Doe> [0.20000000000000007, 0.0]
child template prob <span text()=The great article> [1.0, 0.0]
child template prob <a text()=Jane Smith> [0.0, 1.0]
child template prob <span text()=A small book> [0.0, 1.0]
template prob
defaultdict(<function PathInductor.__init__.<locals>.<lambda> at 0x7fd854fb38c0>,
            {<a text()=John Doe>: [1, 1],
             <a text()=John Doe>: [1, 1],
             <a text()=John Doe>: [1, 1],
             <a text()=John Doe>: [1, 1],
             <span text()=A small book>: [1, 1],
             <span text()=The great article>: [1, 1],
             <a text()=Jane Smith>: [1, 1],
             <h2 text()=John Doe>: [1, 1],
             <Element p at 0x7fd854d