In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI, RateLimitError
from tqdm import tqdm

# extra deps needed for this beyond ptracker requirements: pip install openpyxl gdown
import backoff
import gdown
import openpyxl
import os
import tempfile

from ptracker.api.models import Action, Promise
from ptracker.core import constants
from ptracker.core.llm_utils import cosine_similarity
from ptracker.core.sources import ActionExtractor, PromiseExtractor
from ptracker.core.sources.source_analyzer import SourceAnalyzer

client = OpenAI(api_key=os.environ["OPENAI_KEY"])

In [17]:
def extract_entities(analyzer: SourceAnalyzer, candidate_name: str, urls: list[str]) -> list[dict]:
    """
    Extract entity jsons *without* inserting them into the database.

    :param candidate_name: the name of the candidate.
    :param urls: list of source urls from which we want to extract entities
    :return: list of entity extracts, such as promise create or action create jsons.
    """
    assert len(analyzer.entity_registry) == 1, f"This extraction function only operates on one entity type at a time, but {len(analyzer.entity_registry)=} were supplied instead."
    entity_type = next(iter(analyzer.entity_registry))
    entity_jsons = analyzer.construct_entity_jsons(candidate_name=candidate_name, urls=urls).get(entity_type, {})
    deduplicator = analyzer.entity_registry[entity_type]
    return deduplicator.deduplicate_entities(entity_jsons=entity_jsons)

In [4]:
def evaluate_extracted_entities(extracts: list[dict], ground_truths: list[list[float]]) -> tuple[float]:
    """
    Given extracted entities and embeddings for the ground truth datasets, calculate eval stats.

    :param extracts: entity creation jsons, like for promises or actions.
    :param ground_truths: ground truth embeddings.
    :return: tuple of stats (recall, precision)
    """
    print(len(extracts))
    print(len(ground_truths))
    N = len(ground_truths)
    matches = 0
    counts = {idx: 0 for idx in range(len(ground_truths))}
    for extract in extracts:
        is_match = False
        for idx, reference in enumerate(ground_truths):
            if cosine_similarity(extract['embedding'], reference) >= constants.DUPLICATE_ENTITY_SIM_THRESHOLD:
                is_match = True
                counts[idx] += 1
        matches += int(is_match)

    recall = sum([(v > 0) for v in counts.values()]) / N
    precision = matches / len(extracts)

    print(f"Recall on reference set: {recall}")
    print(f"Match precision: {precision}")

    return recall, precision

In [5]:
@backoff.on_exception(backoff.expo, RateLimitError)
def get_embedding(text: str):
    return client.embeddings.create(
        input=text,
        model="text-embedding-3-large",
        encoding_format="float",
        dimensions=256,
    ).data[0].embedding

# Promise Extraction

In [6]:
with tempfile.NamedTemporaryFile(delete=False, dir=os.getcwd(), suffix=".xlsx") as fp:
    gdown.download(id="1s08EzhkD5KaWuZaLTuS6KquVYniheulkIBgtPdMUZbU", output=fp.name)
    # df = pd.read_excel(fp.name, sheet_name=None)
    wb = openpyxl.load_workbook(fp.name)
os.unlink(fp.name)

Downloading...
From (original): https://drive.google.com/uc?id=1s08EzhkD5KaWuZaLTuS6KquVYniheulkIBgtPdMUZbU
From (redirected): https://docs.google.com/spreadsheets/d/1s08EzhkD5KaWuZaLTuS6KquVYniheulkIBgtPdMUZbU/export?format=xlsx
To: C:\Users\nickm\develop\promisetracker\experiments\tmpxsxrlgp5.xlsx
216kB [00:00, 8.34MB/s]


In [7]:
sheets = wb.sheetnames
promise_data = []
for sheet in sheets:
    name = ' '.join(sheet.strip().split()[:2])
    for row in wb[sheet].iter_rows(min_row=2):
        link_obj = row[0].hyperlink
        if link_obj is not None:
            promise_data.append((name, link_obj.target, row[1].value))
        else:
            # We ran out of rows; no more sources. Assumes data is one contiguous section.
            break

In [8]:
with ThreadPoolExecutor() as executor:
    ref_embeddings = list(tqdm(executor.map(lambda tup: get_embedding(tup[2]), promise_data), position=0, leave=True))

167it [00:02, 56.28it/s]


In [9]:
promise_data[0]

('Daniel Lurie',
 'https://www.nbcbayarea.com/news/local/san-francisco/mayor-daniel-lurie-inauguration-speech/3754888/',
 'Declare a fentanyl state of emergency on Day 1')

In [10]:
article_mapping = {}
for cname, url, _ in promise_data:
    if cname not in article_mapping:
        article_mapping[cname] = set()
    article_mapping[cname].add(url)

In [18]:
analyzer = SourceAnalyzer()
analyzer.register_entity(entity=Promise, extractor=PromiseExtractor)

all_entities = []
for idx, cname in enumerate(article_mapping):
    cand_predicted_entities = extract_entities(analyzer, cname, list(article_mapping[cname]))
    all_entities.extend(cand_predicted_entities)
evaluate_extracted_entities(cand_predicted_entities, ref_embeddings)

[97m2025-02-22 12:03:02,536 INFO     || (ptracker.core.sources.source_analyzer:39) Received 10 urls for candidate Daniel Lurie. Beginning entity extraction; looping through them now.[0m
[97m2025-02-22 12:03:04,377 INFO     || (ptracker.core.sources.entity_extractor:113) Extracted promise: Ensure families impacted by school closures get first choice of neighborhood schools[0m
[97m2025-02-22 12:03:04,377 INFO     || (ptracker.core.sources.entity_extractor:114) Verbatim extraction honored: True[0m
[97m2025-02-22 12:03:04,377 INFO     || (ptracker.core.sources.entity_extractor:115) Article extract: Ensure families impacted by school closures get first choice of neighborhood schools, enhance math and literacy programs, support teachers, and make sure every city kid who needs a spot in after-school and summer camp programs has one.[0m
[97m2025-02-22 12:03:05,420 INFO     || (ptracker.core.sources.source_analyzer:56) Did not extract any Promise entities from chunk 1 of https://web.ar

55
167
Recall on reference set: 0.1377245508982036
Match precision: 0.4727272727272727


(0.1377245508982036, 0.4727272727272727)

# Action Extraction

In [None]:
# TODO once we have ground truth dataset