### This pipeline scrapes landing pages and PDFs. The files are stored in Cloudflare R2, and the metadata is saved to a Delta table.

**input**: recent crossref records, repo records, and PDF urls from landing page records (or, in rescrape mode, the ids listed in `openalex.taxicab.rescrape_queue`)

**process**: taxicab API on ECS

**output**: file id, url, related ids saved to `openalex.taxicab.taxicab_results`


In [None]:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import datetime
import time
from urllib3.util.retry import Retry
from pyspark.sql import functions as F
from pyspark.sql import types as T
import requests
from requests.adapters import HTTPAdapter
from datetime import timezone

In [0]:
# Taxicab API URL; process_url POSTs one JSON payload per URL to this address.
ENDPOINT = "http://harvester-load-balancer-366186003.us-east-1.elb.amazonaws.com/taxicab"

In [None]:
%sql
-- One row per URL submitted to the Taxicab API. The final cell of this notebook appends to it.
-- Keep the column list in sync with results_schema defined below.
CREATE TABLE IF NOT EXISTS openalex.taxicab.taxicab_results (
  taxicab_id STRING,           -- id field of the API response, NULL when the request failed
  url STRING,                  -- URL that was submitted for scraping
  resolved_url STRING,         -- resolved_url field of the API response
  status_code INT,             -- status_code field of the API response, on failure the HTTP status of the error response or 0
  content_type STRING,         -- content_type field of the API response
  native_id STRING,            -- native_id echoed by the API, or the submitted value on failure
  native_id_namespace STRING,  -- native_id_namespace echoed by the API, or the submitted value on failure
  s3_path STRING,              -- s3_path field of the API response
  is_soft_block BOOLEAN,       -- is_soft_block field of the API response, false when absent or on failure
  created_date TIMESTAMP,      -- set by this notebook to the UTC time the batch finished
  processed_date TIMESTAMP,    -- set by this notebook to the same value as created_date
  error STRING                 -- exception text when the request failed, NULL on success
)
USING DELTA;

In [None]:
%sql
-- Queue of work ids to re-scrape. This notebook reads it when the
-- rescrape_queue_only widget is true.
-- Column DEFAULT values on Delta tables require the allowColumnDefaults table
-- feature, so it is enabled at creation time. Without it a fresh CREATE fails.
CREATE TABLE IF NOT EXISTS openalex.taxicab.rescrape_queue (
  native_id STRING,
  native_id_namespace STRING,
  created_date TIMESTAMP DEFAULT current_timestamp()
)
USING DELTA
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported');

In [None]:
# Spark schema for the rows appended to openalex.taxicab.taxicab_results.
# Every column is nullable; names and order mirror the table definition above.

results_schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("taxicab_id", T.StringType()),
        ("url", T.StringType()),
        ("resolved_url", T.StringType()),
        ("status_code", T.IntegerType()),
        ("content_type", T.StringType()),
        ("native_id", T.StringType()),
        ("native_id_namespace", T.StringType()),
        ("s3_path", T.StringType()),
        ("is_soft_block", T.BooleanType()),
        ("created_date", T.TimestampType()),
        ("processed_date", T.TimestampType()),
        ("error", T.StringType()),
    )
])

In [None]:
# Notebook parameters: the lookback window for new source records, and a switch
# that makes the run process the rescrape queue instead of recent records.
dbutils.widgets.text("lookback_days", "3", "Lookback window (days)")
dbutils.widgets.text("rescrape_queue_only", "false", "Rescrape queue only (true/false)")

# Only the literal string "true" (any case, surrounding whitespace ignored) enables rescrape mode.
rescrape_queue_only = dbutils.widgets.get("rescrape_queue_only").strip().lower() == "true"
lookback_days = int(dbutils.widgets.get("lookback_days"))

if not rescrape_queue_only:
    # Source records created on or after this UTC timestamp are picked up.
    last_processed_date = (
        datetime.datetime.now(timezone.utc)
        - datetime.timedelta(days=lookback_days)
    )
    print(f"Looking back {lookback_days} days from: {last_processed_date}")
else:
    print("RESCRAPE MODE: processing queue table")

In [None]:
# urls to scrape
#
# Builds `jsonUrls`, the list of {"url", "native_id", "native_id_namespace"}
# payloads to submit. In rescrape mode the URLs come from the rescrape queue;
# otherwise they come from source records created inside the lookback window
# whose URL has not been scraped yet. In both modes null URLs and exact
# duplicates are removed BEFORE the url_limit is applied, so they do not use up
# the limit.

dbutils.widgets.text("url_limit", "250000", "Max URLs to process")
url_limit = int(dbutils.widgets.get("url_limit"))

# Columns that identify one unit of scraping work.
url_key_columns = ["native_id", "native_id_namespace", "url"]

if rescrape_queue_only:
    queue_df = spark.read.table("openalex.taxicab.rescrape_queue")
    queue_count = queue_df.count()
    if queue_count == 0:
        dbutils.notebook.exit("Queue empty — nothing to rescrape")

    print(f"Rescrape queue has {queue_count} records")

    # DOIs: construct URL directly
    doi_urls = (
        queue_df.filter(F.col("native_id_namespace") == "doi")
        .withColumn("url", F.concat(F.lit("https://doi.org/"), F.col("native_id")))
        .select(*url_key_columns)
    )

    # Non-DOIs: re-use every distinct URL previously recorded for the id in
    # taxicab_results. The inner join drops ids that have no recorded URL.
    non_doi_urls = (
        queue_df.filter(F.col("native_id_namespace") != "doi")
        .join(
            spark.read.table("openalex.taxicab.taxicab_results")
                .select(*url_key_columns)
                .dropDuplicates(url_key_columns),
            ["native_id", "native_id_namespace"], "inner"
        )
        .select(*url_key_columns)
    )

    all_urls = (
        doi_urls.unionByName(non_doi_urls)
        .filter(F.col("url").isNotNull())
        # the same id may have been queued more than once
        .dropDuplicates(url_key_columns)
        .limit(url_limit)
    )

else:
    # Source 1: Crossref works (first doi.org URL of each record)
    recent_crossref_works = (
        spark.read
        .table("openalex.crossref.crossref_works")
        .filter(F.col("created_date") >= F.lit(last_processed_date))
        .select(
            "native_id",
            "native_id_namespace",
            F.expr("get(filter(urls, x -> x.url like '%doi.org%'), 0).url").alias("url"),
            F.to_timestamp("created_date").alias("source_created_date"),
        )
    )

    # Source 2: Repo works (first three URLs of each record, excluding doi.org links)
    recent_repo_works = (
        spark.read.table("openalex.repo.repo_works")
        .filter(F.col("created_date") >= F.lit(last_processed_date))
        .select(
            "native_id",
            "native_id_namespace",
            F.slice("urls", 1, 3).alias("urls"),
            F.to_timestamp("created_date").alias("source_created_date"),
        )
        .filter(F.col("urls").isNotNull())
        .select("*", F.explode("urls").alias("url_struct"))
        .select(
            "native_id",
            "native_id_namespace",
            "source_created_date",
            F.col("url_struct.url").alias("url")
        )
        .filter(~F.col("url").contains("doi.org"))
    )

    # Source 3: Landing page PDF URLs (first URL whose content_type is 'pdf')
    recent_pdf_works = (
        spark.read
        .table("openalex.landing_page.landing_page_works")
        .filter(F.col("created_date") >= F.lit(last_processed_date))
        .select(
            "ids",
            "native_id",
            "native_id_namespace",
            F.expr("get(filter(urls, x -> x.content_type = 'pdf'), 0).url").alias("url"),
            F.to_timestamp("created_date").alias("source_created_date"),
        )
        .withColumn("pmh_id", F.expr("get(filter(ids, x -> x.namespace = 'pmh'), 0).id"))
        .withColumn("doi_id", F.expr("get(filter(ids, x -> x.namespace = 'doi'), 0).id"))
        # Set priority: PMH first, then DOI, then original
        .withColumn("final_native_id",
            F.when(F.col("pmh_id").isNotNull(), F.col("pmh_id"))
            .when(F.col("doi_id").isNotNull(), F.col("doi_id"))
            .otherwise(F.col("native_id")))
        .withColumn("final_namespace",
            F.when(F.col("pmh_id").isNotNull(), F.lit("pmh"))
            .when(F.col("doi_id").isNotNull(), F.lit("doi"))
            .otherwise(F.col("native_id_namespace")))
        .select(
            F.col("final_native_id").alias("native_id"),
            F.col("final_namespace").alias("native_id_namespace"),
            "url",
            "source_created_date",
        )
        .filter(F.col("url").isNotNull())
    )

    # URLs that already have a row in the results table are skipped.
    taxicab_results = spark.table("openalex.taxicab.taxicab_results").select("url")

    # Union all sources, clean native_id, drop null and already-scraped URLs,
    # dedup, order newest first, then drop the ordering column.
    all_urls = (
        recent_crossref_works
        .unionByName(recent_repo_works)
        .unionByName(recent_pdf_works)
        .withColumn("native_id", F.regexp_replace("native_id", "^https://doi\\.org/", ""))
        # crossref rows without a doi.org URL carry a null url
        .filter(F.col("url").isNotNull())
        .join(taxicab_results, ["url"], "left_anti")
        # dedup on (id, namespace, url), keeping the newest date for ordering
        .groupBy(*url_key_columns)
        .agg(F.max("source_created_date").alias("source_created_date"))
        .orderBy(F.col("source_created_date").desc())
        .limit(url_limit)
        .drop("source_created_date")
    )

# Collect to the driver and convert to the payload dicts process_url expects.
all_urls_pd = all_urls.toPandas()

jsonUrls = [
    {
        "url": row["url"],
        "native_id": row.get("native_id", ""),
        "native_id_namespace": row.get("native_id_namespace", "")
    }
    for row in all_urls_pd.to_dict('records')
    if row["url"] is not None
]

# Summary counts. Each URL is counted in exactly one bucket (DOI takes
# precedence over PDF) so the three numbers add up to the total.
total_urls = len(jsonUrls)
doi_urls_count = sum(1 for url in jsonUrls if 'doi.org' in url['url'].lower())
pdf_urls = sum(
    1 for url in jsonUrls
    if '.pdf' in url['url'].lower() and 'doi.org' not in url['url'].lower()
)
other_urls = total_urls - pdf_urls - doi_urls_count

print(f"Harvesting {total_urls} URLs ({pdf_urls} PDFs, {doi_urls_count} DOIs, {other_urls} other URLs)")

In [None]:
# process single url

def process_url(url_data, session, timeout=(10, 300)):
    """
    Submit one URL to the Taxicab API for scraping.

    Args:
        url_data: dict with a "url" key and optional "native_id" /
            "native_id_namespace" keys (both default to "").
        session: requests.Session used for the POST, shared between calls
            for connection pooling.
        timeout: value passed to requests as ``timeout`` - either a number of
            seconds or a ``(connect, read)`` tuple. Without a timeout a
            stalled connection would block the calling worker thread forever.

    Returns:
        A dict with the keys taxicab_id, url, status_code, resolved_url,
        content_type, native_id, native_id_namespace, s3_path, is_soft_block
        and error. On success the values come from the API's JSON response and
        error is None. On any failure the response-derived fields are None,
        status_code is the HTTP status of the failed response (0 if there was
        none) and error holds the exception text. The function never raises
        for request failures; they are reported through the returned dict.
    """
    url = url_data.get("url")
    native_id = url_data.get("native_id", "")
    native_id_namespace = url_data.get("native_id_namespace", "")

    try:
        payload = {
            "url": url,
            "native_id": native_id,
            "native_id_namespace": native_id_namespace,
        }

        response = session.post(ENDPOINT, json=payload, timeout=timeout)
        response.raise_for_status()
        response_data = response.json()
        print(f"OK {url} -> {response_data.get('id')}")

        return {
            "taxicab_id": response_data.get("id"),
            "url": url,
            "status_code": response_data.get("status_code"),
            "resolved_url": response_data.get("resolved_url"),
            "content_type": response_data.get("content_type"),
            "native_id": response_data.get("native_id"),
            "native_id_namespace": response_data.get("native_id_namespace"),
            "s3_path": response_data.get("s3_path"),
            "is_soft_block": response_data.get("is_soft_block", False),
            "error": None,
        }

    except Exception as e:
        # Deliberately broad: a single bad URL must not abort the batch, so
        # every failure (HTTP error, timeout, malformed JSON, ...) is turned
        # into an error row instead of propagating.
        print(f"ERR {url} -> {e}")
        return {
            "taxicab_id": None,
            "url": url,
            # HTTPError carries the failed response; other exceptions do not.
            "status_code": getattr(getattr(e, 'response', None), 'status_code', 0),
            "resolved_url": None,
            "content_type": None,
            "native_id": native_id,
            "native_id_namespace": native_id_namespace,
            "s3_path": None,
            "is_soft_block": False,
            "error": str(e),
        }

In [None]:
# run all urls in a threadpool

def process_urls_with_threadpool(url_list, max_workers):
    """
    Submit every URL in url_list to the Taxicab API using a thread pool.

    Args:
        url_list: list of payload dicts accepted by process_url.
        max_workers: number of worker threads; also used as the size of the
            HTTP connection pool so each thread can hold its own connection.

    Returns:
        List with one process_url result dict per input URL, in completion
        order (not input order).
    """
    # Retry up to 3 times with exponential backoff on 5xx responses, for both
    # GET and POST requests.
    retries = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
        pool_connections=max_workers,
        pool_maxsize=max_workers,
        max_retries=retries
    )

    print(f"Starting ThreadPool with {max_workers} workers to process {len(url_list)} URLs")
    start_time = time.time()
    results = []

    # The session is a context manager so its pooled connections are closed
    # when the batch finishes, even if something raises.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_url, url_data, session)
                for url_data in url_list
            ]

            for count, future in enumerate(concurrent.futures.as_completed(futures), 1):
                results.append(future.result())
                if count % 1000 == 0:
                    print(f"Processed {count}/{len(futures)}")

    elapsed_time = time.time() - start_time
    print(f"ThreadPool processing completed in {elapsed_time:.2f} seconds")
    print(f"Processed {len(results)} URLs")

    return results

In [None]:
# run it all
results = process_urls_with_threadpool(jsonUrls, max_workers=120)

# Every row of this batch gets the same UTC timestamp for both date columns.
now = datetime.datetime.now(timezone.utc)

for result in results:
    result["created_date"] = now
    result["processed_date"] = now

# create DataFrame directly from results and save to table
results_df = spark.createDataFrame(results, schema=results_schema)
results_df.write.mode("append").format("delta").saveAsTable("openalex.taxicab.taxicab_results")

# len(results) equals the number of rows written and avoids an extra Spark job.
print(f"Updated {len(results)} records in the results table")

if rescrape_queue_only:
    # Remove only the ids that were actually submitted. Truncating the whole
    # table would also discard entries that were cut off by url_limit or
    # enqueued while this run was in progress.
    # NOTE(review): ids that never resolve to a URL now stay in the queue;
    # purge them separately if that is not wanted.
    submitted_ids = sorted({
        (entry["native_id"], entry["native_id_namespace"])
        for entry in jsonUrls
        if isinstance(entry["native_id"], str)
        and isinstance(entry["native_id_namespace"], str)
    })

    if submitted_ids:
        spark.createDataFrame(
            submitted_ids, "native_id string, native_id_namespace string"
        ).createOrReplaceTempView("submitted_rescrape_ids")

        spark.sql("""
            MERGE INTO openalex.taxicab.rescrape_queue AS q
            USING submitted_rescrape_ids AS s
            ON q.native_id = s.native_id
               AND q.native_id_namespace = s.native_id_namespace
            WHEN MATCHED THEN DELETE
        """)

    print(f"Removed {len(submitted_ids)} submitted ids from the rescrape queue")