### This pipeline scrapes landing pages and PDFs. The files are stored in Cloudflare R2, and the metadata is saved in the `openalex.taxicab.taxicab_results` Delta table

**input**: recent Crossref records, repo records, and PDF URLs from landing page records

**process**: taxicab API on ECS

**output**: file id, url, related ids saved to `openalex.taxicab.taxicab_results`


In [None]:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import datetime
import time
from urllib3.util.retry import Retry
from pyspark.sql import functions as F
from pyspark.sql import types as T
import requests
from requests.adapters import HTTPAdapter
from datetime import timezone

In [0]:
# URL of the Taxicab scraping API; process_url() POSTs one JSON payload per URL to it.
ENDPOINT = "http://harvester-load-balancer-366186003.us-east-1.elb.amazonaws.com/taxicab"

In [0]:
%sql
-- Results table for the Taxicab harvest: this notebook appends one row per submitted URL.
CREATE TABLE IF NOT EXISTS openalex.taxicab.taxicab_results (
  taxicab_id STRING,            -- `id` field of the Taxicab response; NULL when the request failed
  url STRING,                   -- URL that was submitted for scraping
  resolved_url STRING,          -- `resolved_url` field of the Taxicab response
  status_code INT,              -- `status_code` from the response; on failure the HTTP status of the API call, or 0
  content_type STRING,          -- `content_type` field of the Taxicab response
  native_id STRING,             -- identifier of the source record (DOI prefix stripped before submission)
  native_id_namespace STRING,   -- namespace of native_id (e.g. 'pmh' or 'doi' for landing-page PDFs)
  s3_path STRING,               -- `s3_path` field of the Taxicab response
  is_soft_block BOOLEAN,        -- `is_soft_block` field of the response; false when absent or on failure
  created_date TIMESTAMP,       -- created_date carried over from the source record
  processed_date TIMESTAMP,     -- UTC time stamped on the whole batch after harvesting finished
  error STRING                  -- exception text when the request failed, otherwise NULL
)
USING DELTA;

In [None]:
# Notebook widget controlling how far back (in days) source records are considered.
dbutils.widgets.text("lookback_days", "3", "Lookback window (days)")
lookback_days = int(dbutils.widgets.get("lookback_days"))

# Source tables are filtered on this column against the UTC cutoff computed below.
last_processed_field = "created_date"
_lookback_window = datetime.timedelta(days=lookback_days)
last_processed_date = datetime.datetime.now(timezone.utc) - _lookback_window
print(f"Using last processed date ({lookback_days} days ago): {last_processed_date}")

In [None]:
# urls to scrape

# Widget-configurable cap on the number of URLs taken from the combined sources in one run.
dbutils.widgets.text("url_limit", "250000", "Max URLs to process")
_url_limit_raw = dbutils.widgets.get("url_limit")
url_limit = int(_url_limit_raw)

def normalize_created_date(df):
    """Return ``df`` with ``created_date`` replaced by ``F.to_timestamp`` of itself."""
    created_ts = F.to_timestamp("created_date")
    return df.withColumn("created_date", created_ts)

# Source 1: Crossref works
# Each record contributes its first URL containing "doi.org". Records without
# such a URL yield a NULL url; they are dropped here (as the other two sources
# already do) because NULLs never match the left-anti join against existing
# results and would otherwise use up slots in the later `limit(url_limit)`.
recent_crossref_works = normalize_created_date(
    spark.read
    .table("openalex.crossref.crossref_works")
    .filter(F.col(last_processed_field) >= F.lit(last_processed_date))
    .select(
        "native_id",
        "native_id_namespace",
        F.expr("get(filter(urls, x -> x.url like '%doi.org%'), 0).url").alias("url"),
        "created_date"
    )
    .filter(F.col("url").isNotNull())
)

# Source 2: Repo works
# Recent repo records, trimmed to their first three URLs.
_repo_recent = (
    spark.read.table("openalex.repo.repo_works")
    .where(F.col(last_processed_field) >= F.lit(last_processed_date))
    .select(
        "native_id",
        "native_id_namespace",
        F.slice("urls", 1, 3).alias("urls"),
        "created_date",
    )
    .where(F.col("urls").isNotNull())
)

# One output row per remaining URL; URLs containing "doi.org" are excluded.
_repo_urls = (
    _repo_recent
    .withColumn("url_struct", F.explode("urls"))
    .select(
        "native_id",
        "native_id_namespace",
        "created_date",
        F.col("url_struct.url").alias("url"),
    )
    .where(~F.col("url").contains("doi.org"))
)

recent_repo_works = normalize_created_date(_repo_urls)

# Source 3: Landing page PDF URLs
# Recent landing-page records with their first PDF URL and the first PMH / DOI id (if any).
_landing_pages = (
    spark.read
    .table("openalex.landing_page.landing_page_works")
    .where(F.col(last_processed_field) >= F.lit(last_processed_date))
    .select(
        "ids",
        "native_id",
        "native_id_namespace",
        F.expr("get(filter(urls, x -> x.content_type = 'pdf'), 0).url").alias("url"),
        "created_date",
    )
    .withColumn("pmh_id", F.expr("get(filter(ids, x -> x.namespace = 'pmh'), 0).id"))
    .withColumn("doi_id", F.expr("get(filter(ids, x -> x.namespace = 'doi'), 0).id"))
)

_has_pmh = F.col("pmh_id").isNotNull()
_has_doi = F.col("doi_id").isNotNull()

# Identifier priority: PMH id, then DOI id, then the record's own native id.
_preferred_id = (
    F.when(_has_pmh, F.col("pmh_id"))
    .when(_has_doi, F.col("doi_id"))
    .otherwise(F.col("native_id"))
)
_preferred_namespace = (
    F.when(_has_pmh, F.lit("pmh"))
    .when(_has_doi, F.lit("doi"))
    .otherwise(F.col("native_id_namespace"))
)

recent_pdf_works = normalize_created_date(
    _landing_pages
    .withColumn("final_native_id", _preferred_id)
    .withColumn("final_namespace", _preferred_namespace)
    .select(
        F.col("final_native_id").alias("native_id"),
        F.col("final_namespace").alias("native_id_namespace"),
        "url",
        "created_date",
    )
    .where(F.col("url").isNotNull())
)

# URLs already present in the results table; used to skip work done by earlier runs.
taxicab_results = spark.table("openalex.taxicab.taxicab_results").select("url")

# Combine the three sources, drop URLs that already have a result row,
# then keep the `url_limit` most recently created entries.
all_urls = (
    recent_crossref_works
    .unionByName(recent_repo_works)
    .unionByName(recent_pdf_works)
    .join(taxicab_results, on=["url"], how="left_anti")
    .sort(F.col("created_date").desc())
    .limit(url_limit)
)

# Keep created_date only when its year lies in 1900-2100; anything else becomes NULL
# so the Arrow/pandas conversion below does not fail on extreme dates (valid range ~1677-2262).
_created_year = F.year(F.col("created_date"))
_safe_created_date = F.when(
    _created_year.between(1900, 2100),
    F.col("created_date"),
).otherwise(F.lit(None).cast("timestamp"))

all_urls = all_urls.withColumn("created_date", _safe_created_date)

all_urls_pd = all_urls.toPandas()

import pandas as pd  # already required by toPandas(); needed here for NaT handling


def _to_py_datetime(value):
    """Return a plain ``datetime`` (or ``None``) for a pandas timestamp cell.

    pandas represents missing timestamps as ``NaT`` and present ones as
    ``pd.Timestamp``. The harvested rows are later written back through a
    Spark ``TimestampType`` column, so hand over standard ``datetime`` /
    ``None`` values instead of pandas-specific objects.
    """
    if value is None or pd.isna(value):
        return None
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    return value


# One plain dict per URL to harvest; rows without a URL are skipped.
jsonUrls = [
    {
        "url": row["url"],
        "created_date": _to_py_datetime(row["created_date"]),
        "native_id": row.get("native_id", ""),
        "native_id_namespace": row.get("native_id_namespace", "")
    }
    for row in all_urls_pd.to_dict('records')
    if row["url"] is not None
]

# Summary counts for the log line. "other" is counted directly rather than by
# subtraction, so a URL containing both ".pdf" and "doi.org" is not subtracted twice.
total_urls = len(jsonUrls)
_lowered_urls = [entry["url"].lower() for entry in jsonUrls]
pdf_urls = sum('.pdf' in u for u in _lowered_urls)
doi_urls = sum('doi.org' in u for u in _lowered_urls)
other_urls = sum('.pdf' not in u and 'doi.org' not in u for u in _lowered_urls)

print(f"Harvesting {total_urls} URLs ({pdf_urls} PDFs, {doi_urls} DOIs, {other_urls} other URLs)")

In [None]:
# result schema

# Column name / Spark type pairs, in the same order as the results table; every column is nullable.
_RESULT_COLUMNS = [
    ("taxicab_id", T.StringType()),
    ("url", T.StringType()),
    ("resolved_url", T.StringType()),
    ("status_code", T.IntegerType()),
    ("content_type", T.StringType()),
    ("native_id", T.StringType()),
    ("native_id_namespace", T.StringType()),
    ("s3_path", T.StringType()),
    ("is_soft_block", T.BooleanType()),
    ("created_date", T.TimestampType()),
    ("processed_date", T.TimestampType()),
    ("error", T.StringType()),
]

results_schema = T.StructType(
    [T.StructField(column, dtype, True) for column, dtype in _RESULT_COLUMNS]
)

In [None]:
# process single url

def process_url(url_data, session, timeout=(10, 300)):
    """
    Submit a URL to the Taxicab API for scraping.
    Uses the provided requests.Session for connection pooling.

    Args:
        url_data: dict with "url", "created_date" and optionally "native_id"
            and "native_id_namespace".
        session: requests.Session (or compatible object) used for the POST.
        timeout: value passed to ``session.post(timeout=...)``; a
            ``(connect, read)`` tuple in seconds. Without a timeout a stalled
            connection would block its worker thread forever. Pass ``None``
            to wait indefinitely.

    Returns:
        A dict holding every results-table column except ``processed_date``.
        On failure ``taxicab_id`` is None, ``error`` holds the exception text
        and ``status_code`` is the HTTP status of the API call (0 when no
        response was received).

    Raises:
        KeyError: if ``url_data`` has no "created_date" entry.
    """
    url = url_data.get("url")
    namespace = url_data.get("native_id_namespace", "")

    # clean native_id in case it contains doi.org
    native_id = url_data.get("native_id", "")
    if isinstance(native_id, str):
        native_id = native_id.replace("https://doi.org/", "")

    try:
        response = session.post(
            ENDPOINT,
            json={
                "url": url,
                "native_id": native_id,
                "native_id_namespace": namespace,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        response_data = response.json()

    # something went wrong; ValueError covers a response body that is not valid JSON
    except (requests.RequestException, ValueError) as e:
        failed_response = getattr(e, "response", None)
        return {
            "taxicab_id": None,
            "url": url,
            "status_code": getattr(failed_response, "status_code", 0),
            "resolved_url": None,
            "content_type": None,
            "created_date": url_data["created_date"],
            "native_id": native_id,
            "native_id_namespace": namespace,
            "s3_path": None,
            "is_soft_block": False,
            "error": str(e),
        }

    return {
        "taxicab_id": response_data.get("id"),
        "url": url,
        "status_code": response_data.get("status_code"),
        "resolved_url": response_data.get("resolved_url"),
        "content_type": response_data.get("content_type"),
        "created_date": url_data["created_date"],
        "native_id": response_data.get("native_id"),
        "native_id_namespace": response_data.get("native_id_namespace"),
        "s3_path": response_data.get("s3_path"),
        "is_soft_block": response_data.get("is_soft_block", False),
        "error": None
    }

In [None]:
# run all urls in a threadpool

def process_urls_with_threadpool(url_list, max_workers):
    """
    Process URLs using a ThreadPoolExecutor to parallelize requests.

    Args:
        url_list: list of dicts, each with "url", "created_date" and optionally
            "native_id" / "native_id_namespace".
        max_workers: number of worker threads; also used as the HTTP
            connection-pool size so every thread can hold a connection.

    Returns:
        A list with one result dict per input entry, in completion order
        (not input order). Entries that raised are reported as failure
        records with ``status_code`` 0 and the exception text in ``error``.
    """
    results = []

    # retry mechanism for better reliability: transient 5xx responses are retried with backoff
    retries = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
        pool_connections=max_workers,
        pool_maxsize=max_workers,
        max_retries=retries
    )

    # helper function to clean DOI prefixes
    def clean_doi(native_id):
        if isinstance(native_id, str):
            return native_id.replace("https://doi.org/", "")
        return native_id

    # single definition of the row stored when a URL could not be processed
    def failure_record(url_data, error):
        return {
            "taxicab_id": None,
            "url": url_data.get('url'),
            "status_code": 0,
            "resolved_url": None,
            "content_type": None,
            "created_date": url_data["created_date"],
            "native_id": clean_doi(url_data.get("native_id")),
            "native_id_namespace": url_data.get("native_id_namespace", ""),
            "s3_path": None,
            "is_soft_block": False,
            "error": str(error),
        }

    def submit_url(url_data, session):
        if "native_id" in url_data and url_data["native_id"]:
            # work on a copy so the caller's dict is not modified
            url_data = url_data.copy()
            url_data["native_id"] = clean_doi(url_data["native_id"])

        try:
            result = process_url(url_data, session)
            print(f"Processed {url_data.get('url')} - Status: {result.get('status_code')}")
            return result
        except Exception as e:
            print(f"Error processing {url_data.get('url')}: {str(e)}")
            return failure_record(url_data, e)

    print(f"Starting ThreadPool with {max_workers} workers to process {len(url_list)} URLs")
    start_time = time.time()

    # The session is a context manager so its pooled connections are released
    # once all workers have finished (the executor block exits first).
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # Use ThreadPoolExecutor to process URLs in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # submit all tasks and map them to their original URLs
            future_to_url = {
                executor.submit(submit_url, url_data, session): url_data
                for url_data in url_list
            }
            total = len(future_to_url)

            # process results as they complete
            completed = concurrent.futures.as_completed(future_to_url)
            for count, future in enumerate(completed, start=1):
                print(f"Processed {count}/{total}")

                url_data = future_to_url[future]
                try:
                    results.append(future.result())
                except Exception as exc:
                    # defensive: submit_url already turns ordinary exceptions into failure records
                    print(f"URL {url_data.get('url')} generated an exception: {exc}")
                    results.append(failure_record(url_data, exc))

    elapsed_time = time.time() - start_time
    print(f"ThreadPool processing completed in {elapsed_time:.2f} seconds")
    print(f"Processed {len(results)} URLs")

    return results

In [None]:
# run it all
results = process_urls_with_threadpool(jsonUrls, max_workers=120)

# stamp every row of this batch with the same UTC completion time
processed_date = datetime.datetime.now(timezone.utc)

for result in results:
    result["processed_date"] = processed_date

# create DataFrame directly from results and save to table
results_df = spark.createDataFrame(results, schema=results_schema)
results_df.write.mode("append").format("delta").saveAsTable("openalex.taxicab.taxicab_results")

# results_df holds exactly one row per entry of `results`, so report len(results)
# instead of launching another Spark job with results_df.count()
print(f"Updated {len(results)} records in the results table")