### This pipeline scrapes landing pages and PDFs. The files are stored in Cloudflare R2 and the metadata is saved in a table

**input**: recent crossref records, repo records, and PDF urls from landing page records

**process**: taxicab API on ECS

**output**: file id, url, related ids saved to `openalex.taxicab.taxicab_results`


In [0]:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import datetime
import pytz
import time
from urllib3.util.retry import Retry
from pyspark.sql import functions as F
from pyspark.sql import types as T
import requests
from requests.adapters import HTTPAdapter
import pandas as pd
from datetime import timezone

In [0]:
# Base URL of the taxicab harvester API. Used both for existing-scrape lookups
# (GET {ENDPOINT}/{namespace}/{id}) and for harvest requests (POST {ENDPOINT}).
ENDPOINT = "http://harvester-load-balancer-366186003.us-east-1.elb.amazonaws.com/taxicab"

In [0]:
%sql
-- Results table for this pipeline. The final cell of the notebook appends one
-- row per processed URL; column names and types match `results_schema`.
CREATE TABLE IF NOT EXISTS openalex.taxicab.taxicab_results (
  taxicab_id STRING,           -- id returned by the taxicab API (null on failure rows)
  url STRING,                  -- URL that was submitted for harvesting
  resolved_url STRING,
  status_code INT,             -- 0 is written when no HTTP status is available
  content_type STRING,
  native_id STRING,
  native_id_namespace STRING,
  s3_path STRING,
  is_soft_block BOOLEAN,
  created_date TIMESTAMP,      -- created_date carried over from the source record
  processed_date TIMESTAMP,    -- timestamp assigned when the batch is written
  error STRING                 -- exception text for failed requests, otherwise null
)
USING DELTA;

In [0]:
def convert_to_datetime(value):
    """Convert various date/time values to a timezone-aware ``datetime.datetime``.

    Naive inputs are interpreted as UTC. Inputs that already carry a timezone
    keep it, so the instant they represent is never shifted.

    Args:
        value: ``None``, ``pandas.NaT``, a pandas ``Timestamp`` (anything with
            ``to_pydatetime``), a ``datetime.datetime``, a ``datetime.date``,
            or a date/time string.

    Returns:
        A timezone-aware ``datetime.datetime``. ``None``, ``NaT``, unparseable
        strings and unsupported types all fall back to the current UTC time
        (the latter two also print a diagnostic).
    """
    def _utc_now():
        return datetime.datetime.now(timezone.utc)

    def _ensure_aware(dt):
        # Attach UTC only when no timezone is present; replacing an existing
        # offset would silently move the instant.
        return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt

    # Missing values: NaT must be caught before the to_pydatetime branch
    # because it does not convert to a usable datetime.
    if value is None or value is pd.NaT:
        return _utc_now()

    # Handle pandas Timestamp objects specifically
    if hasattr(value, 'to_pydatetime'):
        return _ensure_aware(value.to_pydatetime())

    # Handle regular datetime objects (checked before date: datetime is a
    # subclass of date)
    if isinstance(value, datetime.datetime):
        return _ensure_aware(value)

    # Handle date objects (midnight UTC of that day)
    if isinstance(value, datetime.date):
        return datetime.datetime.combine(value, datetime.time(0, 0, 0, tzinfo=timezone.utc))

    # Handle string dates
    if isinstance(value, str):
        try:
            try:
                # ISO 8601 first; 'Z' is rewritten because older Python
                # versions of fromisoformat do not accept it.
                parsed = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
            except ValueError:
                # Anything fromisoformat rejects gets a second chance with
                # pandas' more permissive parser.
                pd_timestamp = pd.to_datetime(value)
                if pd.isna(pd_timestamp):
                    raise ValueError("parsed as NaT")
                parsed = pd_timestamp.to_pydatetime()
            return _ensure_aware(parsed)
        except Exception as e:
            print(f"Error parsing date string '{value}': {e}")
            return _utc_now()

    # Fallback for any other type
    print(f"Unexpected date type: {type(value)} - {value}")
    return _utc_now()

In [0]:
# Column of each source table that the "recent records" filters below compare
# against the cutoff.
last_processed_field = "created_date"
# Fixed cutoff: midnight UTC on 2023-01-01 (a date is promoted to an aware datetime).
post_2023 = convert_to_datetime(datetime.date(2023, 1, 1))
print(f"Using fixed last processed date: {post_2023}")

In [0]:
# urls to scrape

# Invalidate Spark's cached entries for the results table before reading it.
spark.catalog.refreshTable("openalex.taxicab.taxicab_results")

# Distinct (lower-cased native_id, URL without scheme / "dx." prefix) pairs
# that already have a row in the results table.
taxicab_finished_tasks = (
    spark.table("openalex.taxicab.taxicab_results")
    .select(
        F.lower(F.col("native_id")).alias("finished_id"),
        F.regexp_replace("url", r"^https?://(dx\.)?", "").alias("finished_url"),
    )
    .distinct()
)

def normalize_created_date(df):
    """Return ``df`` with its ``created_date`` column passed through ``to_timestamp``."""
    created_ts = F.to_timestamp("created_date")
    return df.withColumn("created_date", created_ts)

# URLs already present in the results table (created_date cast to timestamp).
# Used below as the right-hand side of the left-anti joins on `url` for the
# PDF and repo sources.
taxicab_results = normalize_created_date(
    spark.table("openalex.taxicab.taxicab_results").select("url", "created_date")
)

# Crossref source: records at or after the fixed cutoff, one row per doi.org
# URL, excluding (native_id, url) pairs that already have a results row.
recent_crossref_works = (
    normalize_created_date(
        spark.read.table("openalex.crossref.crossref_works")
        .filter(F.col(last_processed_field) >= F.lit(post_2023))
    )
    # Explode URLs if a DOI has more than one, or just filter for doi.org
    .select(
        "native_id",
        "native_id_namespace",
        "created_date",
        F.explode("urls").alias("url_struct")
    )
    .withColumn("url", F.col("url_struct.url"))
    .filter(F.col("url").contains("doi.org"))
    # Normalize the source URL exactly like the finished tasks
    .withColumn("norm_url_src", F.regexp_replace("url", r"^https?://(dx\.)?", ""))
    .alias("src")
    # left_anti keeps only source rows with NO finished task matching on both
    # the lower-cased native_id and the normalized URL.
    .join(
        taxicab_finished_tasks.alias("fin"),
        (F.lower(F.col("src.native_id")) == F.col("fin.finished_id")) &
        (F.col("src.norm_url_src") == F.col("fin.finished_url")),
        "left_anti"
    )
    # Drop the helper columns; the original (un-normalized) URL is what gets harvested.
    .select("src.native_id", "src.native_id_namespace", "src.url", "src.created_date")
)

# Landing-page source: the first PDF URL of each landing-page record, excluding
# URLs already present in the results table. Stays None when the landing-page
# table does not exist, so later code must handle that case.
recent_pdf_works = None
if spark.catalog.tableExists("openalex.landing_page.landing_page_works"):
    recent_pdf_works = (
        normalize_created_date(
            spark.read
            .table("openalex.landing_page.landing_page_works")
            # The window starts two days before the fixed cutoff.
            .filter(F.col(last_processed_field) >= F.date_sub(F.lit(post_2023), 2))
            .select(
                "ids",
                "native_id",
                "native_id_namespace",
                # url of the first `urls` element whose content_type is 'pdf'
                F.expr("get(filter(urls, x -> x.content_type = 'pdf'), 0).url").alias("url"),
                "created_date"
            )
            # id of the first `ids` element in the 'pmh' / 'doi' namespace, if any
            .withColumn("pmh_id", F.expr("get(filter(ids, x -> x.namespace = 'pmh'), 0).id"))
            .withColumn("doi_id", F.expr("get(filter(ids, x -> x.namespace = 'doi'), 0).id"))
            # Set priority: PMH first, then DOI, then original
            .withColumn("final_native_id",
                F.when(F.col("pmh_id").isNotNull(), F.col("pmh_id"))
                .when(F.col("doi_id").isNotNull(), F.col("doi_id"))
                .otherwise(F.col("native_id")))
            # The namespace follows the same priority so id and namespace stay paired.
            .withColumn("final_namespace",
                F.when(F.col("pmh_id").isNotNull(), F.lit("pmh"))
                .when(F.col("doi_id").isNotNull(), F.lit("doi"))
                .otherwise(F.col("native_id_namespace")))
            # select final columns
            .select(
                F.col("final_native_id").alias("native_id"),
                F.col("final_namespace").alias("native_id_namespace"),
                "url",
                "created_date"
            )
            # Records without a PDF URL have nothing to harvest.
            .filter(F.col("url").isNotNull())
        )
        # Exact-URL match (no normalization) against existing results.
        .join(
            taxicab_results,
            ["url"],
            "left_anti"
        )
    )



# Repository source: up to the first three URLs of each repo record at or after
# the fixed cutoff, skipping doi.org links and URLs already in the results table.
recent_repo_works = (
    normalize_created_date(
        spark.read.table("openalex.repo.repo_works")
        .filter(F.col(last_processed_field) >= F.lit(post_2023))
        .select(
            "native_id",
            "native_id_namespace",
            # keep at most the first three entries of `urls`
            F.slice("urls", 1, 3).alias("urls"),
            "created_date"
        )
        .filter(F.col("urls").isNotNull())
        # one output row per remaining URL struct
        .select("*", F.explode("urls").alias("url_struct"))
        .select(
            "native_id",
            "native_id_namespace",
            "created_date",
            F.col("url_struct.url").alias("url")
        )
        # doi.org links are excluded from this source
        .filter(~F.col("url").contains("doi.org"))
    )
    # Exact-URL match (no normalization) against existing results.
    .join(
        taxicab_results,
        ["url"],
        "left_anti"
    )
)

# Combine the sources in a fixed order: crossref, then PDFs (only when the
# landing-page table exists, i.e. recent_pdf_works is not None), then repos.
all_urls = recent_crossref_works
if recent_pdf_works is not None:
    all_urls = all_urls.unionByName(recent_pdf_works, allowMissingColumns=True)
all_urls = all_urls.unionByName(recent_repo_works, allowMissingColumns=True)

# Explicit None check: do not rely on the truthiness of a DataFrame object.
recent_pdf_count = recent_pdf_works.count() if recent_pdf_works is not None else 0
print(f"{recent_crossref_works.count()} crossref URLs, {recent_repo_works.count()} repo URLs, {recent_pdf_count} PDF URLs remaining")
print(f"{all_urls.count()} total URLs remaining")

# Cap this run at 250,000 rows, taken in ascending created_date order.
all_urls = all_urls.orderBy("created_date").limit(250000)

# Replacement value for missing or implausible created_date values.
current_date = datetime.datetime.now(timezone.utc)

# modify the DataFrame to replace extreme dates with current date
# (null, or a year before 1900 / after 2100)
bounded_all_urls = all_urls.withColumn(
    "created_date",
    F.when(
        F.col("created_date").isNull() |
        (F.year(F.col("created_date")) < 1900) | 
        (F.year(F.col("created_date")) > 2100),
        F.lit(current_date)
    ).otherwise(F.col("created_date"))
)

# Collect the bounded batch as a pandas DataFrame for the Python-side harvest loop.
all_urls_pd = bounded_all_urls.toPandas()

# Convert the pandas rows to a list of plain dicts, skipping rows with no URL.
# created_date is normalized to a timezone-aware datetime for each entry.
jsonUrls = [
    {
        "url": row["url"],
        "created_date": convert_to_datetime(row["created_date"]),
        "native_id": row.get("native_id", ""),
        "native_id_namespace": row.get("native_id_namespace", ""),
    }
    for row in all_urls_pd.to_dict('records')
    if row["url"] is not None
]

# Summary counts. Each URL is placed in exactly one bucket (PDF wins over DOI)
# so the three numbers always add up to the total; counting the substrings
# independently would count a URL containing both '.pdf' and 'doi.org' twice.
total_urls = len(jsonUrls)
pdf_urls = sum(1 for url in jsonUrls if '.pdf' in url['url'].lower())
doi_urls = sum(
    1 for url in jsonUrls
    if 'doi.org' in url['url'].lower() and '.pdf' not in url['url'].lower()
)
other_urls = total_urls - pdf_urls - doi_urls

print(f"Harvesting {total_urls} URLs ({pdf_urls} PDFs, {doi_urls} DOIs, {other_urls} other URLs)")

In [0]:
# result schema

# Spark schema for result rows: (column name, Spark type) pairs, all nullable.
results_schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("taxicab_id", T.StringType()),
        ("url", T.StringType()),
        ("resolved_url", T.StringType()),
        ("status_code", T.IntegerType()),
        ("content_type", T.StringType()),
        ("native_id", T.StringType()),
        ("native_id_namespace", T.StringType()),
        ("s3_path", T.StringType()),
        ("is_soft_block", T.BooleanType()),
        ("created_date", T.TimestampType()),
        ("processed_date", T.TimestampType()),
        ("error", T.StringType()),
    )
])

In [0]:
# check for existing scrape using taxicab API
def get_existing_scrape(url_data, timeout=30):
    """
    Look up a previously stored scrape for this record in the Taxicab API.

    Sends ``GET {ENDPOINT}/{native_id_namespace}/{native_id}`` and searches the
    ``html`` list, then the ``pdf`` list, of the JSON response for an entry
    whose ``url`` equals ``url_data["url"]``.

    Args:
        url_data: dict with ``url``, ``native_id``, ``native_id_namespace``
            and ``created_date`` keys.
        timeout: seconds to wait for the lookup request. Without a limit a
            stalled connection would block the calling thread indefinitely.

    Returns:
        A result dict for the first matching entry (status_code 200,
        is_soft_block False, error None), or ``None`` when the API does not
        answer 200, nothing matches, or any error occurs (the error is
        printed; the lookup is best-effort and never raises).
    """
    native_id = url_data.get("native_id")
    native_id_namespace = url_data.get("native_id_namespace")
    url_to_match = url_data.get("url")

    # (response key, label used in the log line, content type recorded for a hit)
    # HTML is searched before PDF.
    candidate_lists = (
        ("html", "HTML", "text/html"),
        ("pdf", "PDF", "application/pdf"),
    )

    try:
        api_url = f"{ENDPOINT}/{native_id_namespace}/{native_id}"
        response = requests.get(api_url, timeout=timeout)

        if response.status_code != 200:
            return None

        response_data = response.json()

        for list_key, label, content_type in candidate_lists:
            # A missing, null or empty list simply yields no candidates.
            for stored in response_data.get(list_key) or []:
                if stored.get("url") != url_to_match:
                    continue
                # Log only on an actual URL match.
                print(f"Found existing {label} for {url_to_match} with native_id {native_id} and native_id_namespace {native_id_namespace}")
                return {
                    "taxicab_id": stored["id"],
                    "url": url_data["url"],
                    "resolved_url": stored.get("resolved_url"),
                    "status_code": 200,
                    "content_type": content_type,
                    "created_date": url_data["created_date"],
                    "native_id": stored["native_id"],
                    "native_id_namespace": stored["native_id_namespace"],
                    "s3_path": stored.get("s3_path"),
                    "is_soft_block": False,
                    "error": None
                }

        return None

    except Exception as e:
        # Best-effort lookup: any failure means "not found" and the caller
        # proceeds with a fresh harvest.
        print(f"Error checking {native_id_namespace}/{native_id} in API: {str(e)}")
        return None

In [0]:
# process single url

def process_url(url_data):
    """
    Harvest a single URL through the taxicab API, reusing an existing scrape
    when one is found.

    Args:
        url_data: dict with ``url``, ``native_id``, ``native_id_namespace``
            and ``created_date`` keys.

    Returns:
        A result dict. On success it carries the fields returned by the API;
        when the request fails with a ``requests.RequestException`` it carries
        ``taxicab_id`` None, the HTTP status if one is available (else 0) and
        the exception text in ``error``.
    """
    # clean native_id in case it contains doi.org
    # Done before the lookup so that lookup and harvest both use the bare id.
    native_id = url_data.get("native_id", "")
    if native_id and isinstance(native_id, str) and "https://doi.org/" in native_id:
        native_id = native_id.replace("https://doi.org/", "")
        # Copy rather than mutate the caller's dict.
        url_data = dict(url_data, native_id=native_id)

    # check if it was already scraped in unpaywall, or in this process earlier
    existing_scrape = get_existing_scrape(url_data)
    if existing_scrape:
        print(f"Found existing HTML or PDF for {url_data.get('native_id')} url {url_data.get('url')}, skipping harvest")
        return existing_scrape

    # proceed with original harvester process
    try:
        payload = {
            "url": url_data.get("url"),
            "native_id": native_id,
            "native_id_namespace": url_data.get("native_id_namespace", "")
        }

        # NOTE(review): no timeout is set on this POST, so a stalled connection
        # blocks the calling thread indefinitely. If one is added it needs to
        # exceed the slowest expected harvest - confirm with the API owners.
        response = requests.post(ENDPOINT, json=payload)
        response.raise_for_status()
        response_data = response.json()
        print(response_data.get("id"))
        print(response_data.get("native_id"))
        print(response_data.get("native_id_namespace"))

        return {
            "taxicab_id": response_data.get("id"),
            "url": url_data.get("url"),
            "harvester_id": response_data.get("id"),
            "status_code": response_data.get("status_code"),
            "resolved_url": response_data.get("resolved_url"),
            "content_type": response_data.get("content_type"),
            "created_date": url_data["created_date"],
            "native_id": response_data.get("native_id"),
            "native_id_namespace": response_data.get("native_id_namespace"),
            "s3_path": response_data.get("s3_path"),
            "is_soft_block": response_data.get("is_soft_block", False),
            "error": None
        }

    # something went wrong
    except requests.RequestException as e:
        return {
            "taxicab_id": None,
            "url": url_data.get("url"),
            # e.response is None for connection-level failures, giving 0
            "status_code": getattr(e.response, 'status_code', 0),
            "resolved_url": None,
            "content_type": None,
            "created_date": url_data["created_date"],
            "native_id": native_id,
            "native_id_namespace": url_data.get("native_id_namespace", ""),
            "s3_path": None,
            "is_soft_block": False,
            "error": str(e),
        }

In [0]:
# run all urls in a threadpool

def process_urls_with_threadpool(url_list, max_workers):
    """
    Process URLs using a ThreadPoolExecutor to parallelize requests.

    Every entry of ``url_list`` is handed to ``process_url`` on a worker
    thread. Each input produces exactly one result dict: when processing
    raises, a failure row (status_code 0, exception text in ``error``) is
    recorded instead of propagating the exception.

    NOTE: ``process_url`` performs its HTTP calls through the module-level
    ``requests`` functions, so no shared ``Session`` or retry adapter applies
    to them. A retry policy would have to be passed into ``process_url``.

    Args:
        url_list: list of dicts with ``url``, ``native_id``,
            ``native_id_namespace`` and ``created_date`` keys.
        max_workers: number of worker threads.

    Returns:
        List of result dicts in completion order (not input order).
    """
    results = []

    # helper function to clean DOI prefixes
    def clean_doi(native_id):
        if native_id and isinstance(native_id, str):
            if "https://doi.org/" in native_id:
                return native_id.replace("https://doi.org/", "")
        return native_id

    # row recorded for a URL whose processing raised
    def failure_record(url_data, exc):
        return {
            "taxicab_id": None,
            "url": url_data.get('url'),
            "status_code": 0,
            "resolved_url": None,
            "content_type": None,
            "created_date": url_data["created_date"],
            "native_id": clean_doi(url_data.get("native_id")),
            "native_id_namespace": url_data.get("native_id_namespace", ""),
            "s3_path": None,
            "is_soft_block": False,
            "error": str(exc),
        }

    # per-URL worker: normalizes the id, delegates to process_url, never raises
    def process_one(url_data):
        if "native_id" in url_data and url_data["native_id"]:
            # copy so the caller's dict is left untouched
            url_data = url_data.copy()
            url_data["native_id"] = clean_doi(url_data["native_id"])

        try:
            result = process_url(url_data)
            print(f"Processed {url_data.get('url')} - Status: {result.get('status_code')}")
            return result
        except Exception as e:
            print(f"Error processing {url_data.get('url')}: {str(e)}")
            return failure_record(url_data, e)

    print(f"Starting ThreadPool with {max_workers} workers to process {len(url_list)} URLs")
    start_time = time.time()

    # Use ThreadPoolExecutor to process URLs in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # submit all tasks and map them to their original URLs
        future_to_url = {executor.submit(process_one, url_data): url_data for url_data in url_list}

        total = len(future_to_url)

        # process results as they complete
        for count, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
            print(f"Processed {count}/{total}")

            url_data = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as exc:
                # Defensive: process_one already traps exceptions, but a
                # failure here must still not lose the row.
                print(f"URL {url_data.get('url')} generated an exception: {exc}")
                results.append(failure_record(url_data, exc))

    elapsed_time = time.time() - start_time
    print(f"ThreadPool processing completed in {elapsed_time:.2f} seconds")
    print(f"Processed {len(results)} URLs")

    return results

In [0]:
# Harvest every queued URL with 120 worker threads.
results = process_urls_with_threadpool(jsonUrls, max_workers=120)

# A single timestamp marks the whole batch.
processed_date = datetime.datetime.now(timezone.utc)

for result in results:
    result["processed_date"] = processed_date

    # Coerce whichever of these date fields are present to aware datetimes.
    for field_name in ("created_date", "updated_date"):
        if field_name not in result:
            continue
        result[field_name] = convert_to_datetime(result[field_name])

# Build the DataFrame from the result dicts and append it to the results table.
results_df = spark.createDataFrame(results, schema=results_schema)
(
    results_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("openalex.taxicab.taxicab_results")
)

print(f"Updated {results_df.count()} records in the results table")

In [None]:
# --- Crossref DOI Link Progress ---
# Distinct (lower-cased native_id, normalized doi.org URL) pairs from crossref
# records at or after the fixed cutoff. The normalization is the same regex
# used for `taxicab_finished_tasks`, so the two sides are comparable.
eligible_links = (
    spark.read.table("openalex.crossref.crossref_works")
    .filter(F.col(last_processed_field) >= F.lit(post_2023))
    .select("native_id", F.explode("urls").alias("u"))
    .filter(F.col("u.url").contains("doi.org"))
    .withColumn("norm_url", F.regexp_replace("u.url", r"^https?://(dx\.)?", ""))
    .select(F.lower(F.col("native_id")).alias("id"), "norm_url")
    .distinct()
)

total_to_do = eligible_links.count()
# Eligible pairs that have a matching finished task (inner join on id and URL).
# NOTE(review): taxicab_finished_tasks was defined before this run's append;
# whether this count includes the newly written rows depends on how Spark
# re-reads the table - confirm.
done_count = taxicab_finished_tasks.join(eligible_links,
    (taxicab_finished_tasks.finished_id == eligible_links.id) &
    (taxicab_finished_tasks.finished_url == eligible_links.norm_url)).count()

print(f"--- PROGRESS REPORT ---")
print(f"Total Unique DOI Links to Scrape: {total_to_do:,}")
print(f"Total Links Completed:           {done_count:,}")
print(f"Remaining DOI Links:             {total_to_do - done_count:,}")