# Taxicab Scraping Dashboard

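Monitors scraping health across publishers and URL patterns. Each cell is a query designed as a dashboard panel. The publisher panels read the `taxicab_publisher_map` temporary view, so the cell that creates it must be run first in the same session; all other cells are standalone.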

Data source: `openalex.taxicab.taxicab_results`

Key metrics:
- **Volume**: total scrapes per day/publisher
- **Error rate**: errors / total, broken down by type
- **URL patterns**: groups URLs by domain + path prefix to catch systematic failures
- **Trend detection**: compares recent vs prior period to flag worsening domains

In [None]:
%sql
-- Publisher lookup view: one row per (domain, publisher display name).
-- Panels that label scrape hosts with a publisher name read from this view.
-- Note: several entries are subdomains of another entry for the same publisher
-- (e.g. 'wiley.com' and 'onlinelibrary.wiley.com'), so a suffix-style match on
-- `domain` can hit more than one row for a single host.
CREATE OR REPLACE TEMPORARY VIEW taxicab_publisher_map AS
SELECT
  mapping.domain,
  mapping.publisher
FROM VALUES
  ('sciencedirect.com', 'Elsevier'),
  ('cell.com', 'Elsevier'),
  ('thelancet.com', 'Elsevier'),
  ('ars.els-cdn.com', 'Elsevier'),
  ('linkinghub.elsevier.com', 'Elsevier'),
  ('link.springer.com', 'Springer Nature'),
  ('nature.com', 'Springer Nature'),
  ('springeropen.com', 'Springer Nature'),
  ('biomedcentral.com', 'Springer Nature'),
  ('onlinelibrary.wiley.com', 'Wiley'),
  ('wiley.com', 'Wiley'),
  ('mdpi.com', 'MDPI'),
  ('tandfonline.com', 'Taylor & Francis'),
  ('taylorfrancis.com', 'Taylor & Francis'),
  ('journals.sagepub.com', 'SAGE'),
  ('sagepub.com', 'SAGE'),
  ('academic.oup.com', 'Oxford University Press'),
  ('oup.com', 'Oxford University Press'),
  ('ieeexplore.ieee.org', 'IEEE'),
  ('ieee.org', 'IEEE'),
  ('dl.acm.org', 'ACM'),
  ('jstor.org', 'JSTOR'),
  ('plos.org', 'PLOS'),
  ('journals.plos.org', 'PLOS'),
  ('frontiersin.org', 'Frontiers'),
  ('brill.com', 'Brill'),
  ('degruyter.com', 'De Gruyter'),
  ('cambridge.org', 'Cambridge University Press'),
  ('karger.com', 'Karger'),
  ('persee.fr', 'Persee'),
  ('pnas.org', 'PNAS'),
  ('science.org', 'AAAS'),
  ('acs.org', 'American Chemical Society'),
  ('pubs.acs.org', 'American Chemical Society'),
  ('aip.org', 'AIP Publishing'),
  ('iop.org', 'IOP Publishing'),
  ('iopscience.iop.org', 'IOP Publishing'),
  ('hindawi.com', 'Hindawi'),
  ('wolterskluwer.com', 'Wolters Kluwer'),
  ('lww.com', 'Wolters Kluwer'),
  ('jamanetwork.com', 'AMA'),
  ('bmj.com', 'BMJ'),
  ('arxiv.org', 'arXiv')
AS mapping(domain, publisher)

In [None]:
%sql
-- Daily scrape health over the trailing 30-day window, one row per day.
--   total_scrapes : all rows for the day
--   errors        : rows whose `error` is not NULL
--   error_pct     : errors as a percentage of total_scrapes (2 decimals)
--   soft_blocks   : rows flagged is_soft_block = true
-- Suggested visualization: line chart, date on the x-axis, dual y-axis
-- (counts on one side, error_pct on the other).
WITH windowed_rows AS (
  SELECT
    DATE(created_date) AS scrape_date,
    error,
    is_soft_block
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
)
SELECT
  scrape_date,
  COUNT(*) AS total_scrapes,
  COUNT_IF(error IS NOT NULL) AS errors,
  ROUND(100.0 * COUNT_IF(error IS NOT NULL) / COUNT(*), 2) AS error_pct,
  COUNT_IF(is_soft_block = true) AS soft_blocks
FROM windowed_rows
GROUP BY scrape_date
ORDER BY scrape_date

In [None]:
%sql
-- Publisher error leaderboard (last 30 days)
-- Top 30 (publisher, host) pairs ranked by total error count.
--
-- Counts are aggregated per host BEFORE the publisher lookup, and each host is
-- resolved to at most one publisher. Joining raw rows with
-- `host LIKE CONCAT('%', pm.domain)` would match a host such as
-- 'onlinelibrary.wiley.com' against both 'wiley.com' and
-- 'onlinelibrary.wiley.com' in the map and double every count for that host;
-- it would also match unrelated hosts that merely end with the same characters
-- (e.g. 'excell.com' vs 'cell.com').
WITH domain_stats AS (
  -- One row per host with its raw counters.
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    COUNT(*) AS total,
    SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) AS errors,
    SUM(CASE WHEN is_soft_block = true THEN 1 ELSE 0 END) AS soft_blocks
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
  GROUP BY TRY_PARSE_URL(url, 'HOST')
),
domain_publisher AS (
  -- Resolve each host to a single publisher: the map entry must equal the host
  -- or be a parent domain on a '.' boundary; the longest entry wins.
  SELECT
    ds.domain,
    MAX_BY(pm.publisher, LENGTH(pm.domain)) AS publisher
  FROM domain_stats ds
  INNER JOIN taxicab_publisher_map pm
    ON ds.domain = pm.domain
    OR ds.domain LIKE CONCAT('%.', pm.domain)
  GROUP BY ds.domain
)
SELECT
  -- Hosts without a mapping fall back to the host itself as the label.
  COALESCE(dp.publisher, ds.domain) AS publisher,
  ds.domain AS domain,
  ds.total AS total,
  ds.errors AS errors,
  ROUND(100.0 * ds.errors / ds.total, 2) AS error_pct,
  ds.soft_blocks AS soft_blocks
FROM domain_stats ds
LEFT JOIN domain_publisher dp
  ON ds.domain = dp.domain
ORDER BY errors DESC
LIMIT 30

In [None]:
%sql
-- Publisher error rate leaderboard (min 500 scrapes, last 30 days)
-- Catches consistently-failing smaller publishers: top 30 (publisher, host)
-- pairs by error percentage among hosts with at least 500 scrapes.
--
-- The 500-scrape floor is applied to per-host counts computed BEFORE the
-- publisher lookup. Joining raw rows with `host LIKE CONCAT('%', pm.domain)`
-- would match hosts like 'pubs.acs.org' against both 'acs.org' and
-- 'pubs.acs.org', doubling `total`/`errors` and letting hosts with only ~250
-- real scrapes pass the floor. The lookup below picks one publisher per host
-- and only matches on a '.' label boundary.
WITH domain_stats AS (
  -- One row per host that meets the volume floor.
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    COUNT(*) AS total,
    SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) AS errors
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
  GROUP BY TRY_PARSE_URL(url, 'HOST')
  HAVING COUNT(*) >= 500
),
domain_publisher AS (
  -- Resolve each host to a single publisher; the longest matching entry wins.
  SELECT
    ds.domain,
    MAX_BY(pm.publisher, LENGTH(pm.domain)) AS publisher
  FROM domain_stats ds
  INNER JOIN taxicab_publisher_map pm
    ON ds.domain = pm.domain
    OR ds.domain LIKE CONCAT('%.', pm.domain)
  GROUP BY ds.domain
)
SELECT
  -- Hosts without a mapping fall back to the host itself as the label.
  COALESCE(dp.publisher, ds.domain) AS publisher,
  ds.domain AS domain,
  ds.total AS total,
  ds.errors AS errors,
  ROUND(100.0 * ds.errors / ds.total, 2) AS error_pct
FROM domain_stats ds
LEFT JOIN domain_publisher dp
  ON ds.domain = dp.domain
ORDER BY error_pct DESC
LIMIT 30

In [None]:
%sql
-- Error type breakdown by publisher (last 30 days)
-- Buckets every failed or soft-blocked scrape into one error_type and counts
-- them per publisher. Buckets are evaluated top to bottom; the first match wins.
--
-- Notes on the classification:
--   * Text patterns are matched case-insensitively via LOWER(error), so
--     'Forbidden', 'Connection refused', 'Rate limit', 'JSON' etc. are caught
--     the same way 'timeout' / 'Timeout' are.
--   * 'Server Error (5xx)' covers every 5xx status except 504, which is
--     reported as 'Timeout' by the earlier branch.
--   * The text patterns are heuristics on free-form error messages.
--
-- Rows are classified and counted per host first; each host is then resolved
-- to at most one publisher (longest map entry matching on a '.' boundary), so
-- overlapping map entries such as 'ieee.org' / 'ieeexplore.ieee.org' cannot
-- double the counts.
WITH failed_scrapes AS (
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    CASE
      WHEN status_code = 504 OR LOWER(error) LIKE '%timeout%' THEN 'Timeout'
      WHEN status_code BETWEEN 500 AND 599 OR error LIKE '%500%' THEN 'Server Error (5xx)'
      WHEN status_code = 429 OR error LIKE '%429%' OR LOWER(error) LIKE '%rate%' THEN 'Rate Limited (429)'
      WHEN status_code = 403 OR error LIKE '%403%' OR LOWER(error) LIKE '%forbidden%' THEN 'Forbidden (403)'
      WHEN LOWER(error) LIKE '%connection%' THEN 'Connection Error'
      WHEN LOWER(error) LIKE '%json%' THEN 'JSON Decode Error'
      WHEN is_soft_block = true THEN 'Soft Block'
      WHEN status_code BETWEEN 400 AND 499 THEN 'Client Error (4xx)'
      ELSE 'Other'
    END AS error_type
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
    AND (error IS NOT NULL OR is_soft_block = true)
),
domain_errors AS (
  SELECT
    domain,
    error_type,
    COUNT(*) AS error_count
  FROM failed_scrapes
  GROUP BY domain, error_type
),
domain_publisher AS (
  -- Resolve each host to a single publisher; the longest matching entry wins.
  SELECT
    d.domain,
    MAX_BY(pm.publisher, LENGTH(pm.domain)) AS publisher
  FROM (SELECT DISTINCT domain FROM domain_errors) d
  INNER JOIN taxicab_publisher_map pm
    ON d.domain = pm.domain
    OR d.domain LIKE CONCAT('%.', pm.domain)
  GROUP BY d.domain
)
SELECT
  -- Hosts without a mapping fall back to the host itself as the label.
  COALESCE(dp.publisher, de.domain) AS publisher,
  de.error_type AS error_type,
  SUM(de.error_count) AS error_count
FROM domain_errors de
LEFT JOIN domain_publisher dp
  ON de.domain = dp.domain
GROUP BY
  COALESCE(dp.publisher, de.domain),
  de.error_type
ORDER BY publisher, error_count DESC

In [None]:
%sql
-- URL path pattern errors (last 30 days)
-- Buckets scrapes by host plus the first two path segments so that failures
-- tied to a particular URL shape stand out,
-- e.g. www.cell.com/heliyon/pdf, link.springer.com/article/10.1007
-- Only patterns with at least 50 scrapes are kept; top 50 by error count.
WITH scrape_patterns AS (
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    -- host + '/' + up to two leading path segments (leading '/' stripped;
    -- a missing path is treated as '/')
    CONCAT(
      TRY_PARSE_URL(url, 'HOST'),
      '/',
      CONCAT_WS('/',
        SLICE(SPLIT(REGEXP_REPLACE(COALESCE(TRY_PARSE_URL(url, 'PATH'), '/'), '^/', ''), '/'), 1, 2)
      )
    ) AS url_pattern,
    error,
    is_soft_block
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
)
SELECT
  domain,
  url_pattern,
  COUNT(*) AS total,
  COUNT_IF(error IS NOT NULL) AS errors,
  ROUND(100.0 * COUNT_IF(error IS NOT NULL) / COUNT(*), 2) AS error_pct,
  COUNT_IF(is_soft_block = true) AS soft_blocks
FROM scrape_patterns
GROUP BY domain, url_pattern
HAVING COUNT(*) >= 50
ORDER BY errors DESC
LIMIT 50

In [None]:
%sql
-- Daily error rate by top 10 problem domains (last 30 days)
-- Multi-line time series showing whether errors are spiking or chronic.
--
-- Rows whose host cannot be parsed (NULL domain) are excluded when picking the
-- top 10: a NULL domain can never satisfy the equality join below, so letting
-- it take a top-10 slot would silently leave the chart with fewer than ten
-- domains. Ties in error count are broken by domain name so the selection is
-- stable between refreshes.
WITH window_scrapes AS (
  SELECT
    DATE(created_date) AS scrape_date,
    TRY_PARSE_URL(url, 'HOST') AS domain,
    error
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
),
top_error_domains AS (
  -- The ten hosts with the most errored scrapes in the window.
  SELECT domain
  FROM window_scrapes
  WHERE error IS NOT NULL
    AND domain IS NOT NULL
  GROUP BY domain
  ORDER BY COUNT(*) DESC, domain
  LIMIT 10
)
SELECT
  s.scrape_date AS scrape_date,
  s.domain AS domain,
  COUNT(*) AS total,
  SUM(CASE WHEN s.error IS NOT NULL THEN 1 ELSE 0 END) AS errors,
  ROUND(100.0 * SUM(CASE WHEN s.error IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS error_pct
FROM window_scrapes s
INNER JOIN top_error_domains d
  ON s.domain = d.domain
GROUP BY s.scrape_date, s.domain
ORDER BY s.domain, s.scrape_date

In [None]:
%sql
-- Status code distribution (last 30 days)
-- One row per distinct status code (missing codes shown as the text 'NULL'),
-- with its row count and its share of all rows in the window.
WITH code_counts AS (
  SELECT
    status_code AS raw_status_code,
    COUNT(*) AS row_count
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
  GROUP BY status_code
)
SELECT
  COALESCE(CAST(raw_status_code AS STRING), 'NULL') AS status_code,
  row_count AS count,
  -- window total over all status codes is the denominator
  ROUND(100.0 * row_count / SUM(row_count) OVER (), 2) AS pct
FROM code_counts
ORDER BY row_count DESC

In [None]:
%sql
-- Elsevier family deep dive (last 30 days)
-- Drill-down for Elsevier domains: per URL pattern (host + first two path
-- segments) totals, error rate, and counts of timeouts, 500-503 responses,
-- 429 responses and soft blocks. Patterns with fewer than 20 scrapes are
-- dropped; top 40 by error count.
--
-- Host matching is anchored: a host qualifies only if it IS one of the listed
-- domains or is a subdomain of one (match on a '.' boundary). An unanchored
-- `LIKE '%cell.com%'` would also pull in unrelated hosts such as 'excell.com'.
-- NOTE(review): hosts that merely embed an Elsevier domain in the middle
-- (e.g. library-proxy hostnames) are no longer included — confirm that is
-- acceptable for this panel.
WITH recent_scrapes AS (
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    -- host + '/' + up to two leading path segments (leading '/' stripped;
    -- a missing path is treated as '/')
    CONCAT(
      TRY_PARSE_URL(url, 'HOST'),
      '/',
      CONCAT_WS('/',
        SLICE(SPLIT(REGEXP_REPLACE(COALESCE(TRY_PARSE_URL(url, 'PATH'), '/'), '^/', ''), '/'), 1, 2)
      )
    ) AS url_pattern,
    error,
    status_code,
    is_soft_block
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -30)
)
SELECT
  domain,
  url_pattern,
  COUNT(*) AS total,
  SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) AS errors,
  ROUND(100.0 * SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS error_pct,
  SUM(CASE WHEN status_code = 504 OR error LIKE '%timeout%' OR error LIKE '%Timeout%' THEN 1 ELSE 0 END) AS timeouts,
  SUM(CASE WHEN status_code BETWEEN 500 AND 503 THEN 1 ELSE 0 END) AS server_errors,
  SUM(CASE WHEN status_code = 429 THEN 1 ELSE 0 END) AS rate_limited,
  SUM(CASE WHEN is_soft_block = true THEN 1 ELSE 0 END) AS soft_blocks
FROM recent_scrapes
WHERE domain IN (
    'sciencedirect.com',
    'cell.com',
    'thelancet.com',
    'els-cdn.com',
    'elsevier.com'
  )
  OR domain LIKE '%.sciencedirect.com'
  OR domain LIKE '%.cell.com'
  OR domain LIKE '%.thelancet.com'
  OR domain LIKE '%.els-cdn.com'
  OR domain LIKE '%.elsevier.com'
GROUP BY domain, url_pattern
HAVING COUNT(*) >= 20
ORDER BY errors DESC
LIMIT 40

In [None]:
%sql
-- New/worsening problems: last 7 days vs prior 7 days
-- Highlights domains whose error rate increased significantly.
--
-- A domain in the recent window is flagged when either:
--   (a) it also has a prior-window baseline and its error rate rose by more
--       than 2 percentage points, or
--   (b) it has no prior-window baseline (fewer than 100 scrapes there,
--       including none at all) and its recent error rate exceeds 5%.
-- Rule (a) deliberately requires a real baseline: treating a missing baseline
-- as 0% would flag every baseline-less domain above 2% and make the stricter
-- 5% threshold in rule (b) meaningless.
--
-- Domains present only in the prior window can never satisfy either rule, so
-- the comparison is driven from `recent` with a LEFT JOIN. The join is
-- NULL-safe (<=>) so that scrapes with an unparseable host (NULL domain) are
-- compared against their own prior-window group instead of never matching.
--
-- error_pct_change for baseline-less domains is reported against 0, i.e. it
-- equals the recent error rate.
-- NOTE(review): the recent window runs from CURRENT_DATE - 7 through the
-- latest data, so it may include the current partial day — confirm intended.
WITH recent AS (
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    COUNT(*) AS total,
    SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) AS errors,
    ROUND(100.0 * SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS error_pct
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -7)
  GROUP BY TRY_PARSE_URL(url, 'HOST')
  HAVING COUNT(*) >= 100
),
prior AS (
  SELECT
    TRY_PARSE_URL(url, 'HOST') AS domain,
    COUNT(*) AS total,
    SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) AS errors,
    ROUND(100.0 * SUM(CASE WHEN error IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS error_pct
  FROM openalex.taxicab.taxicab_results
  WHERE created_date >= DATE_ADD(CURRENT_DATE(), -14)
    AND created_date < DATE_ADD(CURRENT_DATE(), -7)
  GROUP BY TRY_PARSE_URL(url, 'HOST')
  HAVING COUNT(*) >= 100
)
SELECT
  r.domain AS domain,
  r.total AS recent_total,
  r.errors AS recent_errors,
  r.error_pct AS recent_error_pct,
  p.total AS prior_total,
  p.errors AS prior_errors,
  p.error_pct AS prior_error_pct,
  ROUND(r.error_pct - COALESCE(p.error_pct, 0), 2) AS error_pct_change
FROM recent r
LEFT JOIN prior p
  ON r.domain <=> p.domain
-- p.total comes from COUNT(*), so it is NULL only when no prior row matched.
WHERE (p.total IS NOT NULL AND r.error_pct - p.error_pct > 2)
   OR (p.total IS NULL AND r.error_pct > 5)
ORDER BY error_pct_change DESC