# SKOL IV: All the Data

In [5]:
# Install pymupdf4llm (imported further down in this notebook).
!pip install pymupdf4llm
# Maven coordinates of the Apache Bahir Cloudant/CouchDB connector for Spark SQL.
# NOTE(review): the `_2.11` suffix denotes a Scala 2.11 build; the traceback
# recorded later in this notebook references Scala 2.13 collection classes, so
# this artifact may not be loadable by the installed Spark -- confirm.
couchdb_package = 'org.apache.bahir:spark-sql-cloudant_2.11:2.4.0'
# Start pyspark once with stdin redirected from /dev/null, passing the package
# coordinates (presumably to resolve and cache the jars up front -- confirm).
!pyspark --packages $couchdb_package < /dev/null




In [6]:
from getpass import getpass
from io import BytesIO
import json
import os
from pathlib import Path, PurePath
from typing import Any, Dict, Union
from urllib.robotparser import RobotFileParser
from uuid import uuid4

# Be sure to get version 2: https://simple-repository.app.cern.ch/project/bibtexparser/2.0.0b8/description
import bibtexparser
import couchdb
import feedparser
import fitz # PyMuPDF
import pymupdf4llm
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, NullType
import requests

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [8]:
# --- CouchDB connection settings ---------------------------------------------
# Host and user name can be overridden through the environment; the defaults
# are the values this notebook has always used.
couchdb_host = os.environ.get("COUCHDB_HOST", "127.0.0.1:5984") # e.g., "ACCOUNT.cloudant.com" or "localhost"
couchdb_username = os.environ.get("COUCHDB_USERNAME", "admin")
# The password must not live in the notebook: take it from COUCHDB_PASSWORD,
# otherwise prompt for it. (A password that was previously committed here
# should be considered exposed and rotated.)
couchdb_password = os.environ.get("COUCHDB_PASSWORD") or getpass("CouchDB password: ")
ingest_db_name = "skol_dev"

# --- Spark session -----------------------------------------------------------
# The cloudant.* options hand the CouchDB endpoint and credentials to the
# connector named by `couchdb_package` (defined in the install cell).
spark = (
    SparkSession
    .builder
    .appName("CouchDB Spark SQL Example in Python using dataframes")
    .config("cloudant.host", couchdb_host)
    .config("cloudant.username", couchdb_username)
    .config("cloudant.password", couchdb_password)
    .config("spark.jars.packages", couchdb_package)
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("ERROR") # Keeps the noise down!!!

# --- Direct CouchDB handle used by the ingesters ------------------------------
# Credentials are attached to the resource instead of being spliced into the
# URL, so a password containing '@', ':' or '/' cannot corrupt the URL.
couch = couchdb.Server(f'http://{couchdb_host}')
couch.resource.credentials = (couchdb_username, couchdb_password)
# Open the ingest database, creating it on first use.
if ingest_db_name in couch:
    db = couch[ingest_db_name]
else:
    db = couch.create(ingest_db_name)

# User agent name that is checked against robots.txt before any fetch.
user_agent = "synoptickeyof.life"

ingenta_rp = RobotFileParser()
ingenta_rp.set_url("https://www.ingentaconnect.com/robots.txt")
ingenta_rp.read() # Reads and parses the robots.txt file from the URL

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/12 19:16:52 WARN Utils: Your hostname, puchpuchobs, resolves to a loopback address: 127.0.1.1; using 172.16.227.68 instead (on interface wlp130s0f0)
25/11/12 19:16:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/data/piggy/miniconda3/envs/skol/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/piggy/.ivy2.5.2/cache
The jars for the packages stored in: /home/piggy/.ivy2.5.2/jars
org.apache.bahir#spark-sql-cloudant_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-199c8173-65a4-42f2-8836-4780f7db1876;1.0
	confs: [default]
	found org.apache.bahir#spark-sql-cloudant_2.11;2.4.0 in central
	found org.apache.bahir#bahir-common_2.11;2.4.0 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found com.cloudan

## The Data Sources

## Ingestion

Each journal or other data source gets an ingester that puts PDFs into our document store.

### Ingenta RSS ingestion

In [3]:
def ingest_from_bibtex(
        db: couchdb.Database,
        content: Union[bytes, str],
        bibtex_link: str,
        meta: Dict[str, Any],
        rp
        ) -> None:
    """Load documents referenced in an Ingenta BibTeX database.

    For every BibTeX entry a CouchDB document is created that holds the
    entry's fields, ``meta`` and the ``pdf_url``; the PDF itself is stored as
    the ``article.pdf`` attachment of that document.

    Args:
        db: CouchDB database receiving the documents.
        content: BibTeX source. ``bytes`` are decoded as UTF-8 (undecodable
            bytes are replaced) because ``bibtexparser.parse_string`` takes
            text.
        bibtex_link: Where ``content`` came from; used in diagnostics only.
        meta: Stored verbatim under the ``meta`` key of every new document.
        rp: Object with a ``can_fetch(user_agent, url)`` method (the notebook
            passes a ``RobotFileParser``); consulted before each download.

    Raises:
        requests.HTTPError: If a PDF download returns a 4xx/5xx status.
    """
    if isinstance(content, (bytes, bytearray)):
        content = content.decode('utf-8', errors='replace')
    bib_database = bibtexparser.parse_string(content)

    for bib_entry in bib_database.entries:
        if 'url' not in bib_entry.fields_dict:
            # Without a URL there is nothing to download or de-duplicate on.
            print(f"No url field in an entry of {bibtex_link}")
            continue
        pdf_url = f"{bib_entry['url']}?crawler=true"

        # Do not fetch if we already have an entry.
        selector = {'selector': {'pdf_url': pdf_url}, 'limit': 1}
        if any(True for _ in db.find(selector)):
            print(f"Skipping {pdf_url}")
            continue

        if not rp.can_fetch(user_agent, pdf_url):
            print(f"Robot permission denied {pdf_url}")
            continue

        print(f"Adding {pdf_url}")
        # BibTeX fields first, bookkeeping keys last, so a field that happens
        # to be called 'meta' or 'pdf_url' cannot clobber them.
        doc = {k: bib_entry[k] for k in bib_entry.fields_dict.keys()}
        doc['_id'] = uuid4().hex
        doc['meta'] = meta
        doc['pdf_url'] = pdf_url

        # Download BEFORE saving: if the fetch fails no document is written,
        # so the next run retries instead of skipping a record without a PDF.
        # timeout=60 stops a stalled connection from hanging the run forever.
        with requests.get(pdf_url, stream=False, timeout=60) as pdf_f:
            pdf_f.raise_for_status()
            pdf_doc = pdf_f.content

        db.save(doc)
        db.put_attachment(doc, BytesIO(pdf_doc), 'article.pdf', 'application/pdf')

        print("-" * 10)

# # To retrieve the attachment:
# # attachment_data = db.get_attachment(doc_id, os.path.basename(file_path)).read()
# # print(attachment_data.decode('utf-8')) # Decode if it's a text file

In [4]:
def ingest_ingenta(
        db: couchdb.Database,
        rss_url: str,
        rp
) -> None:
    """Ingest documents from an Ingenta RSS feed.

    Each feed entry links to an issue; the issue's BibTeX export
    (``<entry.link>?format=bib``) is downloaded and handed to
    ``ingest_from_bibtex`` together with feed and entry metadata.

    Args:
        db: CouchDB database receiving the documents.
        rss_url: URL of the RSS feed to parse.
        rp: Object with a ``can_fetch(user_agent, url)`` method; consulted
            before each BibTeX download and passed on to
            ``ingest_from_bibtex``.

    Raises:
        requests.HTTPError: If a BibTeX download returns a 4xx/5xx status.
    """

    feed = feedparser.parse(rss_url)

    feed_meta = {
        'url': rss_url,
        'title': feed.feed.title,
        'link': feed.feed.link,
        'description': feed.feed.description,
    }

    for entry in feed.entries:
        entry_meta = {
            'title': entry.title,
            'link': entry.link,
        }
        # Optional fields are copied only when the entry provides them.
        for optional_field in ('summary', 'description'):
            if hasattr(entry, optional_field):
                entry_meta[optional_field] = getattr(entry, optional_field)

        bibtex_link = f'{entry.link}?format=bib'
        print(f"bibtex_link: {bibtex_link}")

        if not rp.can_fetch(user_agent, bibtex_link):
            print(f"Robot permission denied {bibtex_link}")
            continue

        # timeout=60 stops a stalled connection from hanging the run forever.
        with requests.get(bibtex_link, stream=False, timeout=60) as bibtex_f:
            bibtex_f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            raw_bibtex = bibtex_f.content

        # Insert the comma that is missing before the `parent` field, then
        # collapse the export onto a single line.
        repaired_bibtex = raw_bibtex\
            .replace(b"\"\nparent", b"\",\nparent")\
            .replace(b"\n", b"")

        ingest_from_bibtex(
            db=db,
            content=repaired_bibtex,
            bibtex_link=bibtex_link,
            meta={
                'feed': feed_meta,
                'entry': entry_meta,
            },
            rp=rp
        )
        print("=" * 20)

In [5]:
def ingest_from_local_bibtex(
    db: couchdb.Database,
    root: Path,
    rp
) -> None:
    """Ingest from a local directory with Ingenta BibTeX files in it.

    Walks ``root`` recursively and processes every file whose name ends in
    ``format=bib``. The path of the file relative to ``root`` is appended to
    ``https://www.ingentaconnect.com/`` to form the link passed along as
    ``bibtex_link``.

    Args:
        db: CouchDB database receiving the documents.
        root: Top of the directory tree to scan.
        rp: Object with a ``can_fetch(user_agent, url)`` method, passed on to
            ``ingest_from_bibtex``.
    """
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('format=bib'):
                continue
            full_filepath = os.path.join(dirpath, filename)
            # relative_to() drops the root prefix without leaving a leading
            # separator, so the link has exactly one '/' after the host.
            relative_path = Path(full_filepath).relative_to(root).as_posix()
            bibtex_link = f"https://www.ingentaconnect.com/{relative_path}"
            # Read with an explicit encoding and close the file before the
            # (network-bound) ingestion starts.
            with open(full_filepath, encoding='utf-8', errors='replace') as f:
                content = f.read()
            # Insert the comma that is missing before the `parent` field,
            # then collapse the export onto a single line.
            content = content\
                .replace("\"\nparent", "\",\nparent")\
                .replace("\n", "")
            ingest_from_bibtex(db, content, bibtex_link, meta={}, rp=rp)


In [6]:
# Ingest the issues listed in the Ingenta RSS feed for content/mtax/mt.
ingest_ingenta(db=db, rss_url='https://api.ingentaconnect.com/content/mtax/mt?format=rss', rp=ingenta_rp)

bibtex_link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004?format=bib
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00009?crawler=tr

In [7]:
# Ingest the issues listed in the Ingenta RSS feed for content/wfbi/sim.
ingest_ingenta(db=db, rss_url='https://api.ingentaconnect.com/content/wfbi/sim?format=rss', rp=ingenta_rp)

bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001?format=bib
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00005?crawler=true
bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001?format=bib
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00003?crawler

In [8]:
# Ingest the BibTeX files found under the local www.ingentaconnect.com tree.
ingest_from_local_bibtex(
    db=db,
    root=Path("/data/skol/www/www.ingentaconnect.com"),
    rp=ingenta_rp
)

Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00009?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00010?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000003/art00028?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000003/art00029?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00008?cra

Parsing of `@article ` block (line 0) aborted on line 0 due to syntactical error in bibtex:
 Expected a `=` after entry key, but found `"`.
Unknown block type <class 'bibtexparser.model.ParsingFailedBlock'>
Unknown block type <class 'bibtexparser.model.ParsingFailedBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00035?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00024?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00008?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00016?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00017?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00018?crawler=true


Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00009?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00010?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000115/00000001/art00061?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000115/00000001/art00062?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00008?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000001/art00026?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000001/art00027?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00008?cra

Download the RSS

Read bibtex files and create records for each article.

Download the PDFs at the URLs in the bibtex entries.

Create a JSON record with the PDF as an attachment.

### Text extraction

Extract the text, optionally with OCR. Add as an additional attachment on the source record.

In [4]:
# Load the ingest database into a Spark DataFrame through the
# "org.apache.bahir.cloudant" data source.
# NOTE(review): the output recorded for this cell is a DATA_SOURCE_NOT_FOUND
# error (ClassNotFoundException: org.apache.bahir.cloudant.DefaultSource).
# Presumably the connector jar configured via `couchdb_package` is not on the
# classpath or does not match this Spark/Scala version -- confirm.
df = spark.read.load(
    format="org.apache.bahir.cloudant",
    database=ingest_db_name
)

Py4JJavaError: An error occurred while calling o36.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.apache.bahir.cloudant. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:58)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:45)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:139)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:139)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:131)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:112)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:111)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:45)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:43)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:242)
	at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:183)
	at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:179)
	at scala.collection.immutable.List.foldLeft(List.scala:79)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:239)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:231)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:231)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:340)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:234)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:299)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.resolveInFixedPoint(HybridAnalyzer.scala:190)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.$anonfun$apply$1(HybridAnalyzer.scala:76)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.withTrackedAnalyzerBridgeState(HybridAnalyzer.scala:111)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.apply(HybridAnalyzer.scala:71)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:330)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:423)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:330)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$2(QueryExecution.scala:110)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:148)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:278)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:278)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:277)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$1(QueryExecution.scala:110)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1439)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:121)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:80)
	at org.apache.spark.sql.classic.Dataset$.$anonfun$ofRows$1(Dataset.scala:115)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.classic.Dataset$.ofRows(Dataset.scala:113)
	at org.apache.spark.sql.classic.DataFrameReader.load(DataFrameReader.scala:109)
	at org.apache.spark.sql.classic.DataFrameReader.load(DataFrameReader.scala:92)
	at org.apache.spark.sql.classic.DataFrameReader.load(DataFrameReader.scala:58)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1583)
	Suppressed: org.apache.spark.util.Utils$OriginalTryStackTraceException: Full stacktrace of original doTryWithCallerStacktrace caller
		at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
		at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
		at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
		at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:58)
		at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:45)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:139)
		at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:139)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:135)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:131)
		at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:37)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:112)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:111)
		at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:37)
		at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:45)
		at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:43)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:242)
		at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:183)
		at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:179)
		at scala.collection.immutable.List.foldLeft(List.scala:79)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:239)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:231)
		at scala.collection.immutable.List.foreach(List.scala:334)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:231)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:340)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:336)
		at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:234)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:336)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:299)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:201)
		at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:201)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.resolveInFixedPoint(HybridAnalyzer.scala:190)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.$anonfun$apply$1(HybridAnalyzer.scala:76)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.withTrackedAnalyzerBridgeState(HybridAnalyzer.scala:111)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.apply(HybridAnalyzer.scala:71)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:330)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:423)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:330)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$2(QueryExecution.scala:110)
		at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:148)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:278)
		at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:278)
		at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
		at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:277)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$1(QueryExecution.scala:110)
		at scala.util.Try$.apply(Try.scala:217)
		at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
		at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
		at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
		... 21 more
Caused by: java.lang.ClassNotFoundException: org.apache.bahir.cloudant.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:593)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:526)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:665)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:665)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:665)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:58)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$apply$1.applyOrElse(ResolveDataSource.scala:45)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:139)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:139)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:131)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:112)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:111)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:45)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.apply(ResolveDataSource.scala:43)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:242)
	at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:183)
	at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:179)
	at scala.collection.immutable.List.foldLeft(List.scala:79)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:239)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:231)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:231)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:340)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:234)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:299)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.resolveInFixedPoint(HybridAnalyzer.scala:190)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.$anonfun$apply$1(HybridAnalyzer.scala:76)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.withTrackedAnalyzerBridgeState(HybridAnalyzer.scala:111)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.apply(HybridAnalyzer.scala:71)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:330)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:423)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:330)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$2(QueryExecution.scala:110)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:148)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:278)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:278)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:277)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$1(QueryExecution.scala:110)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	... 21 more


In [None]:
# Helpers that extract text from stored PDF attachments.

def pdf_to_text(pdf_contents: bytes) -> bytes:
    """Extract the plain text of a PDF, page by page.

    Every page's text is preceded by a ``--- PDF Page N ---`` marker line
    (N starts at 1).

    Args:
        pdf_contents: Raw bytes of the PDF file.

    Returns:
        The concatenated page texts, encoded as UTF-8.
    """
    # NOTE(review): TEXT_PRESERVE_SPACES is not among PyMuPDF's documented
    # text flags; it is OR-ed in only when the installed build defines it, so
    # a missing constant cannot raise AttributeError -- confirm intent.
    flags = (
        fitz.TEXT_PRESERVE_WHITESPACE
        | getattr(fitz, "TEXT_PRESERVE_SPACES", 0)
        | fitz.TEXT_DEHYPHENATE
    )

    pieces = []
    # Open the in-memory PDF with PyMuPDF itself (pymupdf4llm is not
    # documented to provide a Document class); the context manager closes the
    # document even if extraction fails.
    with fitz.open(stream=pdf_contents, filetype="pdf") as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Possibly perform OCR on the page
            pieces.append(f"\n--- PDF Page {page_num+1} ---\n")
            pieces.append(page.get_text("text", flags=flags))

    return ''.join(pieces).encode("utf-8")

def add_text(row) -> None:
    """Attach extracted text next to every PDF attachment of one record.

    Intended to be passed to ``df.rdd.foreach``, so it is a plain function
    (a Spark SQL ``udf`` wrapper operates on columns and cannot be applied to
    a Row by ``foreach``).

    For each ``*.pdf`` attachment of the CouchDB document ``row['_id']`` that
    has no sibling ``*.txt`` attachment yet, the PDF is converted with
    ``pdf_to_text`` and the result is stored as that ``.txt`` attachment.

    Args:
        row: Record exposing the document id as ``row['_id']``.

    NOTE(review): this uses the notebook-level ``db`` handle. When Spark runs
    the function on worker processes that handle has to be serialisable;
    confirm, or open the CouchDB connection inside the function.
    """
    # Work on the live CouchDB document: it is a mutable mapping carrying
    # `_rev` (which put_attachment reads and updates) and the authoritative
    # attachment list, neither of which an immutable Spark Row can provide.
    doc = db[row['_id']]
    attachments = doc.get('_attachments') or {}

    # Iterate over a snapshot of the names present before any upload.
    for pdf_filename in list(attachments):
        pdf_path = PurePath(pdf_filename)
        if pdf_path.suffix != '.pdf':
            continue
        txt_filename = str(pdf_path.with_suffix('.txt'))
        if txt_filename in attachments:
            # TODO(piggy): Recalculate text if text is terrible. Too much noise vocabulary?
            print(f"Already have text for {doc.get('pdf_url')}")
            continue
        pdf_file = db.get_attachment(doc, pdf_filename)
        if pdf_file is None:
            # The attachment is listed but could not be retrieved; skip it.
            continue
        txt_file = pdf_to_text(pdf_file.read())
        attachment_content_type = 'text/plain; charset=UTF-8'
        attachment_file = BytesIO(txt_file)

        db.put_attachment(doc, attachment_file, txt_filename, attachment_content_type)




In [13]:
# Print the working directory of the notebook's shell.
!pwd

/data/piggy/src/github.com/piggyatbaqaqi/skol/jupyter


# Call add_text once for every row of the DataFrame loaded above.
df.rdd.foreach(add_text)

In [None]:
## Bibliography

* DOI Foundation, "DOI Citation Formatter HTTP API", https://citation.doi.org/api-docs.html, accessed 2025-11-12.

