# SKOL IV: All the Data

In [3]:
!pip install pymupdf4llm
bahir_package = 'org.apache.bahir:spark-sql-cloudant_2.12:2.4.0'
!spark-shell --packages $bahir_package < /dev/null

25/11/12 20:35:39 WARN Utils: Your hostname, puchpuchobs resolves to a loopback address: 127.0.1.1; using 172.16.227.68 instead (on interface wlp130s0f0)
25/11/12 20:35:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/data/piggy/miniconda3/envs/skol/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/piggy/.ivy2/cache
The jars for the packages stored in: /home/piggy/.ivy2/jars
org.apache.bahir#spark-sql-cloudant_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-11774ef8-b398-46f4-90df-24867ef605f0;1.0
	confs: [default]
	found org.apache.bahir#spark-sql-cloudant_2.12;2.4.0 in central
	found org.apache.bahir#bahir-common_2.12;2.4.0 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found com.cloudant#cloudant-client;2.17.0 in central
	found com.google.code.gson#gson;2.8.2 in central
	fou

In [4]:
from io import BytesIO
import json
import os
from pathlib import Path, PurePath
import requests
from typing import Any, Dict
from urllib.robotparser import RobotFileParser

# Be sure to get version 2: https://simple-repository.app.cern.ch/project/bibtexparser/2.0.0b8/description
import bibtexparser
import couchdb
import feedparser
import fitz # PyMuPDF
import pymupdf4llm
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, NullType
from uuid import uuid4

In [6]:
couchdb_host = "127.0.0.1:5984" # e.g., "ACCOUNT.cloudant.com" or "localhost"
couchdb_username = "admin"
couchdb_password = "SU2orange!"
ingest_db_name = "skol_dev"

spark = SparkSession \
    .builder \
    .appName("CouchDB Spark SQL Example in Python using dataframes") \
    .config("cloudant.protocol", "http") \
    .config("cloudant.host", couchdb_host) \
    .config("cloudant.username", couchdb_username) \
    .config("cloudant.password", couchdb_password) \
    .config("spark.jars.packages", bahir_package) \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR") # Keeps the noise down!!!

couch = couchdb.Server(f'http://{couchdb_username}:{couchdb_password}@{couchdb_host}')
if ingest_db_name not in couch:
    db = couch.create(ingest_db_name)
else:
    db = couch[ingest_db_name]

user_agent = "synoptickeyof.life"

ingenta_rp = RobotFileParser()
ingenta_rp.set_url("https://www.ingentaconnect.com/robots.txt")
ingenta_rp.read() # Reads and parses the robots.txt file from the URL

## The Data Sources

## Ingestion

Each journal or other data source gets an ingester that puts PDFs into our document store.

### Ingenta RSS ingestion

In [4]:
def ingest_from_bibtex(
        db: couchdb.Database,
        content: bytes,
        bibtex_link: str,
        meta: Dict[str, Any],
        rp
        ) -> None:
    """Load documents referenced in an Ingenta BibTeX database."""
    bib_database = bibtexparser.parse_string(content)

    bibtex_data = {
        'link': bibtex_link,
        'bibtex': bibtexparser.write_string(bib_database),
    }
    
    for bib_entry in bib_database.entries:
        doc = {
            '_id': uuid4().hex,
            'meta': meta,
            'pdf_url': f"{bib_entry['url']}?crawler=true",
        }

        # Do not fetch if we already have an entry.
        selector = {'selector': {'pdf_url': doc['pdf_url']}}
        found = False
        for e in db.find(selector):
            found = True
        if found:
            print(f"Skipping {doc['pdf_url']}")
            continue

        if not rp.can_fetch(user_agent, doc['pdf_url']):
            print(f"Robot permission denied {doc['pdf_url']}")
            continue

        print(f"Adding {doc['pdf_url']}")
        for k in bib_entry.fields_dict.keys():
            doc[k] = bib_entry[k]
        
        doc_id, doc_rev = db.save(doc)
        with requests.get(doc['pdf_url'], stream=False) as pdf_f:
            pdf_f.raise_for_status()
            pdf_doc = pdf_f.content
        
        attachment_filename = 'article.pdf'
        attachment_content_type = 'application/pdf'
        attachment_file = BytesIO(pdf_doc)

        db.put_attachment(doc, attachment_file, attachment_filename, attachment_content_type)

        print("-" * 10)

# # To retrieve the attachment:
# # attachment_data = db.get_attachment(doc_id, os.path.basename(file_path)).read()
# # print(attachment_data.decode('utf-8')) # Decode if it's a text file

In [5]:
def ingest_ingenta(
        db: couchdb.Database,
        rss_url: str,
        rp
) -> None:
    """Ingest documents from an Ingenta RSS feed."""

    feed = feedparser.parse(rss_url)
    
    feed_meta = {
        'url': rss_url,
        'title': feed.feed.title,
        'link': feed.feed.link,
        'description': feed.feed.description,
    }

    for entry in feed.entries:
        entry_meta = {
            'title': entry.title,
            'link': entry.link,
        }
        if hasattr(entry, 'summary'):
            entry_meta['summary'] = entry.summary
        if hasattr(entry, 'description'):
            entry_meta['description'] = entry.description

        bibtex_link = f'{entry.link}?format=bib'
        print(f"bibtex_link: {bibtex_link}")

        if not rp.can_fetch(user_agent, bibtex_link):
            print(f"Robot permission denied {bibtex_link}")
            continue

        with requests.get(bibtex_link, stream=False) as bibtex_f:
            bibtex_f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            ingest_from_bibtex(
                db=db,
                content=bibtex_f.content\
                    .replace(b"\"\nparent", b"\",\nparent")\
                    .replace(b"\n", b""),
                bibtex_link=bibtex_link,
                meta={
                    'feed': feed_meta,
                    'entry': entry_meta,
                },
                rp=rp
            )
        print("=" * 20)

In [6]:
def ingest_from_local_bibtex(
    db: couchdb.Database,
    root: Path,
    rp
) -> None:
    """Ingest from a local directory with Ingenta bibtext files in it."""
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('format=bib'):
                continue
            full_filepath = os.path.join(dirpath, filename)
            bibtex_link = f"https://www.ingentaconnect.com/{full_filepath[len(str(root)):]}"
            with open(full_filepath) as f:
                content = f.read()\
                    .replace("\"\nparent", "\",\nparent")\
                    .replace("\n", "")
                ingest_from_bibtex(db, content, bibtex_link, meta={}, rp=rp)


In [6]:
# Mycotaxon
ingest_ingenta(db=db, rss_url='https://api.ingentaconnect.com/content/mtax/mt?format=rss', rp=ingenta_rp)

bibtex_link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004?format=bib
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00009?crawler=tr

In [7]:
# Studies in Mycology
ingest_ingenta(db=db, rss_url='https://api.ingentaconnect.com/content/wfbi/sim?format=rss', rp=ingenta_rp)

bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001?format=bib
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00005?crawler=true
bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001?format=bib
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00003?crawler

In [8]:
ingest_from_local_bibtex(
    db=db,
    root=Path("/data/skol/www/www.ingentaconnect.com"),
    rp=ingenta_rpContent-Type: text/html; charset=UTF-8

)

Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00009?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00010?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000003/art00028?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000003/art00029?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2017/00000132/00000002/art00008?cra

Parsing of `@article ` block (line 0) aborted on line 0 due to syntactical error in bibtex:
 Expected a `=` after entry key, but found `"`.
Unknown block type <class 'bibtexparser.model.ParsingFailedBlock'>
Unknown block type <class 'bibtexparser.model.ParsingFailedBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00035?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00024?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2009/00000109/00000001/art00008?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00016?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00017?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2020/00000134/00000004/art00018?crawler=true


Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00008?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00009?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00010?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000115/00000001/art00061?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2011/00000115/00000001/art00062?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000132/00000004/art00008?cra

Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>
Unknown block type <class 'bibtexparser.model.DuplicateBlockKeyBlock'>


Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000001/art00026?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000001/art00027?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00001?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00002?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00003?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00004?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00005?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00006?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00007?crawler=true
Skipping https://www.ingentaconnect.com/content/mtax/mt/2018/00000133/00000003/art00008?cra

Download the RSS

Read bibtex files and create records for each article.

Download the PDFs at the URLs in the bibtex entries.

Create a JSON record with the PDF as an attachment.

### Text extraction

Extract the text, optionally with OCR. Add as an additional attachment on the source record.

In [7]:
df = spark.read.load(
    format="org.apache.bahir.cloudant",
    database=ingest_db_name
)

                                                                                

In [10]:
df.describe()

DataFrame[summary: string, _id: string, _rev: string, abstract: string, author: string, doi: string, eissn: string, issn: string, itemtype: string, journal: string, number: string, pages: string, parent_itemid: string, pdf_url: string, publication date: string, publishercode: string, title: string, url: string, volume: string, year: string]

In [16]:
df

Column<'_attachments'>

In [97]:
# Content-Type: text/html; charset=UTF-8

def pdf_to_text(pdf_contents: bytes) -> bytes:
    doc = fitz.open(stream=BytesIO(pdf_contents), filetype="pdf")

    full_text = ''
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        # Possibly perform OCR on the page
        text = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_DEHYPHENATE)
        full_text += f"\n--- PDF Page {page_num+1} ---\n"
        full_text += text

    return full_text.encode("utf-8")

def add_text_to_partition(iterator) -> None:
    couch = couchdb.Server(f'http://{couchdb_username}:{couchdb_password}@{couchdb_host}')
    local_db = couch[ingest_db_name]
    for row in iterator:
        row_dict = row.asDict()
        attachment_dict = row._attachments.asDict()
        for pdf_filename in attachment_dict:
            pdf_path = PurePath(pdf_filename)
            if pdf_path.suffix != '.pdf':
                continue
            pdf_path = PurePath(pdf_filename)
            txt_path_str = pdf_path.stem + '.txt'
            print(f"DEBUG: txt_path_str: {txt_path_str} ad: {attachment_dict.keys()}")
            if txt_path_str in attachment_dict:
                # TODO(piggy): Recalculate text if text is terrible. Too much noise vocabulary?
                print(f"Already have text for {row.pdf_url}")
                continue
            print(f"{row._id}, {row.pdf_url}")
            pdf_file = local_db.get_attachment(row._id, str(pdf_path)).read()
            txt_file = pdf_to_text(pdf_file)
            attachment_content_type = 'text/simple; charset=UTF-8'
            attachment_file = BytesIO(txt_file)
            local_db.put_attachment(row_dict, attachment_file, txt_path_str, attachment_content_type)
    


In [98]:
df.select("*").limit(2).foreachPartition(add_text_to_partition)

DEBUG: txt_path_str: article.txt ad: dict_keys(['article.pdf'])    (6 + 4) / 10]
0020c88329ed456a95a18e0c219269f4, https://www.ingentaconnect.com/content/mtax/mt/2010/00000111/00000001/art00033?crawler=true
DEBUG: txt_path_str: article.txt ad: dict_keys(['article.pdf'])
00320c27b4dc456d8b1350c627d1d5cf, https://www.ingentaconnect.com/content/mtax/mt/2011/00000116/00000001/art00038?crawler=true
                                                                                

In [13]:
!pwd

/data/piggy/src/github.com/piggyatbaqaqi/skol/jupyter


df.rdd.foreach(add_text)

In [None]:
## Bibliography

* doi Foundation, "DOI Citation Formatter HTTP API", https://citation.doi.org/api-docs.html, accessed 2025-11-12.

