# SKOL IV: All the Data

In [21]:
from io import BytesIO
import json
import os
from pathlib import Path
from typing import Any, Dict, Union
from uuid import uuid4

# Be sure to get version 2: https://simple-repository.app.cern.ch/project/bibtexparser/2.0.0b8/description
import bibtexparser
import couchdb
import feedparser
import requests

## Ingestion

Each journal or other data source gets an ingester that puts PDFs into our document store.

### Ingenta RSS ingestion

In [22]:
# Connect to CouchDB and open (or create) the development database.
#
# SECURITY: prefer setting COUCHDB_URL in the environment over keeping
# credentials in the notebook. The literal fallback below only exists so that
# existing setups keep working; it embeds an admin password in the file and
# should be rotated and removed.
couch = couchdb.Server(
    os.environ.get('COUCHDB_URL', 'http://admin:SU2orange!@127.0.0.1:5984/'))
db_name = 'skol_dev'
if db_name not in couch:
    db = couch.create(db_name)
else:
    db = couch[db_name]


In [16]:
def ingest_from_bibtex(
        db: couchdb.Database,
        content: Union[bytes, str],
        bibtex_link: str,
        meta: Dict[str, Any],
        timeout: float = 60.0,
        ) -> None:
    """Load documents referenced in an Ingenta BibTeX database.

    For every BibTeX entry, a document is stored in ``db`` containing
    ``meta``, the PDF URL (the entry's ``url`` field plus ``?crawler=true``)
    and all of the entry's fields. The PDF itself is downloaded and attached
    to that document as ``article.pdf``. Entries whose PDF URL is already
    present in ``db`` are skipped, as are entries without a ``url`` field.

    Args:
        db: CouchDB database that receives the documents and attachments.
        content: BibTeX source, either as text or as UTF-8 encoded bytes.
        bibtex_link: URL the BibTeX came from. Accepted for interface
            compatibility; it is not currently stored on the documents.
        meta: Arbitrary metadata stored under the ``meta`` key of each
            created document.
        timeout: Seconds to wait for the PDF server before giving up.

    Raises:
        requests.HTTPError: If a PDF download returns a 4xx/5xx status.
        requests.Timeout: If the PDF server does not respond in time.
    """
    # parse_string is documented to take text, so decode raw bytes first.
    # Undecodable bytes are replaced rather than aborting the whole batch.
    if isinstance(content, (bytes, bytearray)):
        content = content.decode('utf-8', errors='replace')
    bib_database = bibtexparser.parse_string(content)

    for bib_entry in bib_database.entries:
        if 'url' not in bib_entry.fields_dict:
            # Without a URL there is nothing to download or de-duplicate on.
            print("Skipping entry without a url field")
            continue

        pdf_url = f"{bib_entry['url']}?crawler=true"

        # Do not fetch if we already have an entry; one match is enough.
        query = {'selector': {'pdf_url': pdf_url}, 'limit': 1}
        if any(True for _ in db.find(query)):
            print(f"Skipping {pdf_url}")
            continue

        print(f"Adding {pdf_url}")

        # Download before writing anything: if the request fails or is
        # interrupted, no attachment-less document is left behind that a
        # later run would treat as "already ingested".
        with requests.get(pdf_url, stream=False, timeout=timeout) as response:
            response.raise_for_status()
            pdf_bytes = response.content

        doc = {
            '_id': uuid4().hex,
            'meta': meta,
            'pdf_url': pdf_url,
        }
        for field_name in bib_entry.fields_dict:
            doc[field_name] = bib_entry[field_name]

        # The same dict is handed to put_attachment after save(), which is
        # how the attachment gets tied to the freshly stored revision.
        db.save(doc)
        db.put_attachment(
            doc, BytesIO(pdf_bytes), 'article.pdf', 'application/pdf')

        print("-" * 10)

# # To retrieve the attachment later (doc_id is the stored document's '_id'):
# # attachment_data = db.get_attachment(doc_id, 'article.pdf').read()
# # attachment_data holds the raw PDF bytes; only decode attachments that are text files.

In [3]:
def ingest_ingenta(
        db: couchdb.Database,
        rss_url: str,
        timeout: float = 60.0) -> None:
    """Ingest documents from an Ingenta RSS feed.

    For each feed entry, the entry link with ``?format=bib`` appended is
    fetched as BibTeX and handed to ``ingest_from_bibtex`` together with
    metadata describing the feed and the entry.

    Args:
        db: CouchDB database that receives the ingested documents.
        rss_url: URL of the Ingenta RSS feed.
        timeout: Seconds to wait for the BibTeX server before giving up.

    Raises:
        requests.HTTPError: If a BibTeX download returns a 4xx/5xx status.
        requests.Timeout: If the BibTeX server does not respond in time.
    """

    feed = feedparser.parse(rss_url)

    feed_meta = {
        'url': rss_url,
        'title': feed.feed.title,
        'link': feed.feed.link,
        'description': feed.feed.description,
    }

    for entry in feed.entries:
        entry_meta = {
            'title': entry.title,
            'link': entry.link,
        }
        # These two attributes are optional; copy them only when present.
        for optional_key in ('summary', 'description'):
            if hasattr(entry, optional_key):
                entry_meta[optional_key] = getattr(entry, optional_key)

        bibtex_link = f'{entry.link}?format=bib'
        print(f"bibtex_link: {bibtex_link}")

        with requests.get(bibtex_link, stream=False, timeout=timeout) as bibtex_f:
            bibtex_f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            # Hand the parser text, not raw bytes, matching what the
            # local-file ingester passes. Assumes the export is UTF-8;
            # undecodable bytes are replaced instead of aborting.
            bibtex_text = bibtex_f.content.decode('utf-8', errors='replace')

        # Same clean-up as the local-file ingester: add a comma after a
        # closing quote that is directly followed by a line starting with
        # "parent", then drop every newline. Presumably this works around
        # malformed Ingenta exports -- TODO confirm.
        bibtex_text = bibtex_text.replace("\"\nparent", "\",\nparent")
        bibtex_text = bibtex_text.replace("\n", "")

        # The response is already closed here, so no connection is held
        # open while the (slow) PDF downloads run.
        ingest_from_bibtex(
            db=db,
            content=bibtex_text,
            bibtex_link=bibtex_link,
            meta={
                'feed': feed_meta,
                'entry': entry_meta,
            })
        print("=" * 20)

In [26]:
def ingest_from_local_bibtex(
    db: couchdb.Database,
    root: Path = Path("/data/skol/www/www.ingentaconnect.com")) -> None:
    """Ingest from a local directory with Ingenta BibTeX files in it.

    Walks ``root`` recursively, picks every file whose name ends in
    ``format=bib``, rebuilds the ingentaconnect.com URL from the file's path
    relative to ``root``, and passes the cleaned-up BibTeX text to
    ``ingest_from_bibtex`` with empty metadata.

    Args:
        db: CouchDB database that receives the ingested documents.
        root: Directory to scan; paths below it are mapped onto
            ``https://www.ingentaconnect.com/``.
    """
    root = Path(root)
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('format=bib'):
                continue
            full_filepath = Path(dirpath) / filename

            # relative_to() has no leading slash, so the URL gets exactly one
            # "/" after the host instead of "//".
            relative_path = full_filepath.relative_to(root).as_posix()
            bibtex_link = f"https://www.ingentaconnect.com/{relative_path}"

            # Read with an explicit encoding so the result does not depend on
            # the machine's locale (assumes the files are UTF-8), and close
            # the file before the slow network ingestion starts.
            with open(full_filepath, encoding='utf-8') as f:
                raw_bibtex = f.read()

            # Add a comma after a closing quote that is directly followed by
            # a line starting with "parent", then drop every newline.
            # Presumably a workaround for malformed exports -- TODO confirm.
            content = raw_bibtex\
                .replace("\"\nparent", "\",\nparent")\
                .replace("\n", "")
            ingest_from_bibtex(db, content, bibtex_link, meta={})


In [17]:
# Ingest every article linked from this Ingenta RSS feed.
ingest_ingenta(db=db, rss_url='https://api.ingentaconnect.com/content/mtax/mt?format=rss')

bibtex_link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004?format=bib
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00001?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00002?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00003?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00004?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00005?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00006?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00007?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00008?crawler=true
----------
Adding https://www.ingenta

In [18]:
# Ingest every article linked from this Ingenta RSS feed.
ingest_ingenta(
    db=db,
    rss_url='https://api.ingentaconnect.com/content/wfbi/sim?format=rss',
)

bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001?format=bib
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00002?crawler=true
----------
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00001?crawler=true
----------
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00003?crawler=true
----------
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00004?crawler=true
----------
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000111/00000001/art00005?crawler=true
----------
bibtex_link: https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001?format=bib
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00001?crawler=true
----------
Adding https://www.ingentaconnect.com/content/wfbi/sim/2025/00000110/00000001/art00002?crawler=true
----------
Adding https://www.ingentaconnect

KeyboardInterrupt: 

In [27]:
# Ingest every BibTeX file found under the local Ingenta directory.
ingest_from_local_bibtex(db=db, root=Path("/data/skol/www/www.ingentaconnect.com"))

Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00001?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00002?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00003?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00004?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00005?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00006?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2013/00000125/00000001/art00007?crawler=true


KeyboardInterrupt: 

Download the RSS feed.

Read bibtex files and create records for each article.

Download the PDFs at the URLs in the bibtex entries.

Create a JSON record with the PDF as an attachment.

### Text extraction

Extract the text, optionally with OCR. Add it as an additional attachment on the source record.