# SKOL IV: All the Data

## Ingestion

Each journal or other data source gets its own ingester, which fetches that source's PDFs into our document store.

### Ingenta RSS ingestion

In [1]:
from io import BytesIO
import requests

# Be sure to get version 2: https://simple-repository.app.cern.ch/project/bibtexparser/2.0.0b8/description
import bibtexparser
import couchdb
import feedparser
import json
from uuid import uuid4

def ingest_ingenta(rss_url: str) -> None:
    """Ingest the articles listed in an Ingenta RSS feed into CouchDB.

    For every feed entry, the BibTeX export at ``<entry link>?format=bib`` is
    downloaded and parsed. Each BibTeX entry becomes one document in the
    ``skol_dev`` database holding the feed metadata, the BibTeX text, the
    individual BibTeX fields, and the article PDF as an ``article.pdf``
    attachment. Articles whose ``pdf_url`` is already stored are skipped.

    Args:
        rss_url: URL of the RSS feed to ingest.

    Raises:
        requests.HTTPError: If a BibTeX or PDF download returns 4xx/5xx.
    """
    # Seconds to wait for a connection / for the next bytes of a response,
    # so a stalled server cannot hang the whole ingestion run.
    http_timeout = 60

    # NOTE(review): credentials are hard-coded in this URL; consider moving
    # them to configuration or the environment.
    couch = couchdb.Server('http://admin:SU2orange!@127.0.0.1:5984/')
    db_name = 'skol_dev'
    db = couch[db_name] if db_name in couch else couch.create(db_name)

    feed = feedparser.parse(rss_url)

    feed_meta = {
        'url': rss_url,
        'title': feed.feed.title,
        'link': feed.feed.link,
        'description': feed.feed.description,
    }

    for entry in feed.entries:
        # NOTE(review): entry_meta is assembled but never written to the
        # stored document; confirm whether it should be saved.
        entry_meta = {
            'title': entry.title,
            'link': entry.link,
        }
        if hasattr(entry, 'summary'):
            entry_meta['summary'] = entry.summary
        if hasattr(entry, 'description'):
            entry_meta['description'] = entry.description

        bibtex_link = f'{entry.link}?format=bib'
        print(f"bibtex_link: {bibtex_link}")
        with requests.get(bibtex_link, stream=False,
                          timeout=http_timeout) as bibtex_f:
            bibtex_f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            # Insert the comma missing before the "parent" field in the
            # downloaded BibTeX, then drop all newlines before parsing.
            # NOTE(review): parse_string is documented to take str but is
            # given bytes here; confirm non-ASCII fields survive.
            raw_bibtex = bibtex_f.content\
                .replace(b"\"\nparent", b"\",\nparent")\
                .replace(b"\n", b"")
            bib_database = bibtexparser.parse_string(raw_bibtex)
            bibtex_data = {
                'link': bibtex_link,
                'bibtex': bibtexparser.write_string(bib_database),
            }

            for bib_entry in bib_database.entries:
                pdf_url = f"{bib_entry['url']}?crawler=true"

                # Do not fetch if we already have an entry.
                selector = {'selector': {'pdf_url': pdf_url}}
                if list(db.find(selector)):
                    print(f"Skipping {pdf_url}")
                    continue

                print(f"Adding {pdf_url}")

                # Download the PDF before saving anything, so a failed
                # download cannot leave behind a record without its
                # attachment that later runs would skip as already present.
                with requests.get(pdf_url, stream=False,
                                  timeout=http_timeout) as pdf_f:
                    pdf_f.raise_for_status()
                    pdf_doc = pdf_f.content

                doc = {
                    '_id': uuid4().hex,
                    'feed_meta': feed_meta,
                    'bibtex_meta': bibtex_data,
                    'pdf_url': pdf_url,
                }
                # Copy every BibTeX field onto the document as a top-level key.
                for k in bib_entry.fields_dict:
                    doc[k] = bib_entry[k]

                db.save(doc)

                attachment_filename = 'article.pdf'
                attachment_content_type = 'application/pdf'
                attachment_file = BytesIO(pdf_doc)

                db.put_attachment(doc, attachment_file, attachment_filename, attachment_content_type)

                print("-" * 10)

        print("=" * 20)

# # To retrieve the attachment:
# # attachment_data = db.get_attachment(doc_id, os.path.basename(file_path)).read()
# # print(attachment_data.decode('utf-8')) # Decode if it's a text file

# Run the ingester against the Ingenta RSS feed at this URL.
ingest_ingenta(rss_url='https://api.ingentaconnect.com/content/mtax/mt?format=rss')

bibtex_link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004?format=bib
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00001?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00002?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00003?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00004?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00005?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00006?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00007?crawler=true
----------
Adding https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00008?crawler=true
----------
Skipping https://www.ingen

Download the RSS feed.

Read bibtex files and create records for each article.

Download the PDFs at the URLs in the bibtex entries.

Create a JSON record with the PDF as an attachment.

### Text extraction

Extract the text, optionally with OCR. Add it as an additional attachment on the source record.