# SKOL IV: All the Data

## Ingestion

Each journal or other data source gets its own ingester, which fetches that source's PDFs into our document store.

### Ingenta RSS ingestion

In [None]:
from io import BytesIO
import requests

# Be sure to get version 2: https://simple-repository.app.cern.ch/project/bibtexparser/2.0.0b8/description
import bibtexparser
import feedparser

def ingest_ingenta(rss_url: str, *, timeout: float = 30.0) -> None:
    """Print an Ingenta RSS feed's items and the BibTeX entries behind each one.

    The feed's title, link and description are printed first. Then, for every
    entry in the feed, the entry's title, link and summary (or description)
    are printed, the BibTeX export at ``<entry.link>?format=bib`` is
    downloaded and parsed with ``bibtexparser``, and each parsed BibTeX entry
    is printed.

    Args:
        rss_url: URL of the RSS feed; handed to ``feedparser.parse``.
        timeout: Value passed as ``timeout`` to ``requests.get`` for each
            BibTeX download, in seconds. Without it a stalled server would
            block the ingester indefinitely.

    Raises:
        requests.HTTPError: If a BibTeX download answers with a 4xx or 5xx
            status.
        requests.Timeout: If a BibTeX download exceeds ``timeout``.
    """
    feed = feedparser.parse(rss_url)

    print(f"Feed Title: {feed.feed.title}")
    print(f"Feed Link: {feed.feed.link}")
    print(f"Feed Description: {feed.feed.description}")

    for entry in feed.entries:
        print(f"  Title: {entry.title}")
        print(f"  Link: {entry.link}")
        # Print whichever of 'summary' / 'description' the entry carries.
        if hasattr(entry, 'summary'):
            print(f"  Summary: {entry.summary}")
        elif hasattr(entry, 'description'):
            print(f"  Description: {entry.description}")

        bibtex_link = f'{entry.link}?format=bib'
        print(f"bibtex_link: {bibtex_link}")
        with requests.get(bibtex_link, stream=False, timeout=timeout) as response:
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            raw_bibtex = response.content

        # Clean-up applied before parsing:
        #   1. insert a comma between a closing quote and a following
        #      `parent...` field (presumably the export omits that separator
        #      -- TODO confirm against a raw Ingenta response);
        #   2. drop every newline, so each record becomes a single line.
        # The result is decoded once here because `parse_string` takes text,
        # not bytes. UTF-8 is assumed (TODO confirm the encoding Ingenta
        # serves); undecodable bytes are replaced rather than aborting the run.
        bibtex_text = (
            raw_bibtex
            .replace(b"\"\nparent", b"\",\nparent")
            .replace(b"\n", b"")
            .decode("utf-8", errors="replace")
        )
        bib_database = bibtexparser.parse_string(bibtex_text)
        for bib_entry in bib_database.entries:
            print(bib_entry)
            print("-" * 10)

        print("=" * 20)

#     # Connect to your CouchDB server
#     # Replace 'http://localhost:5984/' with your CouchDB server URL if different
#     couch = couchdb.Server('http://localhost:5984/')
    
#     # Specify the database name. Create it if it doesn't exist.
#     db_name = 'my_database'
#     if db_name not in couch:
#         db = couch.create(db_name)
#     else:
#         db = couch[db_name]
    
#     # Define the path to the file you want to attach
#     file_path = 'path/to/your/file.txt'  # Replace with the actual file path
    
#     # Create a new document or load an existing one
#     doc = {'_id': 'my_document_with_file'}
    
#     # If the document already exists, load it to update it
#     if 'my_document_with_file' in db:
#         doc = db['my_document_with_file']

# # Open the file in binary read mode
# with open(file_path, 'rb') as f:
#     # Attach the file to the document
#     # The 'filename' argument specifies the name of the attachment within CouchDB
#     # The 'content_type' should match the file type (e.g., 'text/plain', 'image/jpeg')
#     doc_id, doc_rev = db.put_attachment(doc, f, filename=os.path.basename(file_path), content_type='text/plain')

# print(f"File '{os.path.basename(file_path)}' attached to document '{doc_id}' with revision '{doc_rev}'")

# # To retrieve the attachment:
# # attachment_data = db.get_attachment(doc_id, os.path.basename(file_path)).read()
# # print(attachment_data.decode('utf-8')) # Decode if it's a text file

# Run the ingester against the MYCOTAXON RSS feed on Ingenta Connect.
ingest_ingenta(
    rss_url='https://api.ingentaconnect.com/content/mtax/mt?format=rss',
)

Feed Title: Recent Issues of MYCOTAXON RSS Feed
Feed Link: https://www.ingentaconnect.com/content/mtax/mt
Feed Description: RSS feed of the 10 most recently published issues of MYCOTAXON
  Title: Volume 137, Number 4, November 2023
  Link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004
  Summary: MYCOTAXON, Volume 137, Number 4, November 2023.
 Loaded on 2023-11-22
bibtex_link: https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004?format=bib
Entry (line: 0, type: `article`, key: `:2023:0093-4666:1`):
	`title` = `Volume 137-4: Contents, nomenclatural updates, corrigenda, peers, editorial`
	`journal` = `Mycotaxon`
	`volume` = `137`
	`number` = `4`
	`year` = `2023`
	`itemtype` = `article`
	`issn` = `0093-4666`
	`eissn` = `2154-8889`
	`publication date` = `2023-11-11T00:00:00`
	`pages` = `1-619`
	`url` = `https://www.ingentaconnect.com/content/mtax/mt/2023/00000137/00000004/art00001`
	`parent_itemid` = `infobike://mtax/mt`
	`publishercode` = `mtax`
--

Download the RSS feed.

Read the BibTeX files and create a record for each article.

Download the PDFs at the URLs in the bibtex entries.

Create a JSON record with the PDF as an attachment.

### Text extraction

Extract the text, optionally with OCR. Add as an additional attachment on the source record.