In [None]:
%pip install xmltodict

# Pubmed Data Ingestion

This notebook ingests raw data from the PubMed FTP server: it downloads the gzipped XML update files (`.xml.gz`) and converts them to JSON.

In [1]:
import gzip
import json
import logging
import os
import tempfile
from ftplib import FTP

import dlt
import xmltodict
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Set up the Python logger: configure logging at INFO level and create the
# module-level `logger` that the functions in this notebook write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [2]:
# FTP file functions
def pubmed_ftp_client(timeout=60):
    """Open an anonymous FTP session in the PubMed update-files directory.

    Args:
        timeout: Seconds to wait on any blocking socket operation before
            giving up. Prevents a stalled server from hanging the pipeline
            indefinitely.

    Returns:
        A logged-in ``ftplib.FTP`` client whose working directory is
        ``/pubmed/updatefiles/``. The caller owns the connection and must
        close it (``quit()``, ``close()`` or a ``with`` block).

    Raises:
        Any error raised while connecting, logging in or changing directory
        is propagated after the connection has been closed.
    """
    ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=timeout)
    try:
        ftp.login()  # no credentials -> anonymous login
        ftp.cwd('/pubmed/updatefiles/')
    except Exception:
        # Do not leak the control connection when login or cwd fails.
        ftp.close()
        raise
    return ftp

def retrieve_file(ftp_client, filename):
    """Download ``filename`` from the FTP server into a new temporary file.

    Args:
        ftp_client: A connected FTP client exposing ``retrbinary``; the file
            is fetched relative to the client's current directory.
        filename: Name of the remote file to retrieve.

    Returns:
        Path of the local temporary file holding the downloaded bytes. The
        caller is responsible for deleting it when done.

    Raises:
        Any error raised by the transfer is propagated; the partially
        written temporary file is removed first.
    """
    # mkstemp returns an already-open descriptor; wrap it instead of
    # discarding it so it is closed rather than leaked on every download.
    fd, local_filename = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as f:
            ftp_client.retrbinary(f'RETR {filename}', f.write)
    except BaseException:
        # Best-effort cleanup of the partial download; never mask the
        # original error with a cleanup failure.
        try:
            os.remove(local_filename)
        except OSError:
            pass
        raise

    logger.info(f'Retrieved {filename} as {local_filename}')
    return local_filename

In [3]:
def get_xml_content(xml_gz_filename):
    """Return the decompressed bytes of the gzip file at ``xml_gz_filename``."""
    gz_stream = gzip.open(xml_gz_filename, "rb")
    try:
        payload = gz_stream.read()
    finally:
        # Always release the file handle, even if decompression fails.
        gz_stream.close()
    return payload
    
def xml_to_json(xml_content):
    """Parse ``xml_content`` with xmltodict and serialize the result as a JSON string."""
    return json.dumps(xmltodict.parse(xml_content))


# Define table for raw Pubmed data (converted to JSON)

In [None]:
@dlt.table(
    comment="PubMed data converted to JSON",
    table_properties={"quality": "bronze"}
)
def pubmed_ingestion():
    """Build the bronze table of PubMed update files converted to JSON.

    Lists the ``.xml.gz`` files in the PubMed FTP directory, downloads the
    first ``max_files`` of them (sorted by name), converts each file's XML
    to a JSON string and returns one row per file.

    Returns:
        A Spark DataFrame with columns ``pubmed_fname`` (remote file name),
        ``json_content`` (the whole file as a JSON string) and
        ``ingestion_timestamp``.
    """
    max_files = 10  # Limiting to 10 files for testing

    def get_pubmed_json():
        # `with` guarantees the FTP connection is closed even on errors.
        with pubmed_ftp_client() as ftp:
            remote_filenames = sorted(
                f for f in ftp.nlst() if f.endswith('.xml.gz')
            )

        for fname in remote_filenames[:max_files]:
            # A fresh connection per file, closed as soon as the download ends.
            with pubmed_ftp_client() as ftp:
                local_fname = retrieve_file(ftp, fname)
            try:
                json_content = xml_to_json(get_xml_content(local_fname))
            finally:
                # The temporary download is no longer needed once parsed.
                os.remove(local_fname)
            # json_content is a single string: yield it whole. Iterating over
            # it would emit one row per character.
            yield fname, json_content

    schema = StructType([
        StructField("pubmed_fname", StringType(), True),
        StructField("json_content", StringType(), True)
    ])

    # Materialize once so the file count is known without running a Spark
    # action (count()) inside the table definition.
    rows = list(get_pubmed_json())
    df = spark.createDataFrame(rows, schema=schema)
    df_with_timestamp = df.withColumn("ingestion_timestamp", current_timestamp())

    logger.info(f"PubMed data ingestion complete. Total files processed: {len(rows)}")
    return df_with_timestamp
    
    
    
    

