In [None]:
%pip install xmltodict

# Pubmed Data Ingestion

This notebook handles the ingestion of raw data from the Pubmed FTP server csv file(s)

In [8]:
import dlt
import xmltodict
import json
import gzip
import tempfile
from ftplib import FTP
import logging
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Setup Python logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [9]:
# FTP file functions
def pubmed_ftp_client():
    ftp = FTP('ftp.ncbi.nlm.nih.gov')
    ftp.login()
    ftp.cwd('/pubmed/updatefiles/')
    return ftp

def retrieve_file(ftp_client, filename):
    local_filename = tempfile.mkstemp()[1]

    with open(local_filename, 'wb') as f:
        ftp_client.retrbinary(f'RETR {filename}', f.write)

    logger.info(f'Retrieved {filename} as {local_filename}')
    return local_filename

In [10]:
def get_xml_content(xml_gz_filename):
    with gzip.open(xml_gz_filename, "rb") as xml_file:
        return xml_file.read()


# Define table for raw Pubmed data (converted to JSON)

In [11]:
@dlt.table(
    comment="PubMed data converted to JSON",
    table_properties={"quality": "bronze"}
)
def pubmed_ingestion():
    def get_pubmed_json():
        ftp = pubmed_ftp_client()
        remote_filenames = sorted([f for f in ftp.nlst() if f.endswith('.xml.gz')])
        ftp.quit()
        
        for fname in remote_filenames[:10]:
            ftp = pubmed_ftp_client() # Limiting to 10 files for testing
            local_fname = retrieve_file(ftp, fname)
            ftp.quit()
            xml_content = get_xml_content(local_fname)
            xml_dict = xmltodict.parse(xml_content)
            for item in xml_dict:
                # print(fname, json.dumps(item))
                yield fname, json.dumps(item)

    schema = StructType([
        StructField("pubmed_fname", StringType(), True),
        StructField("json_content", StringType(), True)
    ])

    df = spark.createDataFrame(get_pubmed_json(), schema=schema)

    df_with_timestamp = df.withColumn("ingestion_timestamp", current_timestamp())

    logger.info(f"PubMed data ingestion complete. Total files processed: {df_with_timestamp.count()}")
    return df_with_timestamp
    


In [12]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = pubmed_ingestion()

INFO:__main__:Retrieved pubmed24n1220.xml.gz as /var/folders/vg/7t02brlx24d0z7d3djc62lv00000gn/T/tmph13uljqm


pubmed24n1220.xml.gz {
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz P
pubmed24n1220.xml.gz u
pubmed24n1220.xml.gz b
pubmed24n1220.xml.gz m
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz d
pubmed24n1220.xml.gz A
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz i
pubmed24n1220.xml.gz c
pubmed24n1220.xml.gz l
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz S
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz {
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz P
pubmed24n1220.xml.gz u
pubmed24n1220.xml.gz b
pubmed24n1220.xml.gz m
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz d
pubmed24n1220.xml.gz A
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz i
pubmed24n1220.xml.gz c
pubmed24n1220.xml.gz l
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz [
pubmed24n1220.xml.gz {
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz M
pubmed24n12

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



o
pubmed24n1220.xml.gz m
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz g
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz p
pubmed24n1220.xml.gz h
pubmed24n1220.xml.gz y
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz v
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz l
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz d
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz w
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz m
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz j
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz c
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz m
pubmed24n1220.xml.gz p
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz n
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz n
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz s
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz w
pubmed24n1220.xml.gz h
pubmed24n

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



:
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz {
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz @
pubmed24n1220.xml.gz U
pubmed24n1220.xml.gz I
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz D
pubmed24n1220.xml.gz 0
pubmed24n1220.xml.gz 0
pubmed24n1220.xml.gz 4
pubmed24n1220.xml.gz 0
pubmed24n1220.xml.gz 4
pubmed24n1220.xml.gz 0
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz ,
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz @
pubmed24n1220.xml.gz M
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz j
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz T
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz p
pubmed24n1220.xml.gz i
pubmed24n1220.xml.gz c
pubmed24n1220.xml.gz Y
pubmed24n1220.xml.gz N
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz N
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz ,
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




pubmed24n1220.xml.gz d
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz {
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz Y
pubmed24n1220.xml.gz e
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz r
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz 1
pubmed24n1220.xml.gz 9
pubmed24n1220.xml.gz 7
pubmed24n1220.xml.gz 9
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz ,
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz M
pubmed24n1220.xml.gz o
pubmed24n1220.xml.gz n
pubmed24n1220.xml.gz t
pubmed24n1220.xml.gz h
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz 0
pubmed24n1220.xml.gz 5
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz ,
pubmed24n1220.xml.gz  
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz D
pubmed24n1220.xml.gz a
pubmed24n1220.xml.gz y
pubmed24n1220.xml.gz "
pubmed24n1220.xml.gz :
pubmed24n1220.xml.gz  
pubmed24n1

KeyboardInterrupt: 