# Pubmed Data Ingestion

This notebook handles the ingestion of raw data from the gzipped XML update file(s) (`.xml.gz`) on the PubMed FTP server.

In [None]:
import dlt
import gzip
import tempfile
from ftplib import FTP
import logging
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Setup Python logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# FTP file functions
def pubmed_ftp_client():
    """Open an FTP session positioned in the PubMed update-files directory.

    Connects to ``ftp.ncbi.nlm.nih.gov``, logs in with ``ftplib``'s default
    credentials (no user or password is supplied) and changes into
    ``/pubmed/updatefiles/``.

    Returns:
        The connected ``ftplib.FTP`` client. The caller owns the connection
        and is responsible for closing it.

    Raises:
        Any exception raised by ``ftplib`` while connecting, logging in or
        changing directory. If the failure happens after the connection was
        established, the connection is closed before the exception
        propagates.
    """
    ftp = FTP('ftp.ncbi.nlm.nih.gov')
    try:
        ftp.login()
        ftp.cwd('/pubmed/updatefiles/')
    except BaseException:
        # Setup failed part-way through: release the socket instead of
        # leaking it, then let the original error surface unchanged.
        ftp.close()
        raise
    return ftp

def retrieve_file(ftp_client, filename):
    """Download ``filename`` from the FTP server into a new temporary file.

    Args:
        ftp_client: Connected FTP client exposing ``retrbinary``; the file is
            requested with ``RETR`` relative to the client's current
            directory.
        filename: Name of the remote file to retrieve.

    Returns:
        Path of the local temporary file holding the downloaded bytes. The
        file is not deleted automatically; the caller must remove it when
        done.
    """
    # NamedTemporaryFile returns an already-open file object, so the OS-level
    # descriptor created for the temp file is closed by the ``with`` block.
    # (``tempfile.mkstemp()[1]`` discarded that open descriptor and leaked
    # one per download.) ``delete=False`` keeps the file on disk so the
    # caller can read it after this function returns.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
        local_filename = f.name
        ftp_client.retrbinary(f'RETR {filename}', f.write)

    logger.info(f'Retrieved {filename} as {local_filename}')
    return local_filename

In [None]:
def get_xml_content(xml_gz_filename):
    """Return the decompressed contents of a gzip file as text.

    Args:
        xml_gz_filename: Path of the gzip-compressed file to read.

    Returns:
        The full decompressed payload decoded as UTF-8. The whole file is
        read into memory at once.

    Raises:
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8.
    """
    with gzip.open(xml_gz_filename, "rb") as compressed:
        raw_bytes = compressed.read()
    # Binary read + explicit decode: no newline translation is applied.
    return raw_bytes.decode("utf-8")

# Function to fetch raw Pubmed data

In [None]:
def fetch_raw_pubmed_data(max_files=10):
    """Yield ``(filename, xml_content)`` pairs for PubMed update files.

    Lists the FTP directory opened by ``pubmed_ftp_client``, keeps the names
    ending in ``.xml.gz``, sorts them, and downloads and decompresses the
    first ``max_files`` of them one at a time.

    Args:
        max_files: Maximum number of files to fetch, taken from the start of
            the sorted listing. Defaults to 10; ``None`` removes the limit.

    Yields:
        Tuples of ``(remote_filename, xml_content)`` where ``xml_content`` is
        the decompressed file as a string.
    """
    import os  # local import: only needed here for temp-file cleanup

    # ftplib.FTP is a context manager; leaving the block closes the
    # connection even if a download fails or the consumer stops early.
    with pubmed_ftp_client() as ftp:
        remote_filenames = sorted(
            f for f in ftp.nlst() if f.endswith('.xml.gz')
        )

        for fname in remote_filenames[:max_files]:
            local_fname = retrieve_file(ftp, fname)
            try:
                xml_content = get_xml_content(local_fname)
            finally:
                # The content is now in memory (or reading failed); either
                # way the downloaded temp file is no longer needed.
                os.remove(local_fname)
            yield fname, xml_content

# Define table for raw Pubmed data

In [None]:
@dlt.table(
    comment="Raw Pubmed works data",
    table_properties={"quality": "bronze"}
)
def pubmed_ingestion():
    """Define the bronze table of raw PubMed XML files.

    Returns:
        A DataFrame with one row per fetched file and the columns
        ``pubmed_fname`` (remote file name), ``xml_content`` (decompressed
        XML text) and ``ingestion_timestamp`` (``current_timestamp()``).
    """
    raw_data_schema = StructType([
        StructField("pubmed_fname", StringType(), True),
        StructField("xml_content", StringType(), True)
    ])

    # Materialise the fetched rows once on the driver so the row count can be
    # logged with len() instead of df.count(): count() is a Spark action and
    # should not be run inside a DLT dataset definition.
    rows = list(fetch_raw_pubmed_data())

    df = spark.createDataFrame(rows, schema=raw_data_schema) \
           .withColumn("ingestion_timestamp", current_timestamp())

    logger.info(f"Raw data ingestion complete. Total rows: {len(rows)}")
    return df
    
    
    
    

