In [1]:
# Process Phase - Logging Setup
import logging
import sys

# Name of the log file that receives all notebook output.
LOG_FILE = "dataset-exploration.log"

# Configure the root logger: INFO and above go to LOG_FILE, which is opened
# in "w" mode so that previous logs are overwritten on each run.
logging.basicConfig(
    filename=LOG_FILE,
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Handle on the root logger, used below to redirect stdout/stderr.
logger = logging.getLogger()

# Redirect print statements to the logger
class LoggerWriter:
    """Minimal file-like object that forwards written text to a logging callable.

    Intended as a replacement for ``sys.stdout`` / ``sys.stderr``: every
    non-blank chunk passed to ``write`` is stripped of surrounding whitespace
    and handed to ``level`` (e.g. ``logger.info``).
    """

    def __init__(self, level):
        """Store the callable (such as ``logger.info``) that receives each message."""
        self.level = level

    def write(self, message):
        """Forward ``message`` to the logging callable and return its length.

        Whitespace-only chunks (such as the bare newline ``print`` emits after
        its text) are dropped so they do not create empty log records.
        Returns the number of characters received, as text streams are
        expected to do.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """No-op: nothing is buffered here; present to satisfy the stream interface."""
        pass

    def isatty(self):
        """Report that this stream is not a terminal.

        Code that probes ``sys.stdout.isatty()`` would otherwise raise
        ``AttributeError`` once the stream has been replaced by this class.
        """
        return False

# Route everything written to stdout/stderr (including print output) into the
# log: stdout text is passed to logger.info, stderr text to logger.error.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

print("=== Exploration Phase Notebook: Logging Setup Complete ===")


In [2]:
# Process Phase - Cell 1: Load Prepared Datasets

import os
import pandas as pd

# Define the path where the prepared datasets are stored
PREPARED_PATH = "./data/prepared/"

# Dictionary mapping dataset names to their prepared file names
files = {
    "pbj_nurse": "pbj_nurse_prepared.parquet",
    "pbj_non_nurse": "pbj_non_nurse_prepared.parquet",
    "qrp_provider": "qrp_provider_prepared.parquet",
    "nh_survey": "nh_survey_prepared.parquet",
    "nh_quality_mds": "nh_quality_mds_prepared.parquet",
    "nh_ownership": "nh_ownership_prepared.parquet",
    "nh_citations": "nh_citations_prepared.parquet"
}

# Load each dataset on a best-effort basis: a failure is reported and the
# remaining files are still attempted, so loaded_datasets may end up holding
# only a subset of the keys in `files`.
loaded_datasets = {}
for key, filename in files.items():
    file_path = os.path.join(PREPARED_PATH, filename)
    try:
        loaded_datasets[key] = pd.read_parquet(file_path)
        print(f"Loaded {key} with shape {loaded_datasets[key].shape}")
    except Exception as e:
        print(f"Error loading {key}: {e}")

# Optionally, check the columns of one dataset to verify.
# The lookup is guarded because the loop above tolerates load failures; an
# unconditional index would turn a failed load into an unrelated KeyError.
if 'pbj_nurse' in loaded_datasets:
    print("Columns in pbj_nurse:", loaded_datasets['pbj_nurse'].columns.tolist())
else:
    print("Columns in pbj_nurse: unavailable (dataset was not loaded)")


In [3]:
# Process Phase - Cell 2: Comprehensive Dataset Logging

import logging
import pandas as pd

# Configure logging
# Reuse LOG_FILE from the logging-setup cell so the notebook only ever names
# one log file. basicConfig does nothing when the root logger already has
# handlers, so after the setup cell has run this call leaves the existing
# configuration untouched.
logging.basicConfig(filename=LOG_FILE, level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def log_dataset_info(df, dataset_name, max_unique=50):
    """Logs comprehensive information about a DataFrame.

    Everything is emitted at INFO level through the root logger
    (``logging.info``). Tabular output is rendered with ``to_string()`` so the
    log contains every row/column of each summary instead of the ``...``
    elisions that ``str()`` produces under pandas' display options.

    Args:
        df: The pandas DataFrame to describe.
        dataset_name: Label used in the start/end marker lines.
        max_unique: Value counts are logged only for columns whose number of
            distinct non-null values is strictly below this threshold.

    Returns:
        None.
    """

    logging.info(f"--- Dataset: {dataset_name} ---")

    # Basic Shape
    logging.info(f"Shape: {df.shape}")

    # Columns
    logging.info(f"Columns: {df.columns.tolist()}")

    # Data Types
    logging.info("Data Types:\n" + df.dtypes.to_string())

    # Missing Values (per column, then the grand total)
    missing_values = df.isnull().sum()
    logging.info("Missing Values:\n" + missing_values.to_string())
    logging.info(f"Total Missing Values: {missing_values.sum()}")

    # Duplicate Rows
    num_duplicates = df.duplicated().sum()
    logging.info(f"Number of Duplicate Rows: {num_duplicates}")

    # Descriptive Statistics (Numeric Columns)
    numeric_cols = df.select_dtypes(include=['number']).columns
    if not numeric_cols.empty:
        logging.info("Descriptive Statistics (Numeric Columns):\n"
                     + df[numeric_cols].describe().to_string())
    else:
        logging.info("No numeric columns found for descriptive statistics.")

    # Unique Value Counts (for Categorical/Low Cardinality Columns)
    for col in df.columns:
        if df[col].nunique() < max_unique:
            logging.info(f"Unique Value Counts for '{col}':\n"
                         + df[col].value_counts().to_string())

    # First 5 Rows
    logging.info("First 5 Rows:\n" + df.head().to_string())

    logging.info(f"--- End Dataset: {dataset_name} ---\n")


# Loop through the loaded datasets and log information
for dataset_name, df in loaded_datasets.items():
    log_dataset_info(df, dataset_name)

# Report the file the root logger actually writes to rather than a hard-coded
# name: the logger may have been configured earlier with a different file
# name, in which case the literal would point readers at the wrong file.
log_targets = [
    handler.baseFilename
    for handler in logging.getLogger().handlers
    if isinstance(handler, logging.FileHandler)
]
log_target = log_targets[0] if log_targets else "dataset_exploration.log"
print(f"Dataset exploration complete.  See '{log_target}' for details.")