In [1]:
# Process Phase - Logging Setup
import logging
import sys

LOG_FILE = "process_phase.log"

# logging.basicConfig() is a silent no-op when the root logger already has
# handlers. That is the case when this cell is re-run in the same kernel (the
# FileHandler from the previous run is still attached) or when the hosting
# environment installed handlers of its own. Detach and close them first so
# the configuration below always takes effect and the log file is truly
# reopened in "w" mode on every run of this cell.
for _stale_handler in logging.root.handlers[:]:
    logging.root.removeHandler(_stale_handler)
    _stale_handler.close()

logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILE,
    filemode="w",  # Overwrite previous logs on each run
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Root logger: every record emitted through it goes to LOG_FILE.
logger = logging.getLogger()

# Redirect print statements to the logger
class LoggerWriter:
    """Write-only, file-like adapter that forwards text to a logging callable.

    Intended to stand in for ``sys.stdout`` / ``sys.stderr`` so that anything
    written to those streams (for example by ``print``) is emitted through
    ``level`` instead.

    Parameters
    ----------
    level : callable
        Called with a single string for every non-blank write, e.g.
        ``logger.info`` or ``logger.error``.
    """

    def __init__(self, level):
        self.level = level

    def write(self, message):
        """Forward ``message`` to the logging callable.

        Surrounding whitespace is stripped and whitespace-only writes are
        dropped; this discards the separate trailing ``"\\n"`` write that
        ``print`` issues after the text itself.

        Returns the number of characters accepted (``len(message)``), as a
        text stream's ``write`` is expected to, so callers that check the
        return value keep working.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """No-op: nothing is buffered here; present because stream users call it."""
        pass

    def isatty(self):
        """Report that this stream is not an interactive terminal."""
        return False

# Swap both standard streams for logger-backed writers: text written to
# stdout is passed to logger.info, text written to stderr to logger.error.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

# First message routed through the redirected stdout.
print("=== Starting Process Phase Notebook: Logging Setup Complete ===")


In [2]:
# Process Phase - Cell 1: Load Prepared Datasets

import os
import pandas as pd

# Directory that holds the prepared parquet files
PREPARED_PATH = "./data/prepared/"

# Dataset name -> prepared file name inside PREPARED_PATH
files = {
    "pbj_nurse": "pbj_nurse_prepared.parquet",
    "pbj_non_nurse": "pbj_non_nurse_prepared.parquet",
    "qrp_provider": "qrp_provider_prepared.parquet",
    "nh_survey": "nh_survey_prepared.parquet",
    "nh_quality_mds": "nh_quality_mds_prepared.parquet",
    "nh_ownership": "nh_ownership_prepared.parquet",
    "nh_citations": "nh_citations_prepared.parquet"
}

# Loading is best-effort: one unreadable file must not stop the others from
# loading. Successes go to loaded_datasets; failures are kept (name -> exception)
# in failed_datasets so they can be inspected later instead of only being printed.
loaded_datasets = {}
failed_datasets = {}
for key, filename in files.items():
    file_path = os.path.join(PREPARED_PATH, filename)
    try:
        loaded_datasets[key] = pd.read_parquet(file_path)
    except Exception as e:
        failed_datasets[key] = e
        print(f"Error loading {key}: {e}")
    else:
        print(f"Loaded {key} with shape {loaded_datasets[key].shape}")

# Optional sanity check on one dataset. Guarded so that a failed pbj_nurse load
# (already reported above) is not followed by an unrelated-looking KeyError.
if 'pbj_nurse' in loaded_datasets:
    print("Columns in pbj_nurse:", loaded_datasets['pbj_nurse'].columns.tolist())
else:
    print("Skipping pbj_nurse column check: dataset was not loaded.")


In [3]:
# Process Phase - Cell 2: Derive RN Temporary Staffing Ratio for PBJ Nurse

# We define the RN temporary staffing ratio as:
# rn_temp_ratio = hrs_rn_ctr / (hrs_rn_emp + hrs_rn_ctr + RATIO_EPSILON)
# This ratio is computed on a row-by-row basis.
# RATIO_EPSILON keeps the denominator from being exactly zero when both hour
# columns are 0 (assumes the hour columns are numeric and non-negative -
# TODO confirm against the prepared data).
RATIO_EPSILON = 1e-6

# Work on a copy so the frame held in loaded_datasets is only replaced at the
# end of the cell, not modified halfway through.
df_nurse = loaded_datasets['pbj_nurse'].copy()

required_columns = ['hrs_rn_ctr', 'hrs_rn_emp']
missing_columns = [col for col in required_columns if col not in df_nurse.columns]
if not missing_columns:
    df_nurse['rn_temp_ratio'] = df_nurse['hrs_rn_ctr'] / (
        df_nurse['hrs_rn_emp'] + df_nurse['hrs_rn_ctr'] + RATIO_EPSILON
    )
    print("Calculated 'rn_temp_ratio' for PBJ Nurse dataset.")

    # Display the first 5 rows of the relevant columns. Done only in this
    # branch: selecting 'rn_temp_ratio' when it was never created would raise
    # a KeyError right after the "missing columns" message.
    print(df_nurse[['hrs_rn_emp', 'hrs_rn_ctr', 'rn_temp_ratio']].head())
else:
    print("Required columns for RN ratio calculation are missing.")
    print(f"Missing columns: {missing_columns}")

# Update our dataset in the dictionary
loaded_datasets['pbj_nurse'] = df_nurse
