In [14]:
# Cell 1: Setup and Logging

import os
import sys
import logging

# Directory that holds the input Parquet files, relative to the working
# directory. Use "./" when the files sit next to this notebook, or point it at
# the relevant subfolder (e.g. "./data/").
PARQUET_PATH = "./data/interim/"

# File that receives every record emitted through the root logger.
LOG_FILE = "prepare_phase_parquet_output.log"

# The root logger is used directly so that every module's records end up in
# LOG_FILE.
logger = logging.getLogger()

# logging.basicConfig() does nothing when the root logger already has handlers,
# which is the case whenever this cell is re-run in a live kernel (or when the
# environment pre-installs a handler). Detach and close any existing handlers
# first so that filemode="w" really starts a fresh log on every run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILE,
    filemode="w",  # Overwrite on each run
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Redirect print statements to the logger
class LoggerWriter:
    """Write-only, file-like adapter that forwards text to a logging callable.

    An instance can stand in for a text stream such as ``sys.stdout``: every
    non-blank chunk passed to :meth:`write` is handed to the callable given at
    construction time (for example ``logger.info``).
    """

    def __init__(self, level):
        """Store ``level``, the callable that receives each message."""
        self.level = level

    def write(self, message):
        """Forward ``message`` to the logging callable.

        Leading and trailing whitespace is stripped before forwarding, and
        whitespace-only chunks (e.g. a bare newline) are dropped.

        Returns:
            int: the number of characters in ``message``, matching the
            contract of text-stream ``write()`` so callers that check the
            return value keep working.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """Do nothing; messages are forwarded immediately by ``write``."""
        pass

    def isatty(self):
        """Report that this stream is not an interactive terminal."""
        return False

# From here on, text written to stdout is passed to logger.info and text
# written to stderr is passed to logger.error, via the LoggerWriter adapter.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

# Announce the active configuration; these lines go through the new stdout.
for banner_line in (
    "=== Prepare Phase Notebook: Logging Setup Complete ===",
    f"Using PARQUET_PATH: {PARQUET_PATH}",
    f"Logs will be written to: {LOG_FILE}",
):
    print(banner_line)


In [15]:
# Cell 2: Define Dataset Dictionary and Load Parquet Files

import pandas as pd

print("=== Starting Prepare Phase (Parquet Version) ===")

# Logical dataset name -> Parquet file expected under PARQUET_PATH.
datasets = {
    "pbj_nurse": "preprocessed_pbj_nurse.parquet",
    "pbj_non_nurse": "preprocessed_pbj_non_nurse.parquet",
    "qrp_provider": "preprocessed_qrp_provider.parquet",
    "nh_survey": "preprocessed_nh_survey.parquet",
    "nh_quality_mds": "preprocessed_nh_quality_mds.parquet",
    "nh_ownership": "preprocessed_nh_ownership.parquet",
    "nh_citations": "preprocessed_nh_citations.parquet"
}

# Frames that were read successfully, keyed by dataset name.
loaded_datasets = {}

for key, filename in datasets.items():
    file_path = os.path.join(PARQUET_PATH, filename)

    # A missing file is reported and skipped; the remaining datasets still load.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Any failure while reading is reported and the dataset is left out of
    # loaded_datasets rather than aborting the cell.
    try:
        df = pd.read_parquet(file_path)
        loaded_datasets[key] = df
        print(f"Loaded '{key}' from {file_path}. Shape: {df.shape}")
    except Exception as e:
        print(f"Error loading '{key}' from {file_path}: {e}")

print("=== Summary of Loaded Datasets ===")
if not loaded_datasets:
    print("No datasets were loaded. Check file paths or filenames.")
# The loop body simply does not run when nothing was loaded.
for key, df in loaded_datasets.items():
    print(f"{key}: shape={df.shape}, columns={list(df.columns)}")

print("=== Prepare Phase (Parquet) - Load Step Completed ===")


In [16]:
# Cell 3: Data Quality Checks

# For every loaded dataset this cell reports:
# 1. The row count and the number of missing values (NaN) in each column
# 2. The number of fully duplicated rows
# 3. describe() statistics for the numeric columns, as a first manual screen
#    for implausible values (no automatic outlier flagging is performed)

if not loaded_datasets:
    print("No datasets to check. Please ensure data is loaded in Cell 2.")
else:
    for key, df in loaded_datasets.items():
        print(f"\n=== Data Quality Checks for {key} ===")

        # 1. Missing Values
        # The row count is reported alongside so the per-column missing
        # counts can be read as a share of the dataset.
        missing_counts = df.isna().sum()
        total_rows = len(df)
        print(f"Total Rows: {total_rows}")
        print("Missing Value Counts per Column:")
        print(missing_counts)

        # 2. Duplicate Rows
        # Whether duplicates are a problem depends on the dataset; some may
        # legitimately contain repeated rows, so this is reported, not fixed.
        duplicate_count = df.duplicated().sum()
        print(f"Number of Duplicate Rows: {duplicate_count}")

        # 3. Basic stats for numeric columns
        # "number" is the pandas selector for all numeric dtypes; the "int" /
        # "float" aliases can skip numeric columns stored under other dtypes
        # (for example unsigned or narrow integer columns).
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols) > 0:
            describe_df = df[numeric_cols].describe()
            print("Basic Stats for Numeric Columns:")
            print(describe_df)
        else:
            print("No numeric columns found for basic outlier checks.")

print("=== Data Quality Checks Completed ===")


In [17]:
# Cell 4: Data Type Validation and Conversion

import pandas as pd

print("=== Starting Data Type Validation and Conversion ===")

# Datasets whose date columns are parsed with pandas' default format inference.
INFERRED_DATE_COLUMNS = {
    'qrp_provider': ['Start Date', 'End Date'],
    'nh_quality_mds': ['Measure Period', 'Processing Date'],
    'nh_survey': ['Health Survey Date', 'Fire Safety Survey Date', 'Processing Date'],
    'nh_citations': ['Survey Date', 'Correction Date', 'Processing Date'],
}


def _convert_date_column(df, key, col, date_format=None, as_text=False, announce_format=False):
    """Convert ``df[col]`` to datetime in place and report the outcome.

    Args:
        df: DataFrame to modify in place.
        key: Dataset name, used only in the printed messages.
        col: Column to convert; a column that is absent is ignored.
        date_format: Explicit strftime format, or None to let pandas infer it.
        as_text: Cast the values to ``str`` before parsing (needed when the
            dates are stored as numbers such as 20240401).
        announce_format: Append the format to the "Converted ..." message.

    Returns:
        bool: True when a conversion was performed, False when the column is
        missing or already has a datetime dtype.
    """
    if col not in df.columns:
        return False

    raw = df[col]

    # Make the cell safe to re-run: a column that is already datetime must not
    # be parsed again. With as_text=True a second pass would stringify the
    # dates as e.g. '2024-04-01', fail to match '%Y%m%d', and coerce every
    # value to NaT.
    if pd.api.types.is_datetime64_any_dtype(raw):
        print(f"'{col}' is already datetime for {key}; conversion skipped")
        return False

    source = raw.astype(str) if as_text else raw
    parsed = pd.to_datetime(source, format=date_format, errors='coerce')
    df[col] = parsed

    suffix = f" with format='{date_format}'" if announce_format else ""
    print(f"Converted '{col}' to datetime for {key}{suffix}")

    # errors='coerce' turns unparseable values into NaT without complaint;
    # surface how many previously non-null values were lost that way.
    unparsed = int((raw.notna() & parsed.isna()).sum())
    if unparsed:
        print(f"WARNING: {unparsed} non-null '{col}' value(s) in {key} could not be parsed and are now NaT")
    return True


# For each dataset, check and convert date columns as needed.
# `df` is the very object stored in loaded_datasets, so the in-place column
# assignments made by the helper are visible there without re-assignment.
for key, df in loaded_datasets.items():
    print(f"\n--- Evaluating {key} ---")
    print("Current Data Types:")
    print(df.dtypes)

    if key in ['pbj_nurse', 'pbj_non_nurse']:
        # PBJ datasets: 'WorkDate' is assumed to be in YYYYMMDD format.
        _convert_date_column(df, key, 'WorkDate', date_format='%Y%m%d', as_text=True)
    elif key == 'nh_ownership':
        # NH Ownership: dates are parsed with an explicit ISO-style format.
        for col in ['Association Date', 'Processing Date']:
            _convert_date_column(df, key, col, date_format='%Y-%m-%d', announce_format=True)
    else:
        # Remaining datasets: let pandas infer the date format per column.
        for col in INFERRED_DATE_COLUMNS.get(key, []):
            _convert_date_column(df, key, col)

print("=== Data Type Validation and Conversion Completed ===")


In [18]:
# Cell 5: Standardization & Basic Cleaning

print("=== Starting Standardization & Basic Cleaning ===")

# Translation table that turns both spaces and hyphens into underscores.
separator_table = str.maketrans({" ": "_", "-": "_"})

for key, df in loaded_datasets.items():
    print(f"\n--- Standardizing columns for {key} ---")

    # Original label -> standardized label: surrounding whitespace trimmed,
    # lower-cased, spaces and hyphens replaced by underscores.
    new_columns = {
        col: col.strip().lower().translate(separator_table)
        for col in df.columns
    }

    # Apply the mapping to the frame itself (no copy is made).
    df.rename(columns=new_columns, inplace=True)

    # Dataset-specific value clean-up (for example upper-casing a 'state'
    # column) would go here if it is ever needed.

    print("New columns:")
    print(list(df.columns))

    # Store the standardized frame back under the same key.
    loaded_datasets[key] = df

print("=== Standardization & Basic Cleaning Completed ===")


In [19]:
# Cell 7: Final Save and Documentation (No Merge)

import os

# Folder that receives the final prepared datasets; created if it is missing.
FINAL_OUTPUT_PATH = "./data/prepared/"
os.makedirs(FINAL_OUTPUT_PATH, exist_ok=True)

print("=== Saving Final Prepared Datasets ===")

# Names of the datasets whose write failed, so that the closing message
# reflects what actually happened instead of always claiming success.
failed_saves = []

for key, df in loaded_datasets.items():
    out_file = os.path.join(FINAL_OUTPUT_PATH, f"{key}_prepared.parquet")
    # Saving is best-effort per dataset: one failure is recorded and reported,
    # and the remaining datasets are still written.
    try:
        df.to_parquet(out_file, index=False)
        print(f"Saved {key} to {out_file}")
    except Exception as e:
        failed_saves.append(key)
        print(f"Error saving {key}: {e}")

if not loaded_datasets:
    print("=== Final Save Completed. No datasets were loaded, so nothing was saved. ===")
elif failed_saves:
    print(f"=== Final Save Completed with errors. Failed to save: {failed_saves} ===")
else:
    print("=== Final Save Completed. Datasets are now ready for analysis. ===")


In [24]:
# Quick sanity check: display the first rows (head() default) of the
# "pbj_nurse" entry in loaded_datasets as the cell's output.
loaded_datasets["pbj_nurse"].head()

Unnamed: 0,provnum,provname,city,state,county_name,county_fips,cy_qtr,workdate,mdscensus,hrs_rndon,...,hrs_lpn_ctr,hrs_cna,hrs_cna_emp,hrs_cna_ctr,hrs_natrn,hrs_natrn_emp,hrs_natrn_ctr,hrs_medaide,hrs_medaide_emp,hrs_medaide_ctr
0,15009,"BURNS NURSING HOME, INC.",RUSSELLVILLE,AL,Franklin,59,2024Q2,2024-04-01,51,10.77,...,0.0,160.08,160.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15009,"BURNS NURSING HOME, INC.",RUSSELLVILLE,AL,Franklin,59,2024Q2,2024-04-02,52,8.43,...,0.0,135.95,135.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15009,"BURNS NURSING HOME, INC.",RUSSELLVILLE,AL,Franklin,59,2024Q2,2024-04-03,53,11.13,...,0.0,150.31,150.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15009,"BURNS NURSING HOME, INC.",RUSSELLVILLE,AL,Franklin,59,2024Q2,2024-04-04,52,12.27,...,0.0,133.01,133.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15009,"BURNS NURSING HOME, INC.",RUSSELLVILLE,AL,Franklin,59,2024Q2,2024-04-05,52,4.95,...,0.0,137.92,137.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0
