# In [1]:
# Cell 1: Setup Logging and Load Prepared Datasets (unchanged)

import logging
import sys
import os
import pandas as pd

# File that receives every log record produced by this script.
LOG_FILE = "small_facilities_analysis.log"

# Configure the root logger: INFO and above, timestamped records, and a
# file opened in "w" mode so each run starts from an empty log.
logging.basicConfig(
    filename=LOG_FILE,
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# getLogger() without a name returns the root logger configured above.
logger = logging.getLogger()

# Redirect print statements to logger
class LoggerWriter:
    """Write-only, stream-like adapter that forwards text to a callable.

    An instance can stand in for ``sys.stdout`` or ``sys.stderr``: each
    ``write`` call hands its text, with surrounding whitespace removed, to
    the callable supplied at construction time (for example ``logger.info``).

    Args:
        level: Callable accepting one string; invoked once per non-blank
            ``write`` call.
    """

    def __init__(self, level):
        self.level = level

    def write(self, message):
        """Forward ``message`` to the callable unless it is only whitespace.

        ``print`` emits its trailing newline as a separate ``write`` call;
        skipping whitespace-only text keeps those from becoming empty
        records.

        Returns:
            The number of characters in ``message``, as text streams do, so
            callers that inspect the return value of ``write`` keep working.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """Do nothing; text is forwarded immediately and never buffered here."""
        pass

# From this point on, print() output is handed to logger.info and anything
# written to sys.stderr is handed to logger.error instead of going to the
# original streams.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

print("=== Logging Setup Complete ===")

# Directory holding the prepared parquet files (adjust if necessary).
PREPARED_PATH = "./data/prepared/"

# Logical dataset name -> file name. Every file follows the
# "<name>_prepared.parquet" convention, so the mapping is derived from the
# list of names (insertion order is the order listed here).
files = {
    dataset: f"{dataset}_prepared.parquet"
    for dataset in (
        "pbj_nurse",
        "pbj_non_nurse",
        "qrp_provider",
        "nh_survey",
        "nh_quality_mds",
        "nh_ownership",
        "nh_citations",
    )
}

# Read each file into a DataFrame keyed by its logical name. A failure is
# only reported via print(); that dataset is then simply absent from
# `loaded_datasets`, so later lookups of the missing key raise KeyError.
loaded_datasets = {}
for key, filename in files.items():
    file_path = os.path.join(PREPARED_PATH, filename)
    try:
        frame = pd.read_parquet(file_path)
        loaded_datasets[key] = frame
        print(f"Loaded {key} with shape {frame.shape}")
    except Exception as e:
        print(f"Error loading {key}: {e}")


# Cell 2: Identify Small Facilities (<= 120 Residents based on max census)

print("\n=== Identifying Small Facilities (<= 120 Residents) ===")

pbj_nurse = loaded_datasets['pbj_nurse']

# Peak census per facility: the largest 'mdscensus' value found among the
# rows sharing a 'provnum'.
facility_max_census = pbj_nurse['mdscensus'].groupby(pbj_nurse['provnum']).max()

# A facility counts as "small" when that peak never exceeds 120. A facility
# whose peak is NaN fails the comparison and is therefore left out.
peak_is_small = facility_max_census <= 120
small_facility_provnums = facility_max_census.loc[peak_is_small].index.tolist()
print(f"Number of small facilities identified: {len(small_facility_provnums)}")

# Keep only the rows of small facilities. The explicit copy makes the result
# an independent frame, so the columns later cells add to it do not trigger
# SettingWithCopyWarning.
row_is_small_facility = pbj_nurse['provnum'].isin(small_facility_provnums)
small_facilities_pbj_nurse = pbj_nurse.loc[row_is_small_facility].copy()
print(f"Shape of pbj_nurse for small facilities: {small_facilities_pbj_nurse.shape}")

# Publish the filtered frame under its own key for the following cells.
loaded_datasets['pbj_nurse_small'] = small_facilities_pbj_nurse

print("=== Small Facilities Identification Complete ===")


# Cell 3: Analyze RN Staffing Ratio in Small Facilities (Question 1 Refined)

print("\n=== Analyzing RN Staffing Ratio in Small Facilities (Question 1) ===")

small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_small']

# Per-row RN hours: employee + contract, and the contract share of the total.
rn_contract_hours = small_facilities_pbj_nurse['hrs_rn_ctr']
rn_total_hours = small_facilities_pbj_nurse['hrs_rn_emp'] + rn_contract_hours

# The columns are added to the shared frame itself (not to a copy) because
# later cells read 'contract_rn_ratio' from loaded_datasets['pbj_nurse_small'].
small_facilities_pbj_nurse['total_rn_hours'] = rn_total_hours

# Rows with zero total RN hours would divide by zero; their ratio is defined
# as 0 instead.
small_facilities_pbj_nurse['contract_rn_ratio'] = (
    rn_contract_hours / rn_total_hours
).mask(rn_total_hours == 0, 0)

# Unweighted mean of the row-level ratios for each work date.
ratio_by_workdate = small_facilities_pbj_nurse.groupby('workdate')['contract_rn_ratio']
daily_rn_ratio_small_facilities = ratio_by_workdate.mean().reset_index()

# Distribution of the daily means.
daily_ratio_values = daily_rn_ratio_small_facilities['contract_rn_ratio']
print("\nSummary Statistics for RN Contract Ratio in Small Facilities:")
print(daily_ratio_values.describe())

# Overall figure: the mean of the daily means (every work date weighs the same).
# NOTE(review): the "Q2 2024" label below is hard-coded in the message; this
# cell applies no date filter - confirm the data really covers that quarter.
average_rn_ratio_small_facilities = daily_ratio_values.mean()
print(f"\nAverage RN Contract Ratio in Small Facilities (Q2 2024): {average_rn_ratio_small_facilities:.4f}")

print("=== RN Staffing Ratio Analysis in Small Facilities Complete ===")


# Cell 4: Analyze Intra-Quarter Variation for CNAs in Small Facilities (Question 2 Refined)

print("\n=== Analyzing Intra-Quarter Variation for CNAs in Small Facilities (Question 2) ===")

small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_small']

# Per-row CNA hours: employee + contract, and the contract share of the total.
cna_contract_hours = small_facilities_pbj_nurse['hrs_cna_ctr']
cna_total_hours = small_facilities_pbj_nurse['hrs_cna_emp'] + cna_contract_hours
small_facilities_pbj_nurse['total_cna_hours'] = cna_total_hours

# Rows with zero total CNA hours would divide by zero; their ratio is defined
# as 0 instead.
small_facilities_pbj_nurse['contract_cna_ratio'] = (
    cna_contract_hours / cna_total_hours
).mask(cna_total_hours == 0, 0)

# One row per work date: mean contract-CNA ratio and mean census across the
# rows recorded for that date.
rows_by_workdate = small_facilities_pbj_nurse.groupby('workdate')
daily_cna_ratio_census_small_facilities = rows_by_workdate.agg(
    average_cna_ratio=('contract_cna_ratio', 'mean'),
    average_census=('mdscensus', 'mean'),
).reset_index()

# Correlation (Series.corr default method) between the two daily series.
daily_cna_ratio = daily_cna_ratio_census_small_facilities['average_cna_ratio']
daily_census = daily_cna_ratio_census_small_facilities['average_census']
correlation_cna_census_small_facilities = daily_cna_ratio.corr(daily_census)
print(f"\nCorrelation between Daily CNA Contract Ratio and Average Census in Small Facilities: {correlation_cna_census_small_facilities:.4f}")

# Distribution of the daily mean ratios.
print("\nSummary Statistics for CNA Contract Ratio in Small Facilities:")
print(daily_cna_ratio.describe())

print("=== CNA Intra-Quarter Variation Analysis in Small Facilities Complete ===")


# Cell 5: Explore Exclusive Contract Facilities and Outliers among Small Facilities

print("\n=== Exploring Exclusive Contract & Outlier Facilities among Small Facilities ===")

# Relies on the 'contract_rn_ratio' column that Cell 3 adds to this frame.
small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_small']

# Rows whose contract share of RN hours is at or above the "near-100%" cutoff.
threshold_high_contract = 0.99
ratio_is_near_exclusive = small_facilities_pbj_nurse['contract_rn_ratio'] >= threshold_high_contract
exclusive_contract_days_small_facilities = small_facilities_pbj_nurse.loc[ratio_is_near_exclusive]

print(f"\nNumber of days with >= {threshold_high_contract*100}% contract RN usage in small facilities: {len(exclusive_contract_days_small_facilities)}")
print(f"Number of unique small facilities with >= {threshold_high_contract*100}% contract RN usage on at least one day: {exclusive_contract_days_small_facilities['provnum'].nunique()}")

# Mean contract-RN ratio per facility; facilities at or above the 95th
# percentile of those means are labelled outliers.
facility_avg_rn_ratio_small_facilities = (
    small_facilities_pbj_nurse.groupby('provnum')['contract_rn_ratio'].mean()
)
outlier_threshold_percentile = 0.95  # i.e. the top 5%
outlier_ratio_threshold_small_facilities = facility_avg_rn_ratio_small_facilities.quantile(
    outlier_threshold_percentile
)
# NOTE(review): the comparison is inclusive, so every facility tied with the
# threshold is selected - if many averages equal the threshold (e.g. all 0),
# far more than 5% of facilities will be listed. Confirm this is intended.
average_is_outlier = facility_avg_rn_ratio_small_facilities >= outlier_ratio_threshold_small_facilities
outlier_small_facilities_provnums = (
    facility_avg_rn_ratio_small_facilities.loc[average_is_outlier].index.tolist()
)

# The ":.0f" spec rounds 100*(1-0.95) to a clean "5" despite float error.
print(f"\nNumber of outlier small facilities (top {100*(1-outlier_threshold_percentile):.0f}%ile by average RN contract ratio): {len(outlier_small_facilities_provnums)}")
print(f"Example Outlier Ratio Threshold: {outlier_ratio_threshold_small_facilities:.4f}")


print("=== Exclusive Contract & Outlier Facility Exploration among Small Facilities Complete ===")


# Cell 6: Analyze Quality and Deficiencies for Small Facilities with Higher Temp Staffing

print("\n=== Analyzing Quality & Deficiencies for Small Facilities with Higher Temp Staffing ===")

# Relies on the 'contract_rn_ratio' column that Cell 3 adds to this frame.
small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_small']
nh_survey = loaded_datasets['nh_survey']
qrp_provider = loaded_datasets['qrp_provider'] # Load QRP for quality scores

# Reuse the per-facility average RN contract ratio when Cell 5 already built
# it; otherwise compute it here. The guard must test the real variable name
# (the same one Cell 7 checks) - testing a name that is never assigned would
# make the guard always true.
if 'facility_avg_rn_ratio_small_facilities' not in locals():
    facility_avg_rn_ratio_small_facilities = small_facilities_pbj_nurse.groupby('provnum')['contract_rn_ratio'].mean()

# Attach survey and QRP columns to the per-facility averages. Left joins keep
# every small facility even when it has no match on the right-hand side.
# NOTE(review): if nh_survey or qrp_provider hold several rows per CCN, these
# merges repeat each facility once per matching row, and the correlations
# below then weight facilities by their row counts - confirm the row
# granularity of both datasets.
small_facilities_avg_rn_ratio_df = facility_avg_rn_ratio_small_facilities.reset_index(name='avg_rn_ratio_small_facility')
small_facilities_survey_quality = pd.merge(small_facilities_avg_rn_ratio_df, nh_survey, left_on='provnum', right_on='cms_certification_number_(ccn)', how='left')
small_facilities_survey_quality = pd.merge(small_facilities_survey_quality, qrp_provider, left_on='provnum', right_on='cms_certification_number_(ccn)', how='left') # Merge with QRP

# Calculate correlation with total health deficiencies (using survey data)
correlation_deficiencies_rn_ratio_small_facilities = small_facilities_survey_quality['avg_rn_ratio_small_facility'].corr(small_facilities_survey_quality['total_number_of_health_deficiencies'])
print(f"\nCorrelation between Avg RN Contract Ratio and Total Health Deficiencies in Small Facilities: {correlation_deficiencies_rn_ratio_small_facilities:.4f}")


# Name of the column used as the quality score.
# NOTE(review): 'score' is a placeholder guess - inspect qrp_provider's
# columns and replace it with the intended measure if it differs.
quality_score_column = 'score'
# Only correlate when the column survived the merges under that exact name
# (a name clash between the merged frames would have added suffixes to it).
if quality_score_column in small_facilities_survey_quality.columns:
    # Coerce to numbers, turning unparseable values into NaN. Plain column
    # assignment replaces the column so the numeric dtype sticks; assigning
    # through .loc[:, col] writes into the existing column and can leave an
    # object-dtype column as object.
    small_facilities_survey_quality[quality_score_column] = pd.to_numeric(small_facilities_survey_quality[quality_score_column], errors='coerce')
    correlation_quality_rn_ratio_small_facilities = small_facilities_survey_quality['avg_rn_ratio_small_facility'].corr(small_facilities_survey_quality[quality_score_column])
    print(f"Correlation between Avg RN Contract Ratio and Quality Score in Small Facilities ({quality_score_column}): {correlation_quality_rn_ratio_small_facilities:.4f}")
else:
    print(f"\nWarning: Quality score column '{quality_score_column}' not found in merged dataset. Skipping quality score correlation.")


print("=== Quality & Deficiencies Analysis for Small Facilities Complete ===")


# Cell 7: Analyze Ownership Type of Small Facilities

print("\n=== Analyzing Ownership Type of Small Facilities ===")

small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_small']
nh_ownership = loaded_datasets['nh_ownership']

# Distinct facility identifiers present in the small-facility staffing data.
unique_small_facility_provnums = small_facilities_pbj_nurse['provnum'].unique()

# Ownership rows whose CCN belongs to one of those facilities.
row_is_small_facility = nh_ownership['cms_certification_number_(ccn)'].isin(unique_small_facility_provnums)
nh_ownership_small_facilities = nh_ownership.loc[row_is_small_facility]

# Share of ownership rows per owner_type; normalize=True yields fractions
# that sum to 1 (not percentages).
# NOTE(review): shares are computed over ownership rows - if a facility can
# have several rows in nh_ownership, it is counted once per row. Confirm.
owner_type_values = nh_ownership_small_facilities['owner_type']
ownership_type_counts_small_facilities = owner_type_values.value_counts(normalize=True)

print("\nOwnership Type Distribution for Small Facilities:")
print(ownership_type_counts_small_facilities)

# Optional breakdown: mean per-facility RN contract ratio for each owner type.
# It needs the per-facility averages computed by Cell 5 or Cell 6.
if 'facility_avg_rn_ratio_small_facilities' not in locals():
    print("\nSkipping Average RN Contract Ratio by Ownership Type as facility_avg_rn_ratio_small_facilities is not defined.")
else:
    # reset_index() turns the averages into a frame with 'provnum' and
    # 'contract_rn_ratio' columns (the Series keeps the name of the column it
    # was averaged from).
    facility_ratio_frame = facility_avg_rn_ratio_small_facilities.reset_index()
    ownership_rn_ratio_small_facilities = pd.merge(
        facility_ratio_frame,
        nh_ownership_small_facilities,
        left_on='provnum',
        right_on='cms_certification_number_(ccn)',
        how='left',
    )
    ratio_by_owner_type = ownership_rn_ratio_small_facilities.groupby('owner_type')['contract_rn_ratio']
    avg_rn_ratio_by_ownership_small_facilities = ratio_by_owner_type.mean()
    print("\nAverage RN Contract Ratio by Ownership Type in Small Facilities:")
    print(avg_rn_ratio_by_ownership_small_facilities)


print("=== Ownership Type Analysis of Small Facilities Complete ===")

print("\n=== Comprehensive Analysis of Small Facilities Completed ===")
print(f"Logs saved to: {LOG_FILE}")