# In [1]:
# Cell 1: Setup Logging and Load Prepared Datasets

import logging
import sys
import os
import pandas as pd

# Destination file for every message emitted by this analysis.
LOG_FILE = "individually_owned_small_facilities_analysis.log"

# Root-logger configuration: INFO and above, timestamped records written to
# LOG_FILE. File mode "w" truncates the file when the handler is created.
# NOTE: logging.basicConfig only takes effect while the root logger has no
# handlers yet.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    filename=LOG_FILE,
    filemode="w",
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Handle on the root logger, used by the rest of the script.
logger = logging.getLogger()

# Redirect print statements to logger
class LoggerWriter:
    """Write-only, file-like adapter that forwards text to a logging callable.

    An instance can stand in for ``sys.stdout`` / ``sys.stderr``: every
    non-blank chunk passed to :meth:`write` is handed, stripped of surrounding
    whitespace, to the callable supplied at construction time (for example
    ``logger.info``).
    """

    def __init__(self, level):
        """Remember the callable that receives each message.

        Args:
            level: Any one-argument callable accepting a string, typically a
                bound logger method such as ``logger.info``.
        """
        self.level = level

    def write(self, message):
        """Forward ``message`` to the logging callable and report its length.

        Whitespace-only chunks (such as the lone newline ``print`` emits after
        its payload) are not forwarded. The number of characters received is
        returned in every case, as text streams are expected to do, so callers
        that check the return value of ``write`` keep working.

        Args:
            message: Text chunk written to the stream.

        Returns:
            ``len(message)``.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """Do nothing; messages are forwarded immediately by :meth:`write`."""

    def isatty(self):
        """Report that this stream is not a terminal.

        Present so that code probing ``sys.stdout.isatty()`` does not fail
        with ``AttributeError`` once the standard streams are replaced.
        """
        return False

# Send everything written to stdout/stderr through the logger:
# stdout text is passed to logger.info, stderr text to logger.error.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

# First message emitted through the redirected stdout.
print("=== Logging Setup Complete ===")

# Directory holding the prepared parquet files (adjust if necessary)
PREPARED_PATH = "./data/prepared/"

# Dataset name -> parquet file name inside PREPARED_PATH
files = dict(
    pbj_nurse="pbj_nurse_prepared.parquet",
    pbj_non_nurse="pbj_non_nurse_prepared.parquet",
    qrp_provider="qrp_provider_prepared.parquet",
    nh_survey="nh_survey_prepared.parquet",
    nh_quality_mds="nh_quality_mds_prepared.parquet",
    nh_ownership="nh_ownership_prepared.parquet",
    nh_citations="nh_citations_prepared.parquet",
)

# Read every parquet file into loaded_datasets. A dataset that cannot be read
# is reported and skipped, so its key is simply absent afterwards.
loaded_datasets = {}
for dataset_name, parquet_name in files.items():
    parquet_path = os.path.join(PREPARED_PATH, parquet_name)
    try:
        frame = pd.read_parquet(parquet_path)
        loaded_datasets[dataset_name] = frame
        print(f"Loaded {dataset_name} with shape {frame.shape}")
    except Exception as load_error:
        print(f"Error loading {dataset_name}: {load_error}")

# Cell 2: Identify Individually Owned Small Facilities

print("\n=== Identifying Individually Owned Small Facilities (<= 120 Residents) ===")

pbj_nurse = loaded_datasets['pbj_nurse']
nh_ownership = loaded_datasets['nh_ownership']

# Step 1 - small facilities: the largest 'mdscensus' value recorded for the
# provider is at most 120.
facility_max_census = pbj_nurse.groupby('provnum')['mdscensus'].max()
is_small = facility_max_census <= 120
small_facility_provnums = facility_max_census.loc[is_small].index.tolist()
print(f"Number of small facilities (<= 120 residents) identified: {len(small_facility_provnums)}")

# Step 2 - individually owned facilities: distinct CCNs among ownership rows
# whose 'owner_type' equals 'Individual'.
individual_owner_rows = nh_ownership.loc[nh_ownership['owner_type'] == 'Individual']
individual_ownership_provnums = individual_owner_rows['cms_certification_number_(ccn)'].unique().tolist()
print(f"Number of individually owned facilities identified in nh_ownership: {len(individual_ownership_provnums)}")

# Step 3 - providers present in BOTH lists.
small_ids = set(small_facility_provnums)
individual_ids = set(individual_ownership_provnums)
individually_owned_small_facility_provnums = list(small_ids & individual_ids)
print(f"Number of Individually Owned Small Facilities: {len(individually_owned_small_facility_provnums)}")

# Step 4 - keep only the pbj_nurse rows of those providers; .copy() yields an
# independent frame that later cells extend with new columns.
in_segment = pbj_nurse['provnum'].isin(individually_owned_small_facility_provnums)
individually_owned_small_facilities_pbj_nurse = pbj_nurse[in_segment].copy()
print(f"Shape of pbj_nurse for Individually Owned Small Facilities: {individually_owned_small_facilities_pbj_nurse.shape}")

# Make the filtered frame available to the following cells.
loaded_datasets['pbj_nurse_individual_small'] = individually_owned_small_facilities_pbj_nurse

print("=== Individually Owned Small Facilities Identification Complete ===")


# Cell 3: Analyze RN Staffing Ratio in Individually Owned Small Facilities

print("\n=== Analyzing RN Staffing Ratio in Individually Owned Small Facilities ===")

individual_small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_individual_small']

# Row-level RN hours (employee + contract) and the contract share of them.
# Both columns are added to the shared frame in place; rows with zero total
# RN hours get a ratio of 0 instead of the NaN produced by 0 / 0.
rn_contract_hours = individual_small_facilities_pbj_nurse['hrs_rn_ctr']
rn_total_hours = individual_small_facilities_pbj_nurse['hrs_rn_emp'] + rn_contract_hours
individual_small_facilities_pbj_nurse['total_rn_hours'] = rn_total_hours
individual_small_facilities_pbj_nurse['contract_rn_ratio'] = rn_contract_hours / rn_total_hours
no_rn_hours = rn_total_hours == 0
individual_small_facilities_pbj_nurse.loc[no_rn_hours, 'contract_rn_ratio'] = 0

# One row per workdate: unweighted mean of the row-level ratios for that date.
daily_rn_ratio_individual_small_facilities = (
    individual_small_facilities_pbj_nurse
    .groupby('workdate')['contract_rn_ratio']
    .mean()
    .reset_index()
)
daily_rn_ratio = daily_rn_ratio_individual_small_facilities['contract_rn_ratio']

# Distribution of the daily means, then their overall average.
print("\nSummary Statistics for RN Contract Ratio in Individually Owned Small Facilities:")
print(daily_rn_ratio.describe())

average_rn_ratio_individual_small_facilities = daily_rn_ratio.mean()
print(f"\nAverage RN Contract Ratio in Individually Owned Small Facilities (Q2 2024): {average_rn_ratio_individual_small_facilities:.4f}")

print("=== RN Staffing Ratio Analysis for Individually Owned Small Facilities Complete ===")


# Cell 4: Analyze CNA Intra-Quarter Variation in Individually Owned Small Facilities

print("\n=== Analyzing CNA Intra-Quarter Variation in Individually Owned Small Facilities ===")

individual_small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_individual_small']

# Row-level CNA hours (employee + contract) and the contract share of them,
# added to the shared frame in place. Rows with zero total CNA hours get a
# ratio of 0 instead of the NaN produced by 0 / 0.
cna_contract_hours = individual_small_facilities_pbj_nurse['hrs_cna_ctr']
cna_total_hours = individual_small_facilities_pbj_nurse['hrs_cna_emp'] + cna_contract_hours
individual_small_facilities_pbj_nurse['total_cna_hours'] = cna_total_hours
individual_small_facilities_pbj_nurse['contract_cna_ratio'] = cna_contract_hours / cna_total_hours
no_cna_hours = cna_total_hours == 0
individual_small_facilities_pbj_nurse.loc[no_cna_hours, 'contract_cna_ratio'] = 0

# One row per workdate holding the mean contract CNA ratio and the mean census.
daily_cna_ratio_census_individual_small_facilities = (
    individual_small_facilities_pbj_nurse
    .groupby('workdate')
    .agg(
        average_cna_ratio=('contract_cna_ratio', 'mean'),
        average_census=('mdscensus', 'mean'),
    )
    .reset_index()
)
daily_cna_ratio = daily_cna_ratio_census_individual_small_facilities['average_cna_ratio']
daily_census = daily_cna_ratio_census_individual_small_facilities['average_census']

# Correlation between the two daily series.
correlation_cna_census_individual_small_facilities = daily_cna_ratio.corr(daily_census)
print(f"\nCorrelation between Daily CNA Contract Ratio and Average Census in Individually Owned Small Facilities: {correlation_cna_census_individual_small_facilities:.4f}")

# Distribution of the daily mean contract CNA ratio.
print("\nSummary Statistics for CNA Contract Ratio in Individually Owned Small Facilities:")
print(daily_cna_ratio.describe())

print("=== CNA Intra-Quarter Variation Analysis for Individually Owned Small Facilities Complete ===")


# Cell 5: Explore Exclusive Contract & Outlier Facilities among Individually Owned Small Facilities

print("\n=== Exploring Exclusive Contract & Outlier Facilities among Individually Owned Small Facilities ===")

individual_small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_individual_small']

# Rows whose contract RN ratio is at or above the "near-exclusive" cutoff.
# Relies on the 'contract_rn_ratio' column added in Cell 3.
threshold_high_contract = 0.99
is_exclusive_row = individual_small_facilities_pbj_nurse['contract_rn_ratio'] >= threshold_high_contract
exclusive_contract_days_individual_small_facilities = individual_small_facilities_pbj_nurse.loc[is_exclusive_row]

exclusive_row_count = len(exclusive_contract_days_individual_small_facilities)
exclusive_facility_count = exclusive_contract_days_individual_small_facilities['provnum'].nunique()
print(f"\nNumber of days with >= {threshold_high_contract*100}% contract RN usage in Individually Owned Small Facilities: {exclusive_row_count}")
print(f"Number of unique Individually Owned Small Facilities with >= {threshold_high_contract*100}% contract RN usage on at least one day: {exclusive_facility_count}")

# Outliers within the segment: facilities whose mean contract RN ratio is at or
# above the 95th-percentile value of the per-facility means.
facility_avg_rn_ratio_individual_small_facilities = (
    individual_small_facilities_pbj_nurse
    .groupby('provnum')['contract_rn_ratio']
    .mean()
)
outlier_threshold_percentile = 0.95  # top 5% of facilities count as outliers
outlier_ratio_threshold_individual_small_facilities = facility_avg_rn_ratio_individual_small_facilities.quantile(outlier_threshold_percentile)
at_or_above_cutoff = facility_avg_rn_ratio_individual_small_facilities >= outlier_ratio_threshold_individual_small_facilities
outlier_individual_small_facility_provnums = facility_avg_rn_ratio_individual_small_facilities.loc[at_or_above_cutoff].index.tolist()

print(f"\nNumber of outlier Individually Owned Small Facilities (top {100*(1-outlier_threshold_percentile):.0f}%ile by average RN contract ratio within this segment): {len(outlier_individual_small_facility_provnums)}")
print(f"Example Outlier Ratio Threshold: {outlier_ratio_threshold_individual_small_facilities:.4f}")

print("=== Exclusive Contract & Outlier Facility Exploration for Individually Owned Small Facilities Complete ===")


# Cell 6: Analyze Quality & Deficiencies for Individually Owned Small Facilities with Higher Temp Staffing

print("\n=== Analyzing Quality & Deficiencies for Individually Owned Small Facilities with Higher Temp Staffing ===")

individual_small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_individual_small']
nh_survey = loaded_datasets['nh_survey']
qrp_provider = loaded_datasets['qrp_provider']

# Facility-level mean of the row-level contract RN ratio. It is recomputed
# unconditionally rather than probing locals() for a value left behind by an
# earlier cell: the result is the same, and the cell then only depends on the
# 'contract_rn_ratio' column of the shared frame.
facility_avg_rn_ratio_individual_small_facilities = individual_small_facilities_pbj_nurse.groupby('provnum')['contract_rn_ratio'].mean()

# One row per facility: provnum + its average contract RN ratio.
individual_small_facilities_avg_rn_ratio_df = facility_avg_rn_ratio_individual_small_facilities.reset_index(name='avg_rn_ratio_individual_small_facility')

# Attach survey data first, then QRP data. Left merges keep every facility of
# the segment even when it has no match on the right-hand side.
individual_small_facilities_survey = pd.merge(individual_small_facilities_avg_rn_ratio_df, nh_survey, left_on='provnum', right_on='cms_certification_number_(ccn)', how='left')
individual_small_facilities_survey_quality = pd.merge(individual_small_facilities_survey, qrp_provider, left_on='provnum', right_on='cms_certification_number_(ccn)', how='left')

# Correlation with total health deficiencies.
# It is taken from the survey-merged frame, i.e. BEFORE the QRP merge: a left
# merge repeats each left row once per matching right row, so if qrp_provider
# holds several rows per facility the fully merged frame would count every
# facility/survey pair once per QRP row and distort the correlation. When
# qrp_provider has at most one row per facility both frames give the same value.
# The fully merged frame is only used as a fallback when the column is not
# present after the survey merge.
# NOTE(review): if nh_survey itself has several rows per facility, each of them
# still contributes one observation here - confirm that this is intended.
deficiency_column = 'total_number_of_health_deficiencies'
if deficiency_column in individual_small_facilities_survey.columns:
    deficiency_frame = individual_small_facilities_survey
else:
    deficiency_frame = individual_small_facilities_survey_quality
correlation_deficiencies_rn_ratio_individual_small_facilities = deficiency_frame['avg_rn_ratio_individual_small_facility'].corr(deficiency_frame[deficiency_column])
print(f"\nCorrelation between Avg RN Contract Ratio and Total Health Deficiencies in Individually Owned Small Facilities: {correlation_deficiencies_rn_ratio_individual_small_facilities:.4f}")


# Quality Score Correlation (reuse logic, adjust quality_score_column if needed)
# NOTE(review): if qrp_provider has one row per facility AND measure, this
# correlation pools the scores of all measures - confirm against the data.
quality_score_column = 'score'
if quality_score_column in individual_small_facilities_survey_quality.columns:
    # Replace the column (df[col] = ...) instead of writing through
    # .loc[:, col]: the latter sets values into the existing column and can
    # leave it with its original (e.g. object) dtype, whereas replacing the
    # column guarantees the numeric dtype produced by pd.to_numeric.
    # Values that cannot be parsed become NaN (errors='coerce').
    individual_small_facilities_survey_quality[quality_score_column] = pd.to_numeric(individual_small_facilities_survey_quality[quality_score_column], errors='coerce')
    correlation_quality_rn_ratio_individual_small_facilities = individual_small_facilities_survey_quality['avg_rn_ratio_individual_small_facility'].corr(individual_small_facilities_survey_quality[quality_score_column])
    print(f"Correlation between Avg RN Contract Ratio and Quality Score in Individually Owned Small Facilities ({quality_score_column}): {correlation_quality_rn_ratio_individual_small_facilities:.4f}")
else:
    print(f"\nWarning: Quality score column '{quality_score_column}' not found in merged dataset. Skipping quality score correlation.")


print("=== Quality & Deficiencies Analysis for Individually Owned Small Facilities Complete ===")


# Cell 7: Further Geographic Analysis - State Distribution of Individually Owned Small Facilities

print("\n=== Further Geographic Analysis - State Distribution of Individually Owned Small Facilities ===")

individual_small_facilities_pbj_nurse = loaded_datasets['pbj_nurse_individual_small']

# One row per facility of the segment.
segment_facilities = individual_small_facilities_pbj_nurse[['provnum']].drop_duplicates()

# State lookup reduced to one row per CCN. Without this, a qrp_provider table
# holding several rows per facility would repeat the facility in the merge
# result, and the shares below would be weighted by rows per facility instead
# of counting each facility once. If the table already has one row per CCN,
# the de-duplication changes nothing.
facility_states = (
    loaded_datasets['qrp_provider'][['cms_certification_number_(ccn)', 'state']]
    .drop_duplicates(subset='cms_certification_number_(ccn)')
)

# Merge with qrp_provider to get state information; the left merge keeps
# facilities without a match (their 'state' is missing).
individual_small_facilities_with_state = pd.merge(segment_facilities,
                                                     facility_states,
                                                     left_on='provnum', right_on='cms_certification_number_(ccn)',
                                                     how='left')

# Share of facilities per state. normalize=True yields fractions in [0, 1]
# (not multiplied by 100); rows with a missing state are not counted.
state_counts_individual_small_facilities = individual_small_facilities_with_state['state'].value_counts(normalize=True)
print("\nState Distribution of Individually Owned Small Facilities (Percentage):")
print(state_counts_individual_small_facilities)

print("=== Geographic Analysis of Individually Owned Small Facilities Complete ===")


# Closing banner, followed by the name of the log file configured at the top
# of the script.
for closing_line in (
    "\n=== Comprehensive Analysis of Individually Owned Small Facilities Completed ===",
    f"Logs saved to: {LOG_FILE}",
):
    print(closing_line)