# In [1]:
# Cell 1: Setup Logging and Load Prepared Datasets

import logging
import sys
import os
import pandas as pd

LOG_FILE = "exclusive_contract_days_analysis.log"

# Root-logger configuration: INFO and above goes to LOG_FILE.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so executing this again in the same interpreter session keeps the
# earlier configuration and does not truncate the file a second time.
_log_settings = {
    "level": logging.INFO,
    "filename": LOG_FILE,
    "filemode": "w",  # Overwrite previous logs on each run
    "format": "%(asctime)s - %(levelname)s - %(message)s",
}
logging.basicConfig(**_log_settings)
logger = logging.getLogger()  # called without a name: this is the root logger

# Redirect print statements to logger
class LoggerWriter:
    """Write-only, file-like adapter that forwards text to a logging callable.

    An instance can stand in for ``sys.stdout`` / ``sys.stderr`` so that
    ``print`` output ends up in the log.

    Args:
        level: Callable taking one string, e.g. ``logger.info``. It is called
            once for every non-blank chunk passed to :meth:`write`.
    """

    def __init__(self, level):
        self.level = level

    def write(self, message):
        """Forward ``message`` to the logging callable and report its length.

        Leading and trailing whitespace is stripped before logging, and
        chunks that are empty after stripping (such as the lone newline
        ``print`` writes after its text) are not logged.

        Returns:
            The number of characters in ``message``, which is what the
            text-stream ``write`` protocol expects callers to receive.
        """
        text = message.strip()
        if text:
            self.level(text)
        return len(message)

    def flush(self):
        """Do nothing; present because stream users call ``flush()``."""
        pass

# From this point on, text written to the standard streams is logged instead:
# stdout through logger.info, stderr through logger.error.
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

print("=== Logging Setup Complete ===")

# Directory that holds the prepared parquet files (adjust if necessary)
PREPARED_PATH = "./data/prepared/"

# Dataset name -> parquet file name inside PREPARED_PATH
files = dict(
    pbj_nurse="pbj_nurse_prepared.parquet",
    pbj_non_nurse="pbj_non_nurse_prepared.parquet",
    qrp_provider="qrp_provider_prepared.parquet",
    nh_survey="nh_survey_prepared.parquet",
    nh_quality_mds="nh_quality_mds_prepared.parquet",
    nh_ownership="nh_ownership_prepared.parquet",
    nh_citations="nh_citations_prepared.parquet",
)

# Read every dataset that can be read. A failure is reported and the dataset
# is simply left out of loaded_datasets, so later lookups of that key fail.
loaded_datasets = {}
for key, filename in files.items():
    file_path = os.path.join(PREPARED_PATH, filename)
    try:
        frame = pd.read_parquet(file_path)
    except Exception as e:
        print(f"Error loading {key}: {e}")
    else:
        loaded_datasets[key] = frame
        print(f"Loaded {key} with shape {frame.shape}")


# Cell 2: Identify Exclusive Contract RN Days

print("\n=== Identifying Exclusive Contract RN Days (>= 99% Contract Ratio) ===")

pbj_nurse = loaded_datasets['pbj_nurse']

# Add the derived RN columns only when the prepared data lacks them.
if 'total_rn_hours' not in pbj_nurse:
    pbj_nurse['total_rn_hours'] = pbj_nurse['hrs_rn_emp'] + pbj_nurse['hrs_rn_ctr']
if 'contract_rn_ratio' not in pbj_nurse:
    rn_total = pbj_nurse['total_rn_hours']
    pbj_nurse['contract_rn_ratio'] = pbj_nurse['hrs_rn_ctr'] / rn_total
    # Rows without any RN hours get a ratio of 0 instead of the division result.
    no_rn_hours = rn_total == 0
    pbj_nurse.loc[no_rn_hours, 'contract_rn_ratio'] = 0

# A row counts as an "exclusive contract" day at or above this contract share.
threshold_high_contract = 0.99

# Keep an independent copy of the qualifying rows.
meets_threshold = pbj_nurse['contract_rn_ratio'] >= threshold_high_contract
exclusive_contract_days_df = pbj_nurse.loc[meets_threshold].copy()
print(f"Shape of DataFrame with Exclusive Contract RN Days: {exclusive_contract_days_df.shape}")

print(f"Number of days with >= {threshold_high_contract*100}% contract RN usage: {len(exclusive_contract_days_df)}")
print(f"Number of unique facilities with >= {threshold_high_contract*100}% contract RN usage on at least one day: {exclusive_contract_days_df['provnum'].nunique()}")

# Publish the subset so the following cells can pick it up.
loaded_datasets['exclusive_contract_days'] = exclusive_contract_days_df

print("=== Exclusive Contract RN Days Identification Complete ===")


# Cell 3: Analyze Day of Week Distribution of Exclusive Contract Days

print("\n=== Analyzing Day of Week Distribution of Exclusive Contract Days ===")

exclusive_contract_days_df = loaded_datasets['exclusive_contract_days']

# Weekday name for every row. The .dt accessor only works on datetime-like
# data, so this assumes 'workdate' was stored as a datetime column.
exclusive_contract_days_df['day_of_week'] = exclusive_contract_days_df['workdate'].dt.day_name()

# value_counts(normalize=True) yields fractions of 1; day_of_week_counts keeps
# that scale for any later use.
day_of_week_counts = exclusive_contract_days_df['day_of_week'].value_counts(normalize=True)
print("\nDay of Week Distribution of Exclusive Contract RN Days (Percentage):")
# Scale to 0-100 so the printed numbers match the "(Percentage)" header.
print(day_of_week_counts * 100)

print("=== Day of Week Distribution Analysis Complete ===")


# Cell 4: Analyze Census Distribution on Exclusive Contract Days vs. All Days

print("\n=== Analyzing Census Distribution on Exclusive Contract Days vs. All Days ===")

exclusive_contract_days_df = loaded_datasets['exclusive_contract_days']
pbj_nurse = loaded_datasets['pbj_nurse']

# Summarise 'mdscensus' first for the exclusive-contract rows, then for all rows.
census_views = (
    ("\nCensus Statistics on Exclusive Contract RN Days:", exclusive_contract_days_df),
    ("\nCensus Statistics on ALL Days (from pbj_nurse):", pbj_nurse),
)
for census_heading, census_frame in census_views:
    print(census_heading)
    print(census_frame['mdscensus'].describe())

print("=== Census Distribution Analysis Complete ===")


# Cell 5: Analyze Quality and Deficiencies for Facilities with Exclusive Contract Days vs. All Facilities

print("\n=== Analyzing Quality & Deficiencies for Facilities with Exclusive Contract Days vs. All Facilities ===")

exclusive_contract_days_df = loaded_datasets['exclusive_contract_days']
nh_survey = loaded_datasets['nh_survey']
qrp_provider = loaded_datasets['qrp_provider']

ccn_column = 'cms_certification_number_(ccn)'
deficiency_column = 'total_number_of_health_deficiencies'
quality_score_column = 'score' # Adjust if needed


def _source_column(column, survey_rows, qrp_rows, merged_rows):
    """Return `column` from the un-joined table that has it, else from the join.

    The left join repeats a row once for every matching row on the other
    side, so a mean taken over the joined frame weights facilities by their
    number of matches. Reading the column from its own table counts every
    source row once, and gives the same mean as the joined frame whenever
    the join is one-to-one.
    """
    for frame in (survey_rows, qrp_rows):
        if column in frame.columns:
            return frame[column]
    return merged_rows[column]


def _report_quality_metrics(heading, group_description, survey_rows, merged_rows):
    """Print mean health deficiencies and mean quality score for one group.

    Args:
        heading: Banner line printed before the metrics.
        group_description: Text appended to the "column not found" warning.
        survey_rows: The group's nh_survey rows before the join.
        merged_rows: survey_rows left-joined with qrp_provider. Its quality
            score column is converted to numeric in place when present.
    """
    # qrp_provider rows of the facilities that appear in this group.
    qrp_rows = qrp_provider[qrp_provider[ccn_column].isin(survey_rows[ccn_column])]

    print(heading)
    print("\nAverage Total Health Deficiencies:")
    print(_source_column(deficiency_column, survey_rows, qrp_rows, merged_rows).mean())

    if quality_score_column in merged_rows.columns:
        # Plain column assignment replaces the column, so it takes the numeric
        # dtype; `.loc[:, col] = ...` may write into the existing column and
        # keep its previous dtype on pandas 2.x.
        merged_rows[quality_score_column] = pd.to_numeric(merged_rows[quality_score_column], errors='coerce')
        scores = pd.to_numeric(
            _source_column(quality_score_column, survey_rows, qrp_rows, merged_rows),
            errors='coerce',
        )
        print(f"Average Quality Score ({quality_score_column}):")
        print(scores.mean())
    else:
        print(f"Warning: Quality score column '{quality_score_column}' not found for {group_description}.")


# Get unique provnums of facilities with exclusive contract days
# NOTE(review): the isin() match below assumes 'provnum' and the CCN column use
# the same dtype and formatting; a mismatch silently selects no rows - confirm.
exclusive_facility_provnums = exclusive_contract_days_df['provnum'].unique()
print(f"\nNumber of facilities with exclusive contract days: {len(exclusive_facility_provnums)}")

# Survey rows of those facilities, and the same rows joined with the quality data
exclusive_facilities_survey = nh_survey[nh_survey[ccn_column].isin(exclusive_facility_provnums)].copy()
exclusive_facilities_survey_quality = pd.merge(exclusive_facilities_survey, qrp_provider, left_on=ccn_column, right_on=ccn_column, how='left')

# Using all surveys for comparison
all_facilities_survey_quality = pd.merge(nh_survey, qrp_provider, left_on=ccn_column, right_on=ccn_column, how='left')

_report_quality_metrics(
    "\n--- Quality & Deficiency Metrics for Facilities WITH Exclusive Contract Days ---",
    "facilities with exclusive contract days",
    exclusive_facilities_survey,
    exclusive_facilities_survey_quality,
)

_report_quality_metrics(
    "\n--- Quality & Deficiency Metrics for ALL Facilities (for comparison) ---",
    "all facilities",
    nh_survey,
    all_facilities_survey_quality,
)


print("=== Quality & Deficiencies Analysis Complete ===")


# Cell 6: Analyze Ownership Type of Facilities with Exclusive Contract Days

print("\n=== Analyzing Ownership Type of Facilities with Exclusive Contract Days ===")

exclusive_contract_days_df = loaded_datasets['exclusive_contract_days']
nh_ownership = loaded_datasets['nh_ownership']

# Derive the facility IDs from the exclusive-contract rows here, so this cell
# does not depend on another cell having defined them first.
exclusive_facility_provnums = exclusive_contract_days_df['provnum'].unique()

# Filter nh_ownership to only include facilities with exclusive contract days
exclusive_facilities_ownership = nh_ownership[nh_ownership['cms_certification_number_(ccn)'].isin(exclusive_facility_provnums)]

# Share of rows per owner_type, as fractions of 1 (normalize=True).
# NOTE(review): this counts nh_ownership rows, not facilities; if a facility
# has several ownership rows it contributes once per row - confirm intent.
ownership_type_counts_exclusive_facilities = exclusive_facilities_ownership['owner_type'].value_counts(normalize=True)
print("\nOwnership Type Distribution for Facilities with Exclusive Contract Days:")
print(ownership_type_counts_exclusive_facilities)

# Compare to ownership distribution for ALL facilities (for context)
all_facilities_ownership = nh_ownership['owner_type'].value_counts(normalize=True)
print("\nOwnership Type Distribution for ALL Facilities (for comparison):")
print(all_facilities_ownership)


print("=== Ownership Type Analysis of Facilities with Exclusive Contract Days Complete ===")


# Cell 7: Geographic Distribution (State) of Facilities with Exclusive Contract Days

print("\n=== Geographic Distribution (State) of Facilities with Exclusive Contract Days ===")

exclusive_contract_days_df = loaded_datasets['exclusive_contract_days']
qrp_provider = loaded_datasets['qrp_provider']

ccn_column = 'cms_certification_number_(ccn)'

# Derive the facility IDs from the exclusive-contract rows here, so this cell
# does not depend on another cell having defined them first.
exclusive_facility_provnums = exclusive_contract_days_df['provnum'].unique()

# Filter qrp_provider to only include facilities with exclusive contract days
exclusive_facilities_qrp = qrp_provider[qrp_provider[ccn_column].isin(exclusive_facility_provnums)]

# Count facilities per state: keep one row per CCN first, so a facility that
# appears on several qrp_provider rows is counted once. This is a no-op when
# CCNs are already unique. value_counts(normalize=True) yields fractions of 1.
state_counts_exclusive_facilities = (
    exclusive_facilities_qrp.drop_duplicates(subset=ccn_column)['state'].value_counts(normalize=True)
)
print("\nState Distribution of Facilities with Exclusive Contract Days (Percentage):")
# Scale to 0-100 so the printed numbers match the "(Percentage)" header.
print((state_counts_exclusive_facilities * 100).head(10)) # Show top 10 states

# Compare to state distribution for ALL facilities (from qrp_provider - top 10 for brevity)
all_facilities_state_counts = (
    qrp_provider.drop_duplicates(subset=ccn_column)['state'].value_counts(normalize=True)
)
print("\nState Distribution for ALL Facilities (Top 10 - for comparison):")
# Same 0-100 scale as the exclusive-contract figures it is compared with.
print((all_facilities_state_counts * 100).head(10))


print("=== Geographic Distribution Analysis of Facilities with Exclusive Contract Days Complete ===")


print("\n=== Comprehensive Analysis of Exclusive Contract Days Completed ===")
print(f"Logs saved to: {LOG_FILE}")