In [3]:
import os
import pandas as pd
import logging

# Define directory paths
RAW_DATA_PATH = "./data/raw/"
INTERIM_DATA_PATH = "./data/interim/"
LOG_FILE = "./data/prepare_phase.log"

# Ensure interim data directory exists
os.makedirs(INTERIM_DATA_PATH, exist_ok=True)

# Set up logging
logging.basicConfig(filename=LOG_FILE, level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting Prepare Phase Script")

# Dictionary of datasets to load
datasets = {
    "pbj_nurse": "PBJ_Daily_Nurse_Staffing_Q2_2024.csv",
    "pbj_non_nurse": "PBJ_Daily_Non_Nurse_Staffing_Q2_2024.csv",
    "qrp_provider": "Skilled_Nursing_Facility_Quality_Reporting_Program_Provider_Data_Jan2025.csv",
    "nh_survey": "NH_SurveySummary_Jan2025.csv",
    "nh_quality_mds": "NH_QualityMsr_MDS_Jan2025.csv",
    "nh_ownership": "NH_Ownership_Jan2025.csv",
    "nh_citations": "NH_HealthCitations_Jan2025.csv"
}

# Function to load datasets safely
def load_dataset(file_path):
    encodings = ["utf-8", "latin1", "ISO-8859-1", "windows-1252"]
    for enc in encodings:
        try:
            df = pd.read_csv(file_path, encoding=enc, low_memory=False)
            print(f"Successfully loaded {os.path.basename(file_path)} with encoding: {enc}")
            return df
        except Exception as e:
            print(f"Failed with encoding {enc}: {e}")
    logging.error(f"Could not load {file_path} with any encoding.")
    return None

# Load all datasets
loaded_datasets = {}
for key, filename in datasets.items():
    file_path = os.path.join(RAW_DATA_PATH, filename)
    if os.path.exists(file_path):
        df = load_dataset(file_path)
        if df is not None:
            loaded_datasets[key] = df
            print(f"Dataset '{key}' loaded successfully. Shape: {df.shape}")
    else:
        logging.warning(f"Dataset {filename} not found in {RAW_DATA_PATH}")
        print(f"Warning: {filename} not found.")

# Verify all datasets loaded
print("\nSummary of Loaded Datasets:")
for key, df in loaded_datasets.items():
    print(f"{key}: {df.shape}")
    logging.info(f"{key} loaded with shape {df.shape}")

# Save processed datasets in interim folder as Parquet
for key, df in loaded_datasets.items():
    interim_path = os.path.join(INTERIM_DATA_PATH, f"preprocessed_{key}.parquet")
    df.to_parquet(interim_path, index=False)
    logging.info(f"Preprocessed dataset saved: {interim_path}")


Failed with encoding utf-8: 'utf-8' codec can't decode byte 0x92 in position 43: invalid start byte
Successfully loaded PBJ_Daily_Nurse_Staffing_Q2_2024.csv with encoding: latin1
Dataset 'pbj_nurse' loaded successfully. Shape: (1325324, 33)
Failed with encoding utf-8: 'utf-8' codec can't decode byte 0x92 in position 43: invalid start byte
Successfully loaded PBJ_Daily_Non_Nurse_Staffing_Q2_2024.csv with encoding: latin1
Dataset 'pbj_non_nurse' loaded successfully. Shape: (1325324, 82)
Successfully loaded Skilled_Nursing_Facility_Quality_Reporting_Program_Provider_Data_Jan2025.csv with encoding: utf-8
Dataset 'qrp_provider' loaded successfully. Shape: (710016, 16)
Successfully loaded NH_SurveySummary_Jan2025.csv with encoding: utf-8
Dataset 'nh_survey' loaded successfully. Shape: (44189, 41)
Successfully loaded NH_QualityMsr_MDS_Jan2025.csv with encoding: utf-8
Dataset 'nh_quality_mds' loaded successfully. Shape: (251464, 23)
Successfully loaded NH_Ownership_Jan2025.csv with encoding: u