In [1]:
import os
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import logging

# Locations of the raw CSV inputs, the Parquet outputs, and the run log
RAW_DATA_PATH = "./data/raw/"
PROCESSED_DATA_PATH = "./data/processed/"
LOG_FILE = "./data/prepare_phase.log"

# Create the output directory before logging is configured: makedirs also
# creates the intermediate ./data directory that holds the log file, and it
# is a no-op when the directory already exists.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Send INFO-and-above records to the log file, each prefixed with a
# timestamp and the record's level name.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

logging.info("Starting Prepare Phase Script")

# Short dataset name -> raw CSV filename (resolved against RAW_DATA_PATH).
# The short name is also used as the stem of the Parquet output file.
datasets = dict(
    pbj_nurse="PBJ_Daily_Nurse_Staffing_Q2_2024.csv",
    pbj_non_nurse="PBJ_Daily_Non_Nurse_Staffing_Q2_2024.csv",
    qrp_provider="Skilled_Nursing_Facility_Quality_Reporting_Program_Provider_Data_Jan2025.csv",
    nh_survey="NH_SurveySummary_Jan2025.csv",
    # Filename uses the abbreviation "Msr" (corrected spelling)
    nh_quality_mds="NH_QualityMsr_MDS_Jan2025.csv",
    nh_ownership="NH_Ownership_Jan2025.csv",
    nh_citations="NH_HealthCitations_Jan2025.csv",
)

# Filled by the loading loop: short dataset name -> loaded DataFrame
dataframes = dict()

# Load every raw CSV, keep the DataFrame in `dataframes` for the inspection
# step, and persist a Parquet copy under PROCESSED_DATA_PATH. The loop is
# best-effort: a failure on one dataset is logged and printed, then the loop
# moves on to the next dataset.
for dataset_name, filename in datasets.items():
    file_path = os.path.join(RAW_DATA_PATH, filename)
    parquet_filepath = os.path.join(PROCESSED_DATA_PATH, dataset_name + ".parquet")

    logging.info(f"Attempting to load dataset: {dataset_name} from {file_path}")

    # The PBJ files are read as latin1; every other file uses the pandas
    # default encoding (the keyword is simply not passed).
    read_options = {"low_memory": False}
    if dataset_name in ("pbj_nurse", "pbj_non_nurse"):
        read_options["encoding"] = "latin1"

    # Step 1: read the CSV. Only failures of the read itself are reported as
    # load errors.
    try:
        df = pd.read_csv(file_path, **read_options)
    except FileNotFoundError:
        logging.error(f"Error: File not found for dataset: {dataset_name} at {file_path}")
        print(f"Error: File not found for dataset: {dataset_name} at {file_path}")
        continue
    except Exception as e:
        # logging.exception keeps the traceback in the log file.
        logging.exception(f"Error loading dataset: {dataset_name} from {file_path}. Error: {e}")
        print(f"Error loading dataset: {dataset_name} from {file_path}. Error: {e}")
        continue

    # The frame stays referenced here on purpose so the inspection step can
    # reuse it; no per-iteration `del` is done because it would not release
    # the memory while this reference exists.
    dataframes[dataset_name] = df
    logging.info(f"Dataset: {dataset_name} loaded successfully.")

    # Step 2: write the Parquet copy. A failure here is reported as a
    # conversion error (not a load error); the loaded frame remains available
    # in `dataframes`.
    logging.info(f"Converting {dataset_name} to Parquet format and saving to {parquet_filepath}")
    try:
        table = pa.Table.from_pandas(df)
        pq.write_table(table, parquet_filepath)
    except Exception as e:
        logging.exception(
            f"Error converting dataset: {dataset_name} to Parquet at {parquet_filepath}. Error: {e}"
        )
        print(f"Error converting dataset: {dataset_name} to Parquet at {parquet_filepath}. Error: {e}")
        continue
    logging.info(f"Dataset: {dataset_name} converted to Parquet and saved.")

logging.info("Data loading and conversion to Parquet completed.")

# --- Initial Data Inspection for All Datasets ---
# For each configured dataset, print a short summary (head, concise info,
# top missing-value columns, dtype counts). The in-memory DataFrame is used
# when the loading step produced one; otherwise the Parquet file is read if
# it exists; otherwise the dataset is skipped.
for dataset_name in datasets:
    print(f"\n--- Initial Data Inspection - Dataset: {dataset_name} ---")
    parquet_filepath = os.path.join(PROCESSED_DATA_PATH, dataset_name + ".parquet")

    if dataset_name in dataframes:
        df = dataframes[dataset_name]  # Use DataFrame if in memory
        print(f"Inspecting DataFrame from memory: {dataset_name}")
    elif os.path.exists(parquet_filepath):
        logging.info(f"Loading {dataset_name} dataset from Parquet for inspection.")
        df = pd.read_parquet(parquet_filepath)  # Load from Parquet
        logging.info(f"{dataset_name} dataset loaded from Parquet for inspection.")
        print(f"Inspecting DataFrame from Parquet: {dataset_name}")
    else:
        print(f"{dataset_name} DataFrame not loaded. Inspection skipped.")
        continue

    print("\n--- DataFrame Head: ---")
    print(df.head())

    print("\n--- DataFrame Info (Concise): ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it must not be wrapped in print() (that would emit a stray "None").
    df.info(verbose=False, memory_usage="deep")

    print("\n--- Missing Values (Top 5 Columns): ---")
    print(df.isnull().sum().sort_values(ascending=False).head(5))

    print("\n--- Data Types (Value Counts): ---")
    print(df.dtypes.value_counts())

logging.info("Prepare Phase Script Completed")
print("\nPrepare Phase Script Completed. Check prepare_phase.log for details.")


--- Initial Data Inspection - Dataset: pbj_nurse ---
Inspecting DataFrame from memory: pbj_nurse

--- DataFrame Head: ---
  PROVNUM                  PROVNAME          CITY STATE COUNTY_NAME  \
0  015009  BURNS NURSING HOME, INC.  RUSSELLVILLE    AL    Franklin   
1  015009  BURNS NURSING HOME, INC.  RUSSELLVILLE    AL    Franklin   
2  015009  BURNS NURSING HOME, INC.  RUSSELLVILLE    AL    Franklin   
3  015009  BURNS NURSING HOME, INC.  RUSSELLVILLE    AL    Franklin   
4  015009  BURNS NURSING HOME, INC.  RUSSELLVILLE    AL    Franklin   

   COUNTY_FIPS  CY_Qtr  WorkDate  MDScensus  Hrs_RNDON  ...  Hrs_LPN_ctr  \
0           59  2024Q2  20240401         51      10.77  ...          0.0   
1           59  2024Q2  20240402         52       8.43  ...          0.0   
2           59  2024Q2  20240403         53      11.13  ...          0.0   
3           59  2024Q2  20240404         52      12.27  ...          0.0   
4           59  2024Q2  20240405         52       4.95  ...          0