In [1]:
# Cell 1: Setup and Logging

import os
import sys
import logging

# Location of the raw CSV files and of the log file this notebook writes.
RAW_DATA_PATH = "./data/raw/"
LOG_FILE = "raw_data_verification.log"

# logging.basicConfig() does nothing when the root logger already has handlers
# (for example when this cell is re-run in the same kernel), so the log file
# would neither be truncated nor re-pointed at LOG_FILE. Detach and close any
# existing root handlers first so the configuration below always takes effect.
logger = logging.getLogger()
for _handler in list(logger.handlers):
    logger.removeHandler(_handler)
    _handler.close()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILE,
    filemode="w",  # Truncate the log file every time this cell runs
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Redirect print statements to the logger
class LoggerWriter:
    """Minimal file-like object that forwards written text to a logging callable.

    Intended as a stand-in for ``sys.stdout`` / ``sys.stderr``. ``print`` emits
    its arguments, separators and the trailing newline as separate ``write``
    calls, so text is buffered until the buffer ends with a newline and is then
    forwarded as a single, whitespace-stripped message.
    """

    def __init__(self, level):
        """Store the callable (e.g. ``logger.info``) that receives each message."""
        self.level = level
        self._buffer = ""

    def write(self, message):
        """Buffer ``message`` and emit the buffer once it ends with a newline.

        Returns the number of characters accepted, as file-like ``write``
        methods are expected to.
        """
        self._buffer += message
        if self._buffer.endswith("\n"):
            self._emit()
        return len(message)

    def flush(self):
        """Emit any text still buffered (output not terminated by a newline)."""
        self._emit()

    def _emit(self):
        """Forward the buffered text to the logging callable and clear the buffer."""
        text = self._buffer.strip()
        # Clear before calling out so a failing callable cannot cause re-emission.
        self._buffer = ""
        if text:
            self.level(text)

# Swap both standard streams for logger-backed writers: stdout -> INFO, stderr -> ERROR
sys.stdout, sys.stderr = LoggerWriter(logger.info), LoggerWriter(logger.error)

# From here on, print() output is routed through the logger
for _banner_line in (
    "=== Raw Data Verification Notebook: Logging Setup Complete ===",
    f"Using RAW_DATA_PATH: {RAW_DATA_PATH}",
    f"Logs will be written to: {LOG_FILE}",
):
    print(_banner_line)


In [2]:
# Cell 2: Define Dataset Dictionary & Load Raw CSV Files

import pandas as pd

print("=== Starting Raw Data Verification ===")

# Dataset name -> raw CSV filename (joined onto RAW_DATA_PATH by the load loop)
raw_datasets = dict(
    nh_citations_raw="NH_HealthCitations_Jan2025.csv",
    nh_ownership_raw="NH_Ownership_Jan2025.csv",
    nh_quality_mds_raw="NH_QualityMsr_MDS_Jan2025.csv",
    nh_survey_raw="NH_SurveySummary_Jan2025.csv",
    pbj_non_nurse_raw="PBJ_Daily_Non_Nurse_Staffing_Q2_2024.csv",
    pbj_nurse_raw="PBJ_Daily_Nurse_Staffing_Q2_2024.csv",
    qrp_provider_raw="Skilled_Nursing_Facility_Quality_Reporting_Program_Provider_Data_Jan2025.csv",
)

# Filled in by the load loop: dataset name -> loaded DataFrame
loaded_raw_datasets = {}

def load_csv_safely(file_path, encodings=("utf-8", "windows-1252", "latin1")):
    """Load a CSV file, retrying with fallback encodings on decode errors.

    Args:
        file_path: Path of the CSV file to read.
        encodings: Encodings to try, in order. ``latin1`` is kept last because
            it maps every byte value and therefore never raises a decode
            error; any encoding listed after it would be unreachable.

    Returns:
        The loaded DataFrame, or ``None`` if the file could not be read.
        Failures are reported via ``print`` rather than raised.
    """
    for enc in encodings:
        try:
            df = pd.read_csv(file_path, encoding=enc, low_memory=False)
        except UnicodeError as e:
            # Wrong encoding guess: move on to the next candidate.
            print(f"Failed with encoding={enc}: {e}")
        except Exception as e:
            # Missing file, parse error, empty file, ...: a different encoding
            # will not help, so report once instead of retrying every encoding.
            print(f"Failed to load {file_path}: {e}")
            return None
        else:
            print(f"Loaded {os.path.basename(file_path)} with encoding={enc}, shape={df.shape}")
            return df
    print(f"All encodings failed for {file_path}")
    return None

# Load every configured dataset whose file is present on disk
for key, filename in raw_datasets.items():
    file_path = os.path.join(RAW_DATA_PATH, filename)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    df = load_csv_safely(file_path)
    if df is None:
        # The loader already reported the failure; leave this dataset out
        continue
    loaded_raw_datasets[key] = df

print("=== Summary of Loaded Raw Datasets ===")
if not loaded_raw_datasets:
    print("No raw datasets loaded. Check file paths or filenames.")
else:
    # One line per dataset: its shape and full column list
    for key, df in loaded_raw_datasets.items():
        print(f"{key}: shape={df.shape}, columns={list(df.columns)}")

print("=== Raw Data Verification - Load Step Completed ===")


In [3]:
# Cell 3: Basic Checks (Missing, Duplicates, Numeric Stats)

if not loaded_raw_datasets:
    print("No raw datasets to check. Please ensure data is loaded in Cell 2.")
else:
    for key, df in loaded_raw_datasets.items():
        print(f"\n=== Data Quality Checks for {key} ===")

        # 1. Missing values: count of NA entries in every column
        missing_counts = df.isna().sum()
        print("Missing Value Counts per Column:")
        print(missing_counts)

        # 2. Duplicate rows: rows that repeat an earlier row across all columns
        duplicates = df.duplicated().sum()
        print(f"Number of Duplicate Rows: {duplicates}")

        # 3. Numeric stats. "number" is the selector pandas documents for
        # picking all numeric columns, instead of the "int"/"float" aliases.
        numeric_cols = df.select_dtypes(include=["number"]).columns
        if len(numeric_cols) > 0:
            stats = df[numeric_cols].describe()
            print("Basic Stats for Numeric Columns:")
            print(stats)
        else:
            print("No numeric columns found for outlier checks.")

print("=== Raw Data Verification - Basic Checks Completed ===")


In [4]:
# Cell 4: (Optional) Compare with Known Preprocessed Shapes

# If you want to confirm the raw shapes match your final preprocessed shapes,
# you can hard-code the shapes you saw in your logs from the preprocessed data:

# Hard-coded (rows, columns) of the preprocessed data, keyed by dataset name
# without the '_raw' suffix used for the raw datasets.
preprocessed_shapes = dict(
    pbj_nurse=(1325324, 33),
    pbj_non_nurse=(1325324, 82),
    qrp_provider=(710016, 16),
    nh_survey=(44189, 41),
    nh_quality_mds=(251464, 23),
    nh_ownership=(144651, 13),
    nh_citations=(406789, 23),
)

# Report, dataset by dataset, how the raw shape relates to the preprocessed one
for key, shape in preprocessed_shapes.items():
    # Raw datasets are stored under "<name>_raw"
    raw_key = f"{key}_raw"
    if raw_key not in loaded_raw_datasets:
        print(f"Raw dataset not loaded for {raw_key}.")
        continue

    raw_df = loaded_raw_datasets[raw_key]
    raw_shape = raw_df.shape
    print(f"Comparing {raw_key} to preprocessed '{key}':")
    print(f"  Raw shape:          {raw_shape}")
    print(f"  Preprocessed shape: {shape}")

    # Only an exact (rows, columns) match counts as "same"; any difference
    # points at cleaning, filtering, or combining during preprocessing.
    if raw_shape != shape:
        print("  => Shapes differ. This could be normal if you removed duplicates, filtered rows, or changed columns.")
    else:
        print("  => Shapes match exactly!")
