## 1. Load libraries and set the project path once and for all

In [1]:
import os, sys
import pandas as pd

# Resolve the project root as the parent of the current working directory.
# This assumes the kernel's working directory is the `notebooks/` folder.
project_root = os.path.dirname(os.getcwd())  # 1 level up

# Drop any earlier copies first so that re-running this cell leaves exactly
# one entry for the project root, and keeps it at the front of sys.path
# (a plain insert would stack a new duplicate on every re-run).
while project_root in sys.path:
    sys.path.remove(project_root)
sys.path.insert(0, project_root)

print(f"Current working directory: {os.getcwd()}")
print(f"Project root added to path: {project_root}")
print(f"Updated sys.path[0]: {sys.path[0]}")

Current working directory: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys\notebooks
Project root added to path: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys
Updated sys.path[0]: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys


In [2]:
from src.main.scripts.loaders.data_loader import DataLoader
from src.main.scripts.loaders.description_loader import DescriptionLoader
from src.main.scripts.cleaners.description_cleaner import DescriptionCleaner
from src.main.scripts.cleaners.data_cleaner import DataCleaner

## 2. Load data and description

In [3]:
# Location of the raw XPT data file, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/LLCP2024.xpt"

# Build the loader, trigger the read, then keep a handle on what it loaded.
data_loader = DataLoader(raw_data_path=RAW_DATA_PATH)
data_loader.read_data()
raw_df = data_loader.raw_df

2026-02-01 17:05:58,357 - src.main.scripts.loaders.data_loader - INFO - Initializing DataLoader with raw_data_path='../data/LLCP2024.xpt'
2026-02-01 17:05:58,358 - src.main.scripts.loaders.data_loader - INFO - DataLoader initialized successfully
2026-02-01 17:05:58,358 - src.main.scripts.loaders.data_loader - INFO - Reading XPT data from: ../data/LLCP2024.xpt
2026-02-01 17:06:49,914 - src.main.scripts.loaders.data_loader - INFO - Data loaded successfully: 457670 rows, 301 columns


In [4]:
# Location of the HTML description file, relative to the notebook's working directory.
RAW_DESCRIPTION_PATH = "../docs/USCODE24_LLCP_082125.HTML"

# Build the loader, trigger the read, then keep a handle on the raw text it loaded.
description_loader = DescriptionLoader(raw_description_path=RAW_DESCRIPTION_PATH)
description_loader.read_description()
raw_description = description_loader.raw_description

2026-02-01 17:06:50,016 - src.main.scripts.loaders.description_loader - INFO - Initializing DescriptionLoader with description_path='../docs/USCODE24_LLCP_082125.HTML'
2026-02-01 17:06:50,018 - src.main.scripts.loaders.description_loader - INFO - DescriptionLoader initialized successfully
2026-02-01 17:06:50,018 - src.main.scripts.loaders.description_loader - INFO - Reading HTML description from: ../docs/USCODE24_LLCP_082125.HTML
2026-02-01 17:06:54,361 - src.main.scripts.loaders.description_loader - INFO - Description loaded successfully: 265751 characters


## 3. Clean description, then data

In [5]:
# The description is cleaned first because the data cleaner below takes the
# cleaned description as one of its inputs.
description_cleaner = DescriptionCleaner(
    raw_description=raw_description,
)
description_cleaner.clean()

# Result of the cleaning step, read back from the cleaner instance.
clean_description = description_cleaner.clean_description

2026-02-01 17:06:54,404 - src.main.scripts.cleaners.description_cleaner - INFO - Initializing DescriptionCleaner with your 'raw_description'
2026-02-01 17:06:54,406 - src.main.scripts.cleaners.description_cleaner - INFO - Cleaning your text of 265699 characters
2026-02-01 17:06:54,449 - src.main.scripts.cleaners.description_cleaner - INFO - Cleaned your text from 265699 to 265233 characters


In [6]:
# Clean the raw data, handing the cleaner both the raw frame and the
# already-cleaned description.
data_cleaner = DataCleaner(
    raw_df=raw_df,
    clean_description=clean_description,
)
data_cleaner.clean()

# The cleaner exposes its result on `.df`.
# NOTE(review): whether `raw_df` itself is modified by `clean()` is not visible here — confirm.
clean_df = data_cleaner.df

2026-02-01 17:06:54,473 - src.main.scripts.cleaners.data_cleaner - INFO - Initializing DataCleaner with your 'raw_df' and 'clean_description'
2026-02-01 17:06:54,762 - src.main.scripts.cleaners.data_cleaner - INFO - Starting clean()
2026-02-01 17:06:55,394 - src.main.scripts.cleaners.data_cleaner - INFO - Removed 60 BRFSS computed variables successfully
2026-02-01 17:06:56,267 - src.main.scripts.cleaners.data_cleaner - INFO - Removed 8 BRFSS record identification variables successfully
2026-02-01 17:06:59,132 - src.main.scripts.cleaners.data_cleaner - INFO - Removed 7 variables having an extremely dominant value successfully
2026-02-01 17:06:59,951 - src.main.scripts.cleaners.data_cleaner - INFO - Removed 6 unwanted 'DIAB' columns successfully
2026-02-01 17:07:00,798 - src.main.scripts.cleaners.data_cleaner - INFO - Reorganized DIABETE4 column successfully
2026-02-01 17:07:04,500 - src.main.scripts.cleaners.data_cleaner - INFO - Encoded 207 variables as categorical successfully
2026-02

## 4. Perform EDA