In [14]:
import pandas as pd
from sqlalchemy import create_engine, text

# -------------------------------------
# DATABASE CONNECTION
# -------------------------------------

db_url = (
    "postgres://ufnbfacj9c7u80:"
    "pa129f8c5adad53ef2c90db10cce0c899f8c7bdad022cca4e85a8729b19aad68d"
    "@ceq2kf3e33g245.cluster-czrs8kj4isg7.us-east-1.rds.amazonaws.com:5432/d9f89h4ju1lleh"
)

db_url = db_url.replace("postgres://", "postgresql://")
engine = create_engine(db_url)

# -------------------------------------
# CHECK COUNTS (optional)
# -------------------------------------

with engine.connect() as conn:
    row_count = conn.execute(text("SELECT COUNT(*) FROM acs_housing_final")).scalar()
    print(f"acs_housing_final rows: {row_count:,}")

# -------------------------------------
# SQL
# -------------------------------------

query = """
    SELECT
        ahf.serialno,
        ahf.valp,
        ahf.hincp,
        ahf.fincp,
        ahf.bds,
        ahf.yrblt,
        ahf.np,
        ahf.region,
        ahf.puma_normalized AS puma,
        ahf.house_age,
        gcz.zip,
        gcz.county
    FROM acs_housing_final ahf
    LEFT JOIN geo_corr_zip gcz
        ON ahf.puma_normalized::text = gcz.puma22
    WHERE ahf.valp > 0
"""

print("Querying DB...")
df = pd.read_sql(query, engine)

print("\nSample rows:")
print(df.head())

print("\nColumns:", df.columns.tolist())
print("\nShape:", df.shape)
print("\nUnique ZIPs:", df["zip"].nunique())

# -------------------------------------
# Ready for ML prep
# -------------------------------------

# Example: Create log target and safe numeric types
df["valp_log"] = df["valp"].clip(lower=1).apply(lambda x: np.log(x))
df["hincp"] = pd.to_numeric(df["hincp"], errors="coerce").fillna(0)
df["fincp"] = pd.to_numeric(df["fincp"], errors="coerce").fillna(0)

print("\nReady for model training!")

acs_housing_final rows: 209,431
Querying DB...

Sample rows:
        serialno       valp     hincp     fincp  bds  yrblt  np  region  puma  \
0  2018HU0654721   175000.0   50000.0   50000.0    3   1980   2       4    -9   
1  2018HU0654727  2100000.0  235000.0  235000.0    5   2000   2       4    -9   
2  2018HU0654730   180000.0   40100.0   40100.0    3   1970   2       4    -9   
3  2018HU0654829   600000.0  106400.0  106400.0    3   1940   9       4    -9   
4  2018HU0654843   700000.0  315000.0  315000.0    4   1960   4       4    -9   

   house_age   zip county  
0         45  None   None  
1         25  None   None  
2         55  None   None  
3         85  None   None  
4         65  None   None  

Columns: ['serialno', 'valp', 'hincp', 'fincp', 'bds', 'yrblt', 'np', 'region', 'puma', 'house_age', 'zip', 'county']

Shape: (209949, 12)

Unique ZIPs: 4

Ready for model training!
