In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from pathlib import Path

# Output locations for the stratified split (relative to the working directory).
train_csv = Path("data/train/housing_train.csv")
test_csv = Path("data/test/housing_test.csv")

# make dirs
# NOTE(review): the original cell only created "../data/train" and
# "../data/test", while the CSVs are written under "data/...". to_csv cannot
# write into a directory that does not exist, so the parents of the real
# output files are created as well. The "../data/..." directories are still
# created in case other cells rely on them -- confirm which layout is intended.
Path("../data/train").mkdir(parents=True, exist_ok=True)
Path("../data/test").mkdir(parents=True, exist_ok=True)
for out_path in (train_csv, test_csv):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# load raw data
df = pd.read_csv("../data/raw/housing.csv")

# stratify by income_cat
# The last bin is open-ended: using the observed maximum as the upper edge
# makes pd.cut fail (non-increasing / duplicate edges) whenever that maximum
# is <= 6, and gives the same categories otherwise.
df["income_cat"] = pd.cut(df["median_income"],
                          bins=[0., 1.5, 3., 4.5, 6., float("inf")],
                          labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_idx, test_idx = next(split.split(df, df["income_cat"]))

# split() yields positional row numbers, so select with .iloc (label-based
# .loc only happens to work while df keeps its default 0..n-1 index).
strat_train = df.iloc[train_idx].drop(columns=["income_cat"])
strat_test = df.iloc[test_idx].drop(columns=["income_cat"])

# keep the 10 original columns (including target + ocean_proximity)
cols = ["longitude", "latitude", "housing_median_age", "total_rooms",
        "total_bedrooms", "population", "households", "median_income",
        "median_house_value", "ocean_proximity"]

strat_train[cols].to_csv(train_csv, index=False)
strat_test[cols].to_csv(test_csv, index=False)

In [5]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

# Name of the label column; every other column is a predictor.
target = "median_house_value"

# Read the training split back from disk and separate label from predictors.
train_raw = pd.read_csv("data/train/housing_train.csv")
y = train_raw[target]
X = train_raw.drop(columns=[target])

# engineered features
def add_features(A, rooms_ix=3, bedrooms_ix=4, population_ix=5, households_ix=6):
    """Append three ratio features to a 2-D numeric array.

    The defaults match the numeric column order of the training CSV once the
    target is dropped: longitude, latitude, housing_median_age, total_rooms (3),
    total_bedrooms (4), population (5), households (6), median_income.
    The previous version read households from column 5 and population from
    column 6, i.e. swapped, so the "per household" ratios were wrong.

    Parameters
    ----------
    A : array-like of shape (n_samples, n_features)
        Numeric input; converted with ``np.asarray(A, dtype=float)``.
    rooms_ix, bedrooms_ix, population_ix, households_ix : int
        Column positions of the corresponding raw features in ``A``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features + 3)
        ``A`` followed by rooms_per_household, bedrooms_per_room and
        population_per_household, in that order.
    """
    A = np.asarray(A, dtype=float)
    total_rooms = A[:, rooms_ix]
    total_bedrooms = A[:, bedrooms_ix]
    population = A[:, population_ix]
    households = A[:, households_ix]

    # Denominators are clamped to at least 1 so a zero count cannot cause a
    # division by zero.
    safe_households = np.maximum(households, 1)
    safe_rooms = np.maximum(total_rooms, 1)

    rooms_per_household = total_rooms / safe_households
    bedrooms_per_room = total_bedrooms / safe_rooms
    population_per_household = population / safe_households
    return np.c_[A, rooms_per_household, bedrooms_per_room, population_per_household]

# Numeric predictors go through the numeric pipeline; every remaining column
# is treated as categorical.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]


def _engineered_feature_names(transformer, input_features):
    """Output names for the ``engineer`` step: inputs plus the three ratios.

    ``add_features`` appends three columns, so ``feature_names_out="one-to-one"``
    would report too few names. The order here mirrors the order in which
    ``add_features`` appends its columns.
    """
    return [*input_features,
            "rooms_per_household", "bedrooms_per_room", "population_per_household"]


# Numeric branch: fill gaps with the median, add ratio features, standardize.
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("engineer", FunctionTransformer(add_features,
                                     feature_names_out=_engineered_feature_names)),
    ("scale", StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# verbose_feature_names_out=False keeps plain names (no "num__"/"cat__" prefix).
pre = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols)
], verbose_feature_names_out=False)

Xt = pre.fit_transform(X)
# ColumnTransformer may return a sparse matrix; densify before building a frame.
Xt = Xt.toarray() if hasattr(Xt, "toarray") else Xt

# Label the processed columns with real feature names instead of 0..n-1 so the
# saved CSV is self-describing; column order is unchanged and the target stays last.
proc = pd.DataFrame(Xt, columns=pre.get_feature_names_out())
proc[target] = y.values
proc.to_csv("data/train/housing_train_processed.csv", index=False)