# 02 — Feature engineering & caching

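This notebook:
- builds a single model table (tabular features + raw text columns) from the combined train and test sets
- saves it to `data/processed/model_table.pkl` to speed up later experiments
- saves the feature spec (column lists by type) to `data/processed/feature_spec.json`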

In [2]:
import sys
from pathlib import Path

# Locate the project root so that the `src` package is importable whether the
# kernel was started in the repository root or in a direct subdirectory of it.
# If neither location contains `src`, the current working directory is kept.
PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / "src").exists() and (PROJECT_ROOT.parent / "src").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# Put the project root first on sys.path. Any existing copy of the same entry is
# removed beforehand so that re-running this cell stays idempotent instead of
# stacking up duplicate entries.
_root_entry = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("src exists:", (PROJECT_ROOT / "src").exists())

PROJECT_ROOT: /Users/sergey/code/renthop-lightautoml-vs-custom
src exists: True


In [3]:
import pandas as pd

from src.config import Paths
from src.data.io import load_renthop_json
from src.features.build import build_model_table

# Resolve the raw input files through the project's path configuration.
paths = Paths()
raw_dir = paths.data_raw
train, test = load_renthop_json(raw_dir / "train.json", raw_dir / "test.json")

# Stack train and test into one frame with a fresh index, then derive the model
# table and its feature spec from the combined frame.
full = pd.concat([train, test], ignore_index=True)
model_df, spec = build_model_table(full)

# Quick sanity check: overall shape plus the first three rows.
print("Full model table:", model_df.shape)
display(model_df.head(3))

Full model table: (124011, 34)


Unnamed: 0,listing_id,interest_level,bathrooms,bedrooms,price,latitude,longitude,n_photos,n_features,price_per_bed,...,feat_txt_n_unique,feat_txt_excl_cnt,feat_txt_caps_ratio,manager_id,building_id,street_address_clean,display_address_clean,description,features_text,created_dt
0,7170325,medium,1.0,1,2400,40.7108,-73.9539,12,7,1200.0,...,13.0,0.0,0.141304,a10db4590843d78c784171a107bdacb4,8579a0b0d54db803821a35a4a615e97a,145 borinquen place,145 borinquen place,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,Dining Room Pre-War Laundry in Building Dishwa...,2016-06-16 05:55:27
1,7092344,low,1.0,2,3800,40.7513,-73.9722,6,6,1266.666667,...,10.0,0.0,0.128571,955db33477af4f40004820b4aed804a0,b8e75fc949a6cd8225b455648a951712,230 east 44th,east 44th,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,Doorman Elevator Laundry in Building Dishwashe...,2016-06-01 05:44:33
2,7158677,medium,1.0,2,3495,40.7575,-73.9625,6,6,1165.0,...,9.0,0.0,0.113924,c8b10a317b766204f08e613cef4ce7a0,cd759a988b8f23924b5a2058d5ab2b49,405 east 56th street,east 56th street,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,Doorman Elevator Laundry in Building Laundry i...,2016-06-14 15:19:59


In [7]:
# Persist the model table to the processed-data directory as a pickle.
processed_dir = paths.data_processed
processed_dir.mkdir(parents=True, exist_ok=True)  # no-op when it already exists

out_path = processed_dir / "model_table.pkl"
model_df.to_pickle(out_path)
print("Saved:", out_path)

Saved: /Users/sergey/code/renthop-lightautoml-vs-custom/data/processed/model_table.pkl


In [8]:
# Save feature spec (for reproducible column lists)
import json

spec_path = paths.data_processed / "feature_spec.json"

# Pull the column groups off the spec object, keyed by attribute name.
spec_fields = ("numeric_cols", "categorical_cols", "text_cols", "datetime_cols")
spec_dict = {field: getattr(spec, field) for field in spec_fields}

# Serialize before writing so the file is only touched once encoding succeeded.
serialized = json.dumps(spec_dict, indent=2)
spec_path.write_text(serialized, encoding="utf-8")
print("Saved:", spec_path)

Saved: /Users/sergey/code/renthop-lightautoml-vs-custom/data/processed/feature_spec.json
