# 00 — Setup & data

This notebook:
- checks the folder structure
- validates that Kaggle data is present
- loads raw JSON files and prints a quick sanity summary

In [None]:
import sys
from pathlib import Path

# Locate the project root. The notebook may be started either from the repository
# root or from a directory one level below it, so fall back to the parent directory
# when `src` is not found next to the current working directory but is found there.
PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / "src").exists() and (PROJECT_ROOT.parent / "src").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# Put the project root first on sys.path so that `src` is importable.
# The list is rebuilt in place (slice assignment keeps the same list object) with any
# earlier copy of the entry dropped, so re-running this cell never stacks duplicates.
_root_entry = str(PROJECT_ROOT)
sys.path[:] = [_root_entry, *(entry for entry in sys.path if entry != _root_entry)]

print("PROJECT_ROOT:", PROJECT_ROOT)
print("src exists:", (PROJECT_ROOT / "src").exists())

PROJECT_ROOT: /Users/sergey/code/renthop-lightautoml-vs-custom
src exists: True


In [12]:
from pathlib import Path
import pandas as pd

from src.config import Paths
from src.data.io import load_renthop_json, load_sample_submission
from src.utils.seed import set_global_seed

# Project path configuration and a fixed global seed for the session.
paths = Paths()
set_global_seed(42)

# Create every working directory up front; existing directories are left untouched.
for _workdir in (
    paths.data_raw,
    paths.data_processed,
    paths.models,
    paths.oof,
    paths.submissions,
):
    _workdir.mkdir(parents=True, exist_ok=True)

# Show the resolved locations so the reader can confirm where files will live.
print("Data raw dir:", paths.data_raw.resolve())
print("Artifacts dir:", paths.artifacts.resolve())

Data raw dir: /Users/sergey/code/renthop-lightautoml-vs-custom/data/raw
Artifacts dir: /Users/sergey/code/renthop-lightautoml-vs-custom/artifacts


## Download data

If you have the Kaggle CLI configured:

```bash
kaggle competitions download -c two-sigma-connect-rental-listing-inquiries -p data/raw
unzip -o data/raw/two-sigma-connect-rental-listing-inquiries.zip -d data/raw
```

Expected files:
- `data/raw/train.json`
- `data/raw/test.json`
- `data/raw/sample_submission.csv`

In [13]:
from src.config import Paths
# Build a Paths instance for this cell.
paths = Paths()

# Locations of the three competition files inside the raw-data directory.
train_path, test_path, sub_path = (
    paths.data_raw / filename
    for filename in ("train.json", "test.json", "sample_submission.csv")
)

# Stop at the first missing file, naming it in the assertion message.
for _required_file in (train_path, test_path, sub_path):
    assert _required_file.exists(), f"Missing: {_required_file}"

# Load the train/test listings and the sample submission via the project helpers.
train, test = load_renthop_json(train_path, test_path)
sub = load_sample_submission(sub_path)

# Shape summary, then a two-row preview of the training frame as the cell output.
print(f"train: {train.shape}")
print(f"test : {test.shape}")
print(f"submission template: {sub.shape}")
train.head(2)

train: (49352, 15)
test : (74659, 15)
submission template: (74659, 4)


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium
6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low


In [14]:
# Quick checks: target class counts (NaN included) and the first 15 column dtypes
interest_counts = train["interest_level"].value_counts(dropna=False)
display(interest_counts)
leading_dtypes = train.dtypes.head(15)
display(leading_dtypes)

# Fraction of missing values per column, ten highest first
null_fraction = train.isna().mean()
top_null_fraction = null_fraction.sort_values(ascending=False).head(10)
display(top_null_fraction)

interest_level
low       34284
medium    11229
high       3839
Name: count, dtype: int64

bathrooms          float64
bedrooms             int64
building_id         object
created             object
description         object
display_address     object
features            object
latitude           float64
listing_id           int64
longitude          float64
manager_id          object
photos              object
price                int64
street_address      object
interest_level      object
dtype: object

bathrooms          0.0
bedrooms           0.0
building_id        0.0
created            0.0
description        0.0
display_address    0.0
features           0.0
latitude           0.0
listing_id         0.0
longitude          0.0
dtype: float64