# 00 — Setup & data

This notebook:
- creates the project folder structure (raw/processed data, models, OOF, submissions) if it is missing
- validates that Kaggle data is present
- loads raw JSON files and prints a quick sanity summary

In [None]:
from pathlib import Path
import pandas as pd

from src.config import Paths
from src.data.io import load_renthop_json, load_sample_submission
from src.utils.seed import set_global_seed

# Project path configuration and a fixed seed (42) for the session.
paths = Paths()
set_global_seed(42)

# Make sure every working directory exists before anything reads or writes.
# `parents=True, exist_ok=True` means re-running this cell is harmless
# (assuming these attributes are pathlib.Path objects, as their use suggests).
for directory in (
    paths.data_raw,
    paths.data_processed,
    paths.models,
    paths.oof,
    paths.submissions,
):
    directory.mkdir(parents=True, exist_ok=True)

# Show the absolute locations so the reader can confirm the working layout.
print("Data raw dir:", paths.data_raw.resolve())
print("Artifacts dir:", paths.artifacts.resolve())

## Download data

If you have the Kaggle CLI configured:

```bash
kaggle competitions download -c two-sigma-connect-rental-listing-inquiries -p data/raw
unzip -o data/raw/two-sigma-connect-rental-listing-inquiries.zip -d data/raw
```

Expected files:
- `data/raw/train.json`
- `data/raw/test.json`
- `data/raw/sample_submission.csv`

In [None]:
# Validate that the raw Kaggle files are present, then load them.
# `paths`, `load_renthop_json` and `load_sample_submission` are defined by the
# setup cell at the top of the notebook, so they are not re-imported here.
train_path = paths.data_raw / "train.json"
test_path  = paths.data_raw / "test.json"
sub_path   = paths.data_raw / "sample_submission.csv"

# Gather every missing file first so a single run reports the complete list,
# and raise explicitly: `assert` statements are skipped under `python -O`.
missing = [p for p in (train_path, test_path, sub_path) if not p.exists()]
if missing:
    raise FileNotFoundError(
        "Missing: "
        + ", ".join(str(p) for p in missing)
        + " (see the 'Download data' section above)"
    )

train, test = load_renthop_json(train_path, test_path)
sub = load_sample_submission(sub_path)

# Quick sanity summary of what was loaded.
print("train:", train.shape)
print("test :", test.shape)
print("submission template:", sub.shape)
train.head(2)

In [None]:
# Quick sanity checks on the training frame, shown in one pass.
display(
    # Target distribution, with missing labels counted as their own bucket.
    train["interest_level"].value_counts(dropna=False),
    # Dtypes of the first 15 columns.
    train.dtypes.iloc[:15],
    # Fraction of nulls per column, ten worst columns first.
    train.isnull().mean().sort_values(ascending=False).iloc[:10],
)