# 01 — EDA (target + features)

Scoring requirements we cover here:
- target distribution + class balance
- target analysis in time context
- feature types (numeric vs. categorical) + distributions
- missing values + anomalies
- dependencies between features
- correlations / importance proxy
- hypothesis-driven mini-research (with plots)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import Paths, TARGET_COL
from src.data.io import load_renthop_json
from src.features.build import build_base_dataframe

# Read the raw train/test JSON files from the configured raw-data directory.
paths = Paths()
raw_dir = paths.data_raw
train, test = load_renthop_json(
    raw_dir / "train.json",
    raw_dir / "test.json",
)

# All EDA below works on the base frame built from the training split only.
df = build_base_dataframe(train)
df.head()

## 1) Target analysis

In [None]:
# Absolute count per target class, then each class's share of all rows.
vc = df[TARGET_COL].value_counts()
share = vc.div(vc.sum()).rename("share")

display(vc)
display(share)

In [None]:
# Class balance as a bar chart: one bar per target class, height = row count.
class_counts = df[TARGET_COL].value_counts()
ax = class_counts.plot.bar()
ax.set(
    title="Target distribution (interest_level)",
    xlabel="class",
    ylabel="count",
)
plt.show()

### Target in time context (created timestamp)

In [None]:
# Interest levels over time (daily counts)
# Truncate the timestamps to calendar days on a standalone Series; this avoids
# copying the whole frame just to attach one helper column and keeps the
# values as datetime64 so the pivot below gets a real DatetimeIndex.
day = df["created_dt"].dt.normalize().rename("date")
daily = df.groupby([day, TARGET_COL]).size().reset_index(name="cnt")
pivot = daily.pivot(index="date", columns=TARGET_COL, values="cnt").fillna(0)

# rolling(7) below is a 7-ROW window. Insert explicit zero-count rows for any
# calendar day without listings so that 7 rows always span exactly 7 days.
pivot = pivot.asfreq("D", fill_value=0)

ax = pivot.rolling(7).mean().plot(figsize=(10,4))
ax.set_title("7-day rolling mean of listings by interest level")
ax.set_xlabel("date")
ax.set_ylabel("count")
plt.show()

## 2) Feature types + distributions

In [None]:
# Columns analysed as numeric features in the rest of the notebook.
numeric_cols = [
    "price",
    "bedrooms",
    "bathrooms",
    "latitude",
    "longitude",
    "n_photos",
    "n_features",
    "price_per_bed",
    "price_per_bath",
]

# Columns treated as categorical.
cat_cols = [
    "manager_id",
    "building_id",
    "street_address_clean",
    "display_address_clean",
]

# Summary statistics, one row per numeric feature.
df[numeric_cols].describe().transpose()

In [None]:
# One 50-bin histogram per key numeric feature, each shown as its own figure.
key_numeric = ("price", "bedrooms", "bathrooms", "n_photos", "n_features")
for name in key_numeric:
    hist_ax = df[name].hist(bins=50)
    hist_ax.set(title=f"Distribution: {name}")
    plt.show()

## 3) Anomalies (examples)

In [None]:
# Price anomalies: non-positive prices, then prices above the 99.5th percentile.
price = df["price"]
display(df.loc[price.le(0), ["listing_id", "price"]].head())

price_cap = price.quantile(0.995)
display(df.loc[price.gt(price_cap), ["listing_id", "price", "bedrooms", "bathrooms"]].head())

# Coordinates outside an approximate NYC bounding box. The strict comparisons
# are False for NaN, so rows with missing coordinates are not counted here.
latitude, longitude = df["latitude"], df["longitude"]
lat_bad = df[latitude.lt(40.3) | latitude.gt(41.1)]
lon_bad = df[longitude.lt(-74.5) | longitude.gt(-73.5)]
print("Bad latitude rows:", len(lat_bad))
print("Bad longitude rows:", len(lon_bad))

## 4) Missing values

In [None]:
# Fraction of missing values per column, highest first.
miss = df.isna().mean().sort_values(ascending=False)
top_missing = miss.head(20)
display(top_missing)

# Re-sort ascending so the horizontal bars grow from bottom to top.
ax = top_missing.sort_values().plot.barh(figsize=(8, 5))
ax.set(title="Top-20 columns by missing rate", xlabel="missing rate")
plt.show()

## 5) Feature dependencies

In [None]:
# Correlation heatmap for numeric features (matplotlib only)
corr = df[numeric_cols].corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(8,6))
# Pin the colour scale to the full correlation range [-1, 1] and use a
# diverging colormap. Without this, imshow stretches the colours to the data's
# own min/max, so zero correlation has no fixed colour and the sign of a
# correlation cannot be read off the plot.
im = ax.imshow(corr.values, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(range(len(corr.columns)), corr.columns, rotation=90)
ax.set_yticks(range(len(corr.index)), corr.index)
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
ax.set_title("Correlation (numeric features)")
plt.tight_layout()
plt.show()

## 6) Hypothesis checks (example mini-research)

Example hypotheses you can keep/refine:

1) **Price efficiency matters**: lower `price_per_bed` → higher probability of `high` interest  
2) **Photos help**: more photos → higher interest  
3) **Text richness matters**: longer descriptions → higher interest (up to saturation)

In [None]:
def boxplot_by_target(col: str) -> None:
    """Show a box plot of ``df[col]`` for each interest level.

    Classes are drawn in the fixed order low, medium, high. A class that does
    not occur in ``df[TARGET_COL]`` is skipped together with its tick label,
    so boxes and labels always line up. Missing values in ``col`` are dropped
    per class before plotting.

    Args:
        col: Name of a column of the notebook-level ``df``.

    Raises:
        ValueError: If none of the three classes occurs in the target column.
    """
    # Look up the present classes once, not once per candidate class.
    present = set(df[TARGET_COL].unique())
    classes = [k for k in ("low", "medium", "high") if k in present]
    if not classes:
        raise ValueError(
            f"No 'low'/'medium'/'high' values found in {TARGET_COL!r}; nothing to plot"
        )

    groups = [df.loc[df[TARGET_COL] == k, col].dropna().values for k in classes]

    _, ax = plt.subplots(figsize=(6, 4))
    ax.boxplot(groups)
    # Label the ticks from the same list the groups were built from. The
    # original always passed all three labels, which fails when a class is
    # absent. Setting them on the axes also avoids boxplot's `labels` keyword,
    # which newer Matplotlib releases deprecate in favour of `tick_labels`.
    ax.set_xticklabels(classes)
    ax.set_title(f"{col} by interest_level")
    ax.set_ylabel(col)
    plt.show()

# One box plot per feature examined in the hypotheses above.
for hypothesis_col in ("price_per_bed", "n_photos"):
    boxplot_by_target(hypothesis_col)