
# EDA Geospatial Data — **GeoPandas Only** (Converted from PyDeck)

Notebook ini mengganti seluruh visualisasi **PyDeck** menjadi **GeoPandas + Matplotlib** (tanpa 3D).  
Seluruh visualisasi 3D disetarakan menjadi **poligon** dengan gradasi **kuning → merah** (*YlOrRd*: kuning = intensitas terendah, merah = intensitas tertinggi).

### Output visual yang tersedia
1. **Sebaran titik PU** (warna per klaster jika tersedia `cluster`).
2. **Hull poligon per klaster** (pengganti ColumnLayer 3D), warna berdasarkan **jumlah titik**.
3. **Choropleth PU** per wilayah (jika boundary tersedia).
4. **Choropleth DO** per wilayah (jika boundary tersedia).
5. **Label centroid** pada hull (identitas klaster & jumlah titik).

> Jika belum ada kolom `cluster`, scatter tetap tampil (warna seragam) dan hull per-klaster akan dilewati.


In [None]:

# (Optional) install the geospatial stack only when GeoPandas cannot be imported.
try:
    import geopandas as gpd  # noqa
except ImportError:
    # `%pip` is an IPython magic, so this cell only runs inside a Jupyter/IPython kernel.
    %pip install -q geopandas shapely pyproj rtree matplotlib pandas numpy


In [None]:

import os
import warnings
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.ops import unary_union

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# the libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)
# Render Matplotlib figures at 140 DPI.
plt.rcParams["figure.dpi"] = 140


In [None]:

# ====== INPUT FILE CONFIGURATION (auto-detect) ======
# Directories searched for every input. The lookup walks directories in the
# outer loop, so directory order takes precedence over file-name order.
CANDIDATE_DIRS = [".", "./data", "/mnt/data"]

# Accepted file names per input, tried in this order inside each directory.
TRIPS_FILE_NAMES = ["jakarta_ride_trips_5000.csv", "ride_trips.csv", "trips.csv"]
POIS_FILE_NAMES = ["jakarta_pois.csv", "pois.csv"]
AREAS_FILE_NAMES = ["jakarta_areas.csv", "areas.csv", "areas.geojson", "areas.geo.csv"]

def find_first_existing(candidates, filenames):
    """Return the path of the first existing file among the candidates.

    Directories are tried in the given order and, inside each directory, the
    file names are tried in the given order, so directory order takes
    precedence over file-name order.

    Args:
        candidates: Iterable of directory paths to search.
        filenames: Iterable of file names to look for in each directory.

    Returns:
        The joined path of the first regular file found, or ``None`` when
        nothing matches.
    """
    for directory in candidates:
        for filename in filenames:
            path = os.path.join(directory, filename)
            # isfile rather than exists: a directory that happens to carry one
            # of the expected names must not be reported as a readable input.
            if os.path.isfile(path):
                return path
    return None

# Resolve each input to its first match across the candidate directories;
# a path stays None when none of its accepted file names is found.
TRIPS_PATH, POIS_PATH, AREAS_PATH = (
    find_first_existing(CANDIDATE_DIRS, names)
    for names in (TRIPS_FILE_NAMES, POIS_FILE_NAMES, AREAS_FILE_NAMES)
)

# Echo the resolved paths so a missing input is visible right away.
print("TRIPS_PATH :", TRIPS_PATH)
print("POIS_PATH  :", POIS_PATH)
print("AREAS_PATH :", AREAS_PATH)


In [None]:

# ====== LOAD DATA ======
# Each input starts as None; the visualisation cells check for None and skip
# the plots whose inputs are missing.
df_trips, df_pois, gdf_areas = None, None, None

if TRIPS_PATH and os.path.exists(TRIPS_PATH):
    df_trips = pd.read_csv(TRIPS_PATH)
    print("Loaded trips:", df_trips.shape)
else:
    print("Peringatan: File trips tidak ditemukan. Set TRIPS_PATH manual.")

# POIs are optional: a read failure is reported and df_pois stays None.
if POIS_PATH and os.path.exists(POIS_PATH):
    try:
        df_pois = pd.read_csv(POIS_PATH)
        print("Loaded POIs:", df_pois.shape)
    except Exception as e:
        print("POI tidak terbaca sebagai CSV:", e)

# Boundary (GeoJSON, or CSV carrying a WKT geometry column)
if AREAS_PATH and os.path.exists(AREAS_PATH):
    # A single guard covers both formats: the boundary is optional, so an
    # unreadable GeoJSON must be skipped just like an unreadable CSV instead of
    # aborting the cell.
    try:
        if AREAS_PATH.lower().endswith(".geojson"):
            gdf_areas = gpd.read_file(AREAS_PATH)
        else:
            from shapely import wkt as _wkt

            df_areas = pd.read_csv(AREAS_PATH)
            # Columns accepted as the WKT geometry source (case-insensitive).
            wkt_cols = [c for c in df_areas.columns if c.lower() in ["geometry", "wkt", "geom"]]
            if wkt_cols:
                geom_col = wkt_cols[0]
                gdf_areas = gpd.GeoDataFrame(
                    df_areas.copy(),
                    # Blank or non-string cells become a missing geometry
                    # rather than a parse error.
                    geometry=df_areas[geom_col].apply(
                        lambda x: _wkt.loads(x) if isinstance(x, str) and x.strip() else None
                    ),
                    crs="EPSG:4326",
                )
            else:
                print("Tidak ditemukan kolom WKT pada areas CSV; boundary dilewati.")
                gdf_areas = None
    except Exception as e:
        # Never leave a half-built boundary behind after a failure.
        gdf_areas = None
        print("Gagal membaca areas. Boundary akan di-skip:", e)
else:
    print("Info: File boundary opsional tidak ditemukan.")


In [None]:

# ====== DETEKSI KOLOM LON/LAT ======
def _first_match(cols, names):
    lower = {c.lower(): c for c in cols}
    for n in names:
        if n in lower:
            return lower[n]
    return None

def guess_lon_lat(df, prefixes):
    """Find a longitude/latitude column pair in ``df`` for one of ``prefixes``.

    Prefixes are tried in order. For each prefix the longitude column is looked
    up as ``<prefix>_longitude``, ``<prefix>_lon`` or ``<prefix>_lng`` and the
    latitude column as ``<prefix>_latitude`` or ``<prefix>_lat``. A prefix is
    accepted only when both columns are found under it.

    Args:
        df: Object exposing a ``columns`` attribute (e.g. a DataFrame).
        prefixes: Column-name prefixes to try, most preferred first.

    Returns:
        ``(lon_col, lat_col)`` for the first prefix that yields both columns,
        otherwise ``(None, None)``.
    """
    lon_suffixes = ("longitude", "lon", "lng")
    lat_suffixes = ("latitude", "lat")
    for prefix in prefixes:
        lon_col = _first_match(df.columns, [f"{prefix}_{suffix}" for suffix in lon_suffixes])
        lat_col = _first_match(df.columns, [f"{prefix}_{suffix}" for suffix in lat_suffixes])
        if lon_col and lat_col:
            return lon_col, lat_col
    return None, None

# Detected coordinate columns; they all stay None when no trips table was
# loaded, and a pair stays None when no matching column names are found.
pu_lon = pu_lat = do_lon = do_lat = None
if df_trips is not None:
    # Pickup and drop-off are resolved independently, each with its own aliases.
    pu_lon, pu_lat = guess_lon_lat(df_trips, ["pickup", "pu", "start"])
    do_lon, do_lat = guess_lon_lat(df_trips, ["dropoff", "do", "end"])
    print("PU lon/lat:", pu_lon, pu_lat)
    print("DO lon/lat:", do_lon, do_lat)


In [None]:

# ====== KONVERSI KE GEODATAFRAME ======
def to_gdf_points(df, lon_col, lat_col, crs="EPSG:4326"):
    """Build a point GeoDataFrame from longitude/latitude columns.

    Rows whose coordinates are missing, or cannot be parsed as numbers, are
    dropped. The input frame is not modified.

    Args:
        df: Source table, or ``None``.
        lon_col: Name of the longitude column, or ``None``.
        lat_col: Name of the latitude column, or ``None``.
        crs: CRS assigned to the resulting geometries.

    Returns:
        A GeoDataFrame with one point per remaining row, or ``None`` when any
        of ``df``, ``lon_col`` or ``lat_col`` is ``None``.
    """
    if df is None or lon_col is None or lat_col is None:
        return None

    coord_cols = [lon_col, lat_col]
    # Drop missing values first, then coerce: values that fail to parse turn
    # into NaN and are removed by the second pass.
    cleaned = df.dropna(subset=coord_cols).copy()
    for col in coord_cols:
        cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce")
    cleaned = cleaned.dropna(subset=coord_cols)

    points = gpd.points_from_xy(cleaned[lon_col], cleaned[lat_col])
    return gpd.GeoDataFrame(cleaned, geometry=points, crs=crs)

# Build the pickup (PU) and drop-off (DO) point layers from the same trips
# table; a layer is None when its inputs are unavailable.
gdf_pu = to_gdf_points(df_trips, pu_lon, pu_lat)
gdf_do = to_gdf_points(df_trips, do_lon, do_lat)

# Report the shape of every layer that could be built.
if gdf_pu is not None:
    print("gdf_pu:", gdf_pu.shape)
if gdf_do is not None:
    print("gdf_do:", gdf_do.shape)



## 1) Sebaran Titik PU — GeoPandas Scatter (pengganti PyDeck ScatterplotLayer)
Jika ada `cluster` (atau `pu_cluster` / `dbscan_cluster`), warna kategori per klaster.


In [None]:

# Scatter plot of pickup points, coloured per cluster when a cluster column exists.
fig, ax = plt.subplots(figsize=(8, 8))

if gdf_pu is not None and len(gdf_pu) > 0:
    # First cluster-label column present in the pickup layer, if any.
    cluster_col = next(
        (name for name in ("cluster", "pu_cluster", "dbscan_cluster") if name in gdf_pu.columns),
        None,
    )

    if cluster_col is not None:
        # Categorical colouring: one legend entry per cluster label.
        gdf_pu.plot(ax=ax, column=cluster_col, categorical=True, legend=True, markersize=3, alpha=0.95)
        ax.set_title(f"Sebaran PU per Klaster ({cluster_col})")
    else:
        # No cluster information: draw every point in one neutral colour.
        gdf_pu.plot(ax=ax, color="#555555", markersize=2, linewidth=0, alpha=0.6)
        ax.set_title("Sebaran PU (tanpa cluster)")
else:
    ax.set_title("PU tidak tersedia")

ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_aspect("equal", adjustable="box")
plt.show()



## 2) Hull Poligon per Klaster — (pengganti ColumnLayer 3D)
- Buat **convex hull** titik-titik PU per klaster.
- Warna **YlOrRd** (merah = jumlah tertinggi).
- Label centroid: `Cluster (N)`.


In [None]:

# Convex hull per pickup cluster, shaded by the number of points in the cluster.
fig, ax = plt.subplots(figsize=(8, 8))

if gdf_pu is not None and len(gdf_pu) > 0:
    # First cluster-label column present in the pickup layer, if any.
    cluster_col = next(
        (name for name in ("cluster", "pu_cluster", "dbscan_cluster") if name in gdf_pu.columns),
        None,
    )

    if cluster_col is None:
        ax.set_title("Tidak ada kolom cluster pada PU — lewati hull per klaster.")
        gdf_pu.plot(ax=ax, color="#888888", markersize=2, alpha=0.5)
    else:
        # One record per cluster; clusters with fewer than 3 points are skipped
        # because they cannot span a hull.
        hull_records = [
            {
                "cluster": cid,
                "count": len(sub),
                "geometry": unary_union(sub.geometry).convex_hull,
            }
            for cid, sub in gdf_pu.groupby(cluster_col)
            if len(sub) >= 3
        ]

        if not hull_records:
            ax.set_title("Tidak cukup titik untuk membuat hull.")
        else:
            gdf_hull = gpd.GeoDataFrame(hull_records, crs="EPSG:4326")
            gdf_hull.plot(
                ax=ax,
                column="count",
                cmap="YlOrRd",
                legend=True,
                alpha=0.7,
                edgecolor="#333333",
                linewidth=0.8,
            )
            # Labelling is best effort: a failure is reported and the hull plot is kept.
            try:
                gdf_hull["centroid"] = gdf_hull.geometry.centroid
                for _, row in gdf_hull.iterrows():
                    anchor = row["centroid"]
                    ax.text(
                        anchor.x,
                        anchor.y,
                        f"{row['cluster']} ({row['count']})",
                        fontsize=8,
                        ha="center",
                        va="center",
                        color="#111111",
                        bbox=dict(facecolor="white", alpha=0.6, boxstyle="round,pad=0.2"),
                    )
            except Exception as e:
                print("Gagal membuat label centroid:", e)
            ax.set_title("Hull Poligon per Klaster (warna = jumlah trip)")
else:
    ax.set_title("PU tidak tersedia")

ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_aspect("equal", adjustable="box")
plt.show()



## 3) Choropleth **PU** per Wilayah — (pengganti PolygonLayer 3D PU)
- Spatial join (predicate `within`), agregasi jumlah **PU** per wilayah.
- Warna **YlOrRd** (merah = tinggi).


In [None]:

# Choropleth of pickup (PU) counts per boundary area.
fig, ax = plt.subplots(figsize=(8, 8))

if (gdf_areas is None) or (gdf_pu is None) or (len(gdf_pu) == 0) or (len(gdf_areas) == 0):
    ax.set_title("Boundary atau PU tidak tersedia — lewati choropleth PU.")
else:
    # Work on a reprojected copy: the shared `gdf_areas` is never modified, so
    # this cell can be re-run any number of times. The same EPSG:4326 frame is
    # used for the join and for the plot, which keeps the axes in lon/lat.
    areas = gdf_areas.to_crs(4326)

    # First column (in column order) whose name looks like an area identifier.
    key_names = {"name", "nama", "kecamatan", "kelurahan", "kabupaten", "kota", "provinsi", "id", "kode"}
    key_col = next(
        (c for c in areas.columns if isinstance(c, str) and c.lower() in key_names),
        None,
    )
    if key_col is None:
        # No identifier column: fall back to a positional key.
        key_col = "index"
        areas = areas.reset_index(drop=True)
        areas[key_col] = areas.index

    # Join bare point geometries against key + geometry only. sjoin suffixes
    # column names present on both sides, so a trips column named like the key
    # (e.g. "id" or "name") would otherwise make the groupby below fail.
    joined = gpd.sjoin(
        gdf_pu[[gdf_pu.geometry.name]].to_crs(4326),
        areas[[key_col, areas.geometry.name]],
        how="left",
        predicate="within",
    )
    # Points outside every area carry a missing key and are dropped by groupby.
    agg = joined.groupby(key_col).size().reset_index(name="PU_Count")
    # Areas without any point get an explicit 0 instead of NaN.
    gdf_pu_area = areas.merge(agg, on=key_col, how="left").fillna({"PU_Count": 0})

    gdf_pu_area.plot(ax=ax, column="PU_Count", cmap="YlOrRd", legend=True, alpha=0.75, edgecolor="#333333", linewidth=0.4)
    ax.set_title("Choropleth PU per Wilayah (merah = tinggi)")

ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_aspect("equal", adjustable="box")
plt.show()



## 4) Choropleth **DO** per Wilayah — (pengganti PolygonLayer 3D DO)
- Spatial join (predicate `within`), agregasi jumlah **DO** per wilayah.
- Warna **YlOrRd** (merah = tinggi).


In [None]:

# Choropleth of drop-off (DO) counts per boundary area.
fig, ax = plt.subplots(figsize=(8, 8))

if (gdf_areas is None) or (gdf_do is None) or (len(gdf_do) == 0) or (len(gdf_areas) == 0):
    ax.set_title("Boundary atau DO tidak tersedia — lewati choropleth DO.")
else:
    # Work on a reprojected copy: the shared `gdf_areas` is never modified, so
    # this cell can be re-run any number of times. The same EPSG:4326 frame is
    # used for the join and for the plot, which keeps the axes in lon/lat.
    areas = gdf_areas.to_crs(4326)

    # First column (in column order) whose name looks like an area identifier.
    key_names = {"name", "nama", "kecamatan", "kelurahan", "kabupaten", "kota", "provinsi", "id", "kode"}
    key_col = next(
        (c for c in areas.columns if isinstance(c, str) and c.lower() in key_names),
        None,
    )
    if key_col is None:
        # No identifier column: fall back to a positional key.
        key_col = "index"
        areas = areas.reset_index(drop=True)
        areas[key_col] = areas.index

    # Join bare point geometries against key + geometry only. sjoin suffixes
    # column names present on both sides, so a trips column named like the key
    # (e.g. "id" or "name") would otherwise make the groupby below fail.
    joined = gpd.sjoin(
        gdf_do[[gdf_do.geometry.name]].to_crs(4326),
        areas[[key_col, areas.geometry.name]],
        how="left",
        predicate="within",
    )
    # Points outside every area carry a missing key and are dropped by groupby.
    agg = joined.groupby(key_col).size().reset_index(name="DO_Count")
    # Areas without any point get an explicit 0 instead of NaN.
    gdf_do_area = areas.merge(agg, on=key_col, how="left").fillna({"DO_Count": 0})

    gdf_do_area.plot(ax=ax, column="DO_Count", cmap="YlOrRd", legend=True, alpha=0.75, edgecolor="#333333", linewidth=0.4)
    ax.set_title("Choropleth DO per Wilayah (merah = tinggi)")

ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_aspect("equal", adjustable="box")
plt.show()



---
**Generated:** 2025-08-17 06:02:35 UTC  
Konversi otomatis PyDeck → GeoPandas oleh ChatGPT.
