In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
import json
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import umap

In [2]:
pd.options.display.max_columns = None

# Data Loading and Transformations

In [3]:
dataset = pd.read_csv('merged.csv')
dataset

  dataset = pd.read_csv('merged.csv')


Unnamed: 0,date,violation_type,description,address,neighborhood,zip_code,parcel,owner,year_built,year_remodeled,...,insulate_exposed_ducts,insulate_exposed_pipes,interior_wall_insulation_blow_in,exterior_wall_insulation_at_replacement,exterior_wall_insulation,interior_wall_insulation_board,insulate_spandrel,asbestos,seal_elevator_vent_shafts,env_recommendation_count
0,2025-09-21 20:31:44.823+00,Housing Complaints,Pest Infestation - Residential,"200 Corey Rd, 02135",Brighton,2135,2101707660,COREY SETON MANOR LLC - LSE,1899.0,2004.0,...,f,f,f,f,f,f,f,f,f,0
1,2025-09-21 18:58:19.533+00,Sanitation Requests,Abandoned Vehicles,"80 Mascot St, 02124",Dorchester,2124,1403791000,MARSHALL WINSTON,1905.0,,...,t,f,t,f,f,f,f,f,f,3
2,2025-09-21 16:00:00+00,Housing Complaints,Working Beyond Hours,"188 Woodrow Ave, 02124",Dorchester,2124,1403492000,LIANG SULIAN,1900.0,,...,f,t,t,t,f,f,f,f,f,4
3,2025-09-21 11:26:00+00,Housing Complaints,Electrical,"75 Everett St, 02128",East Boston,2128,104963000,ACETO GEORGE J,1905.0,,...,f,f,t,f,f,f,f,f,f,2
4,2025-09-21 10:21:00+00,Civic Maintenance Requests,Contractor Complaints,"91 Edgemere Rd, 02132",West Roxbury,2132,2011847000,VASALLO ALBERTO JR,1983.0,,...,t,f,f,t,f,f,f,f,f,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180177,2020-10-18 17:35:00+00,Sanitation Requests,Abandoned Vehicles,"61 Wayland St, 02125",Dorchester,2125,1300892000,ALSTON GERALD SR,1920.0,,...,f,t,t,f,f,f,f,f,f,3
180178,2020-10-18 16:21:00+00,Housing Complaints,Unsatisfactory Living Conditions,"350 K St, 02127",South Boston,2127,702347000,DONALD WILLIAM H ETAL,1910.0,,...,t,f,t,f,f,f,f,f,f,3
180179,2020-10-18 15:24:00+00,Housing Complaints,Unsatisfactory Living Conditions,"71 Allston St, 02134",Allston,2134,2101100002,KUANG WO XI TS,1885.0,,...,f,t,t,f,f,f,f,f,f,3
180180,2020-10-18 12:35:00+00,Sanitation Requests,Rodent Activity,"88 Hudson St, 02111",Boston,2111,304981010,88 HUDSON STREET LEASEHOLD,2016.0,,...,f,f,f,f,f,f,f,f,f,0


In [32]:
# ===========================
# LOAD DATA
# ===========================

df = dataset.copy()
df.reset_index(inplace=True)
df.rename(columns={"index": "record_id"}, inplace=True)


# extracting temporal features
df['date']= pd.to_datetime(df['date'], format='mixed', utc=True, errors='coerce')
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["dayofweek"] = df["date"].dt.dayofweek
df["hour"] = df["date"].dt.hour
df["is_weekend"] = df["dayofweek"].isin([5,6]).astype(int)

temporal_cols = ["year","month","dayofweek","hour","is_weekend"]


# one-hot encoding for categorical features
cat_cols = [
    "violation_type", "neighborhood", "property_type",
    "building_typology", "building_subtypology", "use_class"
]

onehot_frames = []
for col in cat_cols:
    if col not in df.columns:
        continue
    top_n = 10 if col == "violation_type" else 15
    top_vals = df[col].value_counts().nlargest(top_n).index
    reduced = df[col].where(df[col].isin(top_vals), "Other")
    dummies = pd.get_dummies(reduced, prefix=col)
    onehot_frames.append(dummies)

onehot_df = pd.concat(onehot_frames, axis=1) if onehot_frames else pd.DataFrame(index=df.index)


# numerical features
numeric_cols = [
    "year_built","year_remodeled","land_sf","gross_area","living_area","sqft",
    "num_floors","num_bldgs","units_res","units_com","units_mixed",
    "total_site_energy_kbtu","perc_electricity","perc_gas","perc_steam",
    "ct_pop_disability","ct_pop_children_under_5","ct_pop_over_65",
    "ct_pop_low_to_no_income","ct_pop_limited_english_proficiency",
    "ct_pop_poc","ct_pop_med_illness","ct_perc_disability",
    "ct_perc_children_under_5","ct_perc_over_65","ct_perc_low_to_no_income",
    "ct_perc_limited_english_proficiency","ct_perc_poc","ct_perc_med_illness",
    "ct_hh_income_200000_or_more","ct_perc_income_200000_or_more",
    "hp_recommendation_count","env_recommendation_count",
    "latitude","longitude"
]

numeric_cols = [c for c in numeric_cols if c in df.columns]

for c in numeric_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Log-transform high skewed data
for c in ["land_sf","gross_area","living_area","sqft","total_site_energy_kbtu"]:
    if c in df.columns:
        df[f"log_{c}"] = np.log1p(df[c])

# boolean features, for retrofit information
bool_cols = [
    "cashp_only","cashp_and_elec_upgrade","cashp_outdoor",
    "dashp_only","dashp_and_elec_upgrade","dashp_outdoor",
    "gshp_only","gshp_and_elec_upgrade","vrf_only","vrf_and_elec_upgrade",
    "awhp_only","awhp_and_elec_upgrade","insulate_attic",
    "insulate_attic_converted","ext_roof_insulation","insulate_exposed_ducts",
    "insulate_exposed_pipes","interior_wall_insulation_blow_in",
    "exterior_wall_insulation_at_replacement","exterior_wall_insulation",
    "interior_wall_insulation_board","insulate_spandrel","asbestos",
    "seal_elevator_vent_shafts"
]
bool_cols = [c for c in bool_cols if c in df.columns]

for c in bool_cols:
    df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)


# creating feature matrix
feature_df = pd.concat([
    df[numeric_cols + temporal_cols],
    onehot_df,
    df[bool_cols]
], axis=1)

X = feature_df.values
print("Feature matrix:", X.shape)


# standardizing values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# PCA embedding
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_std)
df["pca_x"] = X_pca[:,0]
df["pca_y"] = X_pca[:,1]

# UMAP embedding
umap_model = umap.UMAP(
    n_neighbors=30,
    min_dist=0.3,
    n_components=2,
    random_state=42
)
X_umap = umap_model.fit_transform(X_std)
df["umap_x"] = X_umap[:,0]
df["umap_y"] = X_umap[:,1]


# saving outputs to csv 
df_out_hd = pd.concat([df[["record_id","id","parcel"]], feature_df], axis=1)
df_out_hd.to_csv("embeddings.csv", index=False)

df_out_2d = df[
    ["record_id","id","parcel","umap_x","umap_y","pca_x","pca_y",
     "violation_type","neighborhood","year_built","property_type"]
].copy()
df_out_2d.to_csv("embeddings_2d.csv", index=False)

print("Saved embeddings.csv and embeddings_2d.csv")


  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({"t":1,"f":0,"T":1,"F":0}).astype(float)
  df[c] = df[c].replace({

Feature matrix: (180182, 130)


  warn(


Saved embeddings.csv and embeddings_2d.csv


In [69]:
# Load the 2D embedding
emb = pd.read_csv("embeddings_2d.csv")

sample_size = 5000

emb = emb.sample(n=sample_size, random_state=42)


# brush selection
brush = alt.selection_interval()

# PCA scatterplot
embedding_chart = (
    alt.Chart(emb)
    .mark_circle(size=40, opacity=0.6)
    .encode(
        x=alt.X("pca_x:Q", title="PCA X"),
        y=alt.Y("pca_y:Q", title="PCA Y"),
        color=alt.condition(
            brush,
            alt.Color("neighborhood:N", title="Neighborhood"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("record_id:Q", title="Record ID"),
            alt.Tooltip("violation_type:N", title="Violation type"),
            alt.Tooltip("neighborhood:N", title="Neighborhood"),
            alt.Tooltip("year_built:Q", title="Year built"),
            alt.Tooltip("property_type:N", title="Property type"),
        ],
    )
    .add_params(brush)
    .properties(
        width=600,
        height=500,
        title="PCA embedding of Boston Building Dataset"
    )
)

embedding_chart


Index(['record_id', 'id', 'parcel', 'umap_x', 'umap_y', 'pca_x', 'pca_y',
       'violation_type', 'neighborhood', 'year_built', 'property_type'],
      dtype='object')


## Option 2: complaints embedding

In [None]:
df = dataset.copy()
df.reset_index(inplace=True)
df.rename(columns={"index": "record_id"}, inplace=True)

# Fix timezone
df["date"] = df["date"].astype(str).str.replace(r"\+00$", "+00:00", regex=True)
df["date"] = pd.to_datetime(df["date"], utc=True, errors="coerce")

# temporal features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["hour"] = df["date"].dt.hour
df["dayofweek"] = df["date"].dt.dayofweek
df["is_weekend"] = df["dayofweek"].isin([5,6]).astype(int)

temporal_cols = ["year","month","hour","dayofweek","is_weekend"]
df[temporal_cols] = df[temporal_cols].fillna(df[temporal_cols].median())

# apply TF-IDF of complaint descriptions
vec = TfidfVectorizer(
    max_features=2000,
    stop_words="english"
)

tfidf = vec.fit_transform(df["description"].astype(str))

# Reduce TFIDF to 10 components
pca_text = PCA(n_components=10, random_state=42)
text_emb = pca_text.fit_transform(tfidf.toarray())

text_cols = [f"text_pca_{i}" for i in range(10)]
text_df = pd.DataFrame(text_emb, columns=text_cols)


# reduced one-hot encoding for categorical features
def reduced_one_hot(frame, col, top_n):
    vals = frame[col].value_counts().nlargest(top_n).index
    reduced = frame[col].where(frame[col].isin(vals), "Other")
    return pd.get_dummies(reduced, prefix=col, drop_first=False)

onehot_blocks = []

cat_specs = {
    "violation_type": 7,
    "neighborhood": 10,
    "property_type": 6,
    "building_typology": 6,
}

for col, n in cat_specs.items():
    if col in df.columns:
        onehot_blocks.append(reduced_one_hot(df, col, n))

onehot_df = pd.concat(onehot_blocks, axis=1)


# create matrix
feature_cf = pd.concat(
    [
        df[temporal_cols],   # time features
        text_df,             # text embedding
        onehot_df            # complaint + some building categories
    ],
    axis=1
)

# Convert numerical features and inpute missing values
feature_cf = feature_cf.apply(pd.to_numeric, errors="coerce")
feature_cf = feature_cf.fillna(feature_cf.median(numeric_only=True)).fillna(0)

X_cf = feature_cf.values
print("Complaint-focused feature shape:", X_cf.shape)


# standardize features
scaler_cf = StandardScaler()
X_cf_std = scaler_cf.fit_transform(X_cf)

# PCA projection
pca_cf = PCA(n_components=2, random_state=42)
X_cf_pca = pca_cf.fit_transform(X_cf_std)
df["pcaC_x"] = X_cf_pca[:, 0]
df["pcaC_y"] = X_cf_pca[:, 1]

# UMAP projection
umap_cf = umap.UMAP(
    n_neighbors=30,
    min_dist=0.2,
    n_components=2,
    random_state=42
)
X_cf_umap = umap_cf.fit_transform(X_cf_std)
df["umapC_x"] = X_cf_umap[:, 0]
df["umapC_y"] = X_cf_umap[:, 1]

# save outputs
out_cols = [
    "record_id","id","parcel",
    "violation_type","neighborhood","property_type",
    "pcaC_x","pcaC_y",
    "umapC_x","umapC_y"
]

df_cf = df[out_cols]
df_cf.to_csv("embeddings_complaint_2d.csv", index=False)



Complaint-focused feature shape: (180182, 46)


  warn(


Saved embeddings_complaint_2d.csv


In [9]:
emb_c = pd.read_csv("embeddings_complaint_2d.csv")


sample_n = 5000
emb_c = emb_c.sample(n=sample_n, random_state=42).reset_index(drop=True)


# PCA scatterplot
brush = alt.selection_interval()

chart_complaint = (
    alt.Chart(emb_c)
    .mark_circle(size=40, opacity=0.6)
    .encode(
        x=alt.X("pcaC_x:Q", title="Complaint PCA X"),
        y=alt.Y("pcaC_y:Q", title="Complaint éCA Y"),
        color=alt.condition(
            brush,
            alt.Color("violation_type:N", title="Violation Type"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("record_id:Q", title="Record ID"),
            alt.Tooltip("violation_type:N", title="Violation type"),
            alt.Tooltip("neighborhood:N", title="Neighborhood"),
            alt.Tooltip("property_type:N", title="Property type"),
        ],
    )
    .add_params(brush)
    .properties(
        width=600,
        height=500,
        title="Complaint-focused PCA embedding"
    )
)

chart_complaint


# Interactive visualization with embedding

In [5]:
# Load the 2D embedding
emb = pd.read_csv("embeddings_2d.csv")

sample_size = 5000

emb = emb.sample(n=sample_size, random_state=42)

# adding spatial information for interactive view
df = dataset.copy()
df.reset_index(inplace=True)
df.rename(columns={"index": "record_id"}, inplace=True)
df['date']= pd.to_datetime(df['date'], format='mixed', utc=True, errors='coerce')
emb = emb.merge(df[['record_id','date', 'latitude', 'longitude']], on='record_id')

In [6]:
import geopandas as gpd

# adding neighborhood geojson
neigh = gpd.read_file("boston_neighborhood.geojson").to_crs(4326)
neigh = neigh.rename(columns={"name": "neighborhood"})

In [98]:
brush = alt.selection_interval()

vt_domain = sorted(emb["violation_type"].dropna().unique().tolist())

# Legend-based selection on violation_type
typePick = alt.selection_point(
    fields=["violation_type"],
    bind="legend",
    name="typePick",
    empty="all"     # show all types when nothing is selected
)


# PCA scatterplot
embedding_chart = (
    alt.Chart(emb)
    .mark_circle(size=40, opacity=0.6)
    .encode(
        x=alt.X("pca_x:Q", title="PCA X"),
        y=alt.Y("pca_y:Q", title="PCA Y"),
        color=alt.condition(
            brush,
            alt.Color("neighborhood:N", title="Neighborhood"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("record_id:Q", title="Record ID"),
            alt.Tooltip("violation_type:N", title="Violation type"),
            alt.Tooltip("neighborhood:N", title="Neighborhood"),
            alt.Tooltip("year_built:Q", title="Year built"),
            alt.Tooltip("property_type:N", title="Property type"),
        ],
    )
    .add_params(brush)
    .properties(
        width=600,
        height=500,
        title="PCA embedding of Boston Building Dataset"
    )
)


# timeline plot
timeline_multi = (
    alt.Chart(emb)
    .transform_filter(brush)
    .transform_filter(typePick)            
    .transform_timeunit(
        month="yearmonth(date)"
    )
    .transform_aggregate(
        count="count()",
        groupby=["month", "violation_type"]
    )
    .mark_line(point=True)
    .encode(
        x=alt.X("month:T", title="Month"),
        y=alt.Y("count:Q", title="Violations (selected cluster)"),
        color=alt.condition(
            typePick,
            alt.Color("violation_type:N",
                      scale=alt.Scale(domain=vt_domain),
                      title="Violation Type"),
            alt.value("lightgray")           
        ),
        tooltip=[
            alt.Tooltip("month:T", title="Month"),
            alt.Tooltip("violation_type:N", title="Type"),
            alt.Tooltip("count:Q", title="Count")
        ]
    )
    .add_params(typePick)
    .properties(width=450, height=200, title="Violation trends within selected cluster")
)


neigh_4326 = neigh.to_crs(4326)

# Base map: neighborhood polygons
base_map = (
    alt.Chart(neigh_4326)
    .mark_geoshape(stroke="black", strokeWidth=0.5, fill="#f5f5f5")
    .encode(
        tooltip=[alt.Tooltip("neighborhood:N", title="Neighborhood")]
    )
    .project(type="identity", reflectY=True)
    .properties(width=450, height=400, title="Locations of selected complaints")
)

# Points: complaints in the brushed cluster (and time window)
points = (
    alt.Chart(emb)
    .transform_filter(brush)
    .transform_filter(typePick)          
    .mark_circle(size=40, opacity=0.7)
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        color=alt.Color(
            "violation_type:N",
            title="Violation type",
            scale=alt.Scale(domain=vt_domain)
        ),
        tooltip=[
            "violation_type:N",
            "neighborhood:N",
            "property_type:N",
            "date:T"
        ]
    )
)

map_view = (base_map + points).properties(title="Map of selected cluster")

In [99]:
layout = alt.hconcat(
    embedding_chart,
    alt.vconcat(timeline_multi, map_view)
).resolve_scale(
    color="independent"
).configure_title(
    fontSize=18, font='Helvetica', anchor='middle', color='black'
).configure_axis(
    labelFontSize=12, titleFontSize=14
).configure_legend(
    titleFontSize=13, labelFontSize=12, symbolSize=150,
    titleFont='Helvetica', labelFont='Helvetica'
)

layout


Saving embedding sampled to csv

In [8]:
emb.to_csv("embeddings_2d_sampled.csv", index=False)