National-Scale CAD Crime Classification Using Zero-Shot Embedding Prototypes

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# Silence every Python warning for the rest of the session (blanket filter, not library-specific).
warnings.filterwarnings("ignore")
pio.renderers.default = "notebook"  # Plotly renderer; alternatives include "colab" and "browser"
# Global plot styling: matplotlib style sheet, seaborn palette, default Plotly figure size
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
px.defaults.width = 1000
px.defaults.height = 600

# Load the input dataset (path is relative to the notebook's working directory).
# Later cells read the columns table_type, crime_description, incident_date, city and state.
df_nlp = pd.read_parquet("./data/opd_nlp_dataset.parquet")
df_nlp

In [None]:
# Sentence-embedding model used for both the call descriptions and the category labels.
# The alternatives below are the author's candidates; the trade-off notes are not benchmarked in this notebook.
embedder = SentenceTransformer("paraphrase-MiniLM-L3-v2") # model currently in use
# embedder = SentenceTransformer("all-MiniLM-L6-v2") # author's note: slightly slower, slightly worse on police jargon
# embedder = SentenceTransformer("sentence-transformers/all-distilroberta-v1") # author's note: higher quality, heavier than needed for CAD text
# embedder = SentenceTransformer("sentence-transformers/paraphrase-TinyBERT-L6-v2") # author's note: fast, but "SHOTS FIRED" and "FIREWORKS" end up too close
# NOTE(review): presumably caps tokenized input length at 64 tokens — confirm against the sentence-transformers docs
embedder.max_seq_length = 64

# 27 target categories. The label strings themselves are embedded and used as
# zero-shot prototypes (no labelled examples are involved), so their wording
# directly shapes the classification. List order matters: the classification cell
# maps argmax indices back into this list.
CATEGORIES= [
    "Theft from Motor Vehicle",
    "Stolen Vehicle",
    "Theft / Petty Theft / Shoplifting",
    "Residential Burglary",
    "Battery / Domestic Violence",
    "Simple Assault / Fight",
    "Traffic Violation / Traffic Stop / Citation",
    "DUI / Drunk Driving",
    "Shots Fired / Person with Gun",               # ← weapons only
    "Suspicious Person / Prowler",                 # ← loitering, unusual behavior
    "911 Hangup / Open Line",                      # ← dedicated label for hang-up / open-line calls
    "Disturbance / Loud Party / Neighbor Dispute",
    "Welfare Check / Mental Health Crisis",        # ← NOTE(review): hang-up calls may still land here; check the sanity output
    "Trespass / Unwanted Person",
    "Vandalism / Property Damage",
    "Narcotics / Drug Possession",
    "Prostitution / Vice",
    "Robbery / Street Robbery",
    "Alarm Call / False Alarm",
    "Officer-Initiated / On-View Activity",
    "Non-Emergency / Information Call",
    "Fire / Medical / Ambulance Request",
    "Missing Person / Runaway",
    "Juvenile / Truancy / Curfew",
    "Fraud / Identity Theft / Scam",
    "Threats / Harassment / Stalking",
    "Other / Unknown / Administrative"
]

In [None]:
# Zero-shot classification: every description is assigned the category whose
# label embedding is most similar to the description embedding.
#
# Every row starts as "pending"; rows whose table_type is not one of the two
# values handled below keep that placeholder.
df_nlp['crime_category'] = "pending"

# Category prototypes: one normalized embedding per label in CATEGORIES. They do
# not depend on the table type, so they are encoded once instead of once per
# loop iteration.
print("   → Creating category prototypes (few-shot style)...")
proto_emb = embedder.encode(CATEGORIES, normalize_embeddings=True)
category_labels = np.array(CATEGORIES)

for table_type in ["CALLS FOR SERVICE", "INCIDENTS"]:
    print(f"\n{'='*20} {table_type} {'='*20}")
    mask = df_nlp['table_type'] == table_type
    subset = df_nlp.loc[mask]
    print(f"   → {len(subset):,} rows")

    if subset.empty:
        # Nothing to classify; an empty batch would also not give the 2-D
        # embedding matrix that cosine_similarity expects.
        print("   → no rows for this table type, skipping")
        continue

    # 1. Embed each distinct description once (this can take several minutes).
    #    `inverse` maps every row of the subset back to its unique description.
    #    Note: astype(str) turns missing descriptions into literal text
    #    (e.g. "nan"), which is then classified like any other description.
    print("Computing embeddings...")
    unique_desc, inverse = np.unique(
        subset['crime_description'].astype(str).values,
        return_inverse=True
    )
    embeddings = embedder.encode(
        unique_desc.tolist(),
        batch_size=8192,
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True,
        device='cpu',
    )

    # 2. Nearest prototype = prediction. argmax returns the first maximum, so
    #    on an exact tie the category listed earlier in CATEGORIES wins.
    print("   → zero-shot classification...")
    sim = cosine_similarity(embeddings, proto_emb)
    predictions_idx = sim.argmax(axis=1)
    predictions = category_labels[predictions_idx]

    # 3. Broadcast the per-unique-description labels back to every row.
    print("   → add prediction to full data...")
    full_predictions = predictions[inverse]
    df_nlp.loc[mask, 'crime_category'] = full_predictions

df_nlp.to_parquet("./data/opd_nlp_dataset_fitted.parquet", compression="zstd")
df_nlp

In [None]:
# Post-classification sanity check: list the raw descriptions behind each predicted category
banner = "=" * 80
print(banner)
print("SANITY CHECK: What does each crime category actually contain?")
print(banner)

def analyze_category(cat):
    """Print the size of one predicted category and its 15 most frequent descriptions.

    Reads the notebook-level ``df_nlp`` frame. Prints nothing and returns
    ``None`` when no row carries the category ``cat``.
    """
    members = df_nlp.loc[df_nlp['crime_category'] == cat]
    total = len(members)
    if not total:
        return

    share_of_all = total / len(df_nlp)
    print(f"\n→ {cat}")
    print(f"   Total records: {total:,} ({share_of_all:.1%} of all data)")

    # value_counts is sorted by frequency, so head(15) is the 15 most common texts
    frequent = members['crime_description'].value_counts().head(15)
    print("   Top real descriptions:")
    for desc, count in frequent.items():
        print(f"     • {desc:45} → {count:9,} ({count / total * 100:5.1f}%)")

# Largest category first (value_counts orders its index by descending count)
categories_by_size = df_nlp['crime_category'].value_counts().index

for cat in categories_by_size:
    analyze_category(cat)

In [None]:
# Summary table: one row per predicted category with its three most frequent descriptions
summary = []
for cat in df_nlp['crime_category'].unique():
    members = df_nlp[df_nlp['crime_category'] == cat]
    leaders = members['crime_description'].value_counts().head(3)

    row = {'crime_category': cat, 'total_records': len(members)}
    # Pad with "" / 0 when a category has fewer than three distinct descriptions
    for rank in range(3):
        present = len(leaders) > rank
        row[f'top_description_{rank + 1}'] = leaders.index[rank] if present else ""
        row[f'count_{rank + 1}'] = leaders.iloc[rank] if present else 0
    summary.append(row)

# Largest categories first, with a fresh 0..n-1 index
summary_df = (
    pd.DataFrame(summary)
    .sort_values('total_records', ascending=False)
    .reset_index(drop=True)
)
summary_df['% of total'] = (summary_df['total_records'] / len(df_nlp) * 100).round(2)

# Render as a shaded table with thousands separators on the count columns
count_columns = ('total_records', 'count_1', 'count_2', 'count_3')
summary_df.style.background_gradient(cmap='Blues').format(
    {column: '{:,}' for column in count_columns}
)

In [None]:
# Headline size of the dataset. Cities and states are counted as distinct values
# of their own column, so identically named cities in different states count once.
n_rows = len(df_nlp)
n_cities = df_nlp['city'].nunique()
n_states = df_nlp['state'].nunique()
print(f"Dataset: {n_rows:,} rows | {n_cities} cities | {n_states} states")

In [None]:
# Side-by-side horizontal bar charts: top 10 predicted categories per table type
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# value_counts is sorted descending, so head(10) is the ten largest categories
top_inc = df_nlp[df_nlp['table_type'] == 'INCIDENTS']['crime_category'].value_counts().head(10)
top_call = df_nlp[df_nlp['table_type'] == 'CALLS FOR SERVICE']['crime_category'].value_counts().head(10)

# (axes, counts, bar colour, title) for the left and right panel respectively
panels = [
    (ax1, top_inc, 'steelblue', "Top 10 Crime Categories – INCIDENTS (Official Reports)"),
    (ax2, top_call, 'crimson', "Top 10 Crime Categories – CALLS FOR SERVICE"),
]

for ax, counts, bar_color, panel_title in panels:
    positions = range(len(counts))
    # Reverse both values and labels so the largest category is drawn at the top
    ax.barh(positions, counts.values[::-1], color=bar_color, height=0.6)
    ax.set_yticks(positions)
    ax.set_yticklabels(counts.index[::-1])
    ax.set_title(panel_title, fontsize=14, pad=20)
    ax.set_xlabel("Number of Records", fontsize=12)
    ax.grid(True, axis='x', alpha=0.3, linestyle='-', linewidth=0.7)
    # Thousands separators on the count axis
    ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
    ax.tick_params(axis='x', rotation=45)
    # Minimal frame: keep only a light bottom spine and hide the tick marks
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_color('#DDDDDD')
    ax.tick_params(axis='both', which='both', length=0)

plt.tight_layout()
plt.show()

In [None]:
# Monthly record counts per table type and predicted category, 2010 through 2025.
#
# The upper bound is exclusive ("< 2026-01-01"): a '2025-12-31' bound is compared
# as midnight at the start of that day, so "<=" would drop every record
# time-stamped later on 31 Dec 2025. incident_date is datetime-like (the .dt
# accessor is used on it below).
in_window = (df_nlp['incident_date'] >= '2010-01-01') & (df_nlp['incident_date'] < '2026-01-01')
# Filter first, then copy: only the rows inside the window are duplicated.
df = df_nlp.loc[in_window].copy()

monthly = (df
           # 'YYYY-MM' strings sort chronologically, so sort_values('month') orders the x-axis by time
           .assign(month = df['incident_date'].dt.to_period('M').astype(str))
           .groupby(['month', 'table_type', 'crime_category'])
           .size()
           .reset_index(name='count')
           .sort_values('month'))

# One stacked-area panel per table type, coloured by predicted category
fig = px.area(monthly,
              x='month', y='count', color='crime_category',
              facet_col='table_type',
              title="Crime Category Trends Over Time (Calls vs Incidents)",
              height=600)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# The 15 city names with the most records
top_cities = df_nlp['city'].value_counts().head(15).index

# Count matrix: one row per city, one column per predicted category (0 where a pair never occurs)
in_top_city = df_nlp['city'].isin(top_cities)
pair_counts = df_nlp[in_top_city].groupby(['city', 'crime_category']).size()
heat = pair_counts.unstack(fill_value=0)

# Turn each city's row into percentages so high-volume cities do not dominate the colour scale
row_totals = heat.sum(axis=1)
heat_norm = heat.div(row_totals, axis=0) * 100

plt.figure(figsize=(14, 10))
sns.heatmap(heat_norm, cmap="YlOrRd", linewidths=.5, annot=True, fmt=".1f")
plt.title("Crime Category Distribution (%) – Top 15 Cities (Row-Normalized)", fontsize=16, pad=20)
plt.xlabel("Unified Crime Category")
plt.ylabel("City")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Compare, per predicted category, how many calls for service there are versus
# how many incident records.
call_only = df_nlp[df_nlp['table_type'] == 'CALLS FOR SERVICE']
inc_only  = df_nlp[df_nlp['table_type'] == 'INCIDENTS']

# Start from the 12 largest call categories and attach the incident count of the
# same category; categories with no incident records get 0.
dark_figure = (call_only['crime_category'].value_counts().head(12)
               .to_frame(name="Calls")
               .join(inc_only['crime_category'].value_counts().rename("Incidents"), how='left')
               .fillna(0))

# Incidents per call; ascending sort puts the categories with the fewest
# incidents relative to calls first.
dark_figure['Ratio Calls→Incidents'] = dark_figure['Incidents'] / dark_figure['Calls']
dark_figure = dark_figure.sort_values("Ratio Calls→Incidents")

# Plot only the two count columns. The ratio is a dimensionless fraction and
# does not belong on an axis labelled "Count (log scale)"; it is still kept in
# the frame and shown in the printed table below.
dark_figure[['Calls', 'Incidents']].plot(kind='barh', figsize=(12, 8), width=0.8)
plt.title("The 'Dark Figure' – How Many Calls Become Official Incidents?", pad=20, fontsize=15)
plt.xlabel("Count (log scale)")
plt.xscale('log')
plt.tight_layout()
plt.grid(True, axis='x', alpha=0.3, linestyle='--')
plt.show()

print("Most under-reported (calls >> incidents):")
print(dark_figure.head(5))

In [None]:
# Record count for every (state, city, category) combination present in the data
treemap_data = (
    df_nlp
    .groupby(['state', 'city', 'crime_category'])
    .size()
    .reset_index(name='count')
)
# Share of each combination in the overall record total, in percent
total_crimes = treemap_data['count'].sum()
treemap_data['percentage'] = treemap_data['count'].div(total_crimes) * 100
# Format functions
def format_count(x):
    """Return a compact human-readable label for a count.

    Args:
        x: A non-negative number.

    Returns:
        ``"<n.nn>M"`` for values of one million or more, ``"<n.n>K"`` for
        values of one thousand or more, otherwise the number itself with
        thousands separators. Values just below one million that would round
        up to ``"1000.0K"`` are reported in millions instead.
    """
    if x >= 1_000_000:
        return f"{x/1_000_000:.2f}M"
    if x >= 1_000:
        thousands = f"{x/1_000:.1f}"
        # Rounding to one decimal can push e.g. 999,999 up to "1000.0";
        # switch to the millions form rather than emit "1000.0K".
        if thousands == "1000.0":
            return f"{x/1_000_000:.2f}M"
        return f"{thousands}K"
    return f"{x:,}"
# Treemap nested State → City → Category; tile area and colour both follow the record count
treemap_title = "<b>National Crime Category Distribution</b><br><i>State → City → Crime Category</i>"
fig = px.treemap(
    treemap_data,
    path=['state', 'city', 'crime_category'],
    values='count',
    color='count',
    color_continuous_scale='Reds',
    title=treemap_title,
)

# Tile text (label, value, share of parent), black centred labels, white tile borders
label_font = {"family": "Arial Black", "size": 11, "color": "black"}
tile_border = {"line": {"width": 2, "color": "white"}}
hover_text = "<b>%{label}</b><br>Records: %{value:,}<br>Percent: %{percentParent}<extra></extra>"
fig.update_traces(
    textinfo="label+value+percent parent",
    textposition="middle center",
    textfont=label_font,
    marker=tile_border,
    hovertemplate=hover_text,
)

# Page layout; uniformtext is deliberately left unset so small labels are not hidden
fig.update_layout(
    margin={"t": 100, "l": 10, "r": 10, "b": 10},
    paper_bgcolor="white",
    plot_bgcolor="white",
    title_x=0.5,
    title_font={"size": 22, "family": "Arial", "color": "black"},
    font={"family": "Arial", "color": "black"},
    height=800,
)
fig.show()