In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the merged precinct table; low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv('merged.csv', low_memory=False)

# A geometry column read from CSV arrives as WKT text (object dtype); turn it
# into shapely geometries before building the GeoDataFrame.
if df['geometry'].dtype == object:
    df['geometry'] = df['geometry'].map(wkt.loads)

# The coordinates are declared to be WGS84 lon/lat (EPSG:4326).
gdf = gpd.GeoDataFrame(df, geometry='geometry').set_crs(epsg=4326)

# Centroids are taken in a projected CRS so that x/y are planar distances.
# EPSG:6493 is described in the original note as Michigan Central (meters).
gdf_proj = gdf.to_crs(epsg=6493)
centroids_proj = gdf_proj.geometry.centroid
gdf['centroid_x'] = centroids_proj.x
gdf['centroid_y'] = centroids_proj.y

# Select relevant features for clustering.
# Skipped: the geometry, ID-like columns, the categorical target, and columns
# that this notebook derives itself. If 'merged.csv' was saved after an earlier
# run it can already contain 'cluster' / 'centroid_x' / 'centroid_y'; without
# excluding them the stale cluster labels would be fed back in as a feature and
# the centroid columns would be selected twice (duplicate column names).
ignore_cols = [
    'geometry', 'standardized_id_num', 'partisan_temp_category',
    'cluster', 'centroid_x', 'centroid_y',
]
feature_cols = [
    col for col in df.columns
    if col not in ignore_cols and pd.api.types.is_numeric_dtype(df[col])
]

# Parameters
geo_weight = 90  # Try 10–100; higher = stronger spatial grouping
k = 30           # number of clusters

# Remove old cluster column if present, before anything is derived from gdf.
if 'cluster' in gdf.columns:
    gdf = gdf.drop(columns='cluster')

# Prepare features: the numeric attributes plus the projected centroids, which
# give the clustering its spatial awareness.
features = gdf[feature_cols + ['centroid_x', 'centroid_y']].copy()
assert features.columns.is_unique, "duplicate feature columns would break the centroid lookup"

# Clean feature set: coerce to numbers, drop columns with no data at all, and
# fill the remaining gaps with the column mean.
features = features.apply(pd.to_numeric, errors='coerce')
features = features.dropna(axis=1, how='all')
features = features.fillna(features.mean(numeric_only=True))

# Scale every column to zero mean / unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Identify the (positional) columns that hold the centroid coordinates.
spatial_index_x = features.columns.get_loc('centroid_x')
spatial_index_y = features.columns.get_loc('centroid_y')

# Re-weight spatial coordinates post-scaling so location dominates the
# Euclidean distance by a factor of geo_weight.
X_scaled[:, [spatial_index_x, spatial_index_y]] *= geo_weight

# Cluster. n_init is set explicitly because scikit-learn's default changed
# between releases; pinning it together with random_state keeps the result
# reproducible across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
gdf['cluster'] = kmeans.fit_predict(X_scaled)

# Plot the clusters as categories. Cluster ids are labels, not magnitudes:
# plotting them as a numeric column on the 10-colour 'tab10' map would bin
# several clusters into each colour and draw a colorbar. A categorical plot
# samples one colour per cluster and draws a proper legend.
fig, ax = plt.subplots(figsize=(10, 6))
gdf.plot(
    column='cluster',
    categorical=True,
    cmap='nipy_spectral',
    legend=True,
    legend_kwds={
        'ncol': 3,
        'fontsize': 'small',
        'loc': 'upper left',
        'bbox_to_anchor': (1.0, 1.0),
    },
    ax=ax,
)
ax.set_title("Clustered Voting Precincts")
ax.set_axis_off()
plt.show()

In [None]:
# Parsed column-metadata tables keyed by absolute file path, so the same CSV is
# not re-read for every feature that is looked up.
_CENSUS_METADATA_CACHE = {}


def makeCensusFeatureLabels(feature_name, year):
    """Return a human-readable label for a census feature column.

    Feature names starting with 'S', 'B' or 'D' are treated as ACS table
    columns. The table code (first 5, 6 or 4 characters respectively) is used
    to locate a directory under ``data/census/`` whose name starts with that
    code in lower case; the text after the first underscore of the directory
    name becomes the dataset title. The label itself is read from the
    ``<type><year>.<CODE>-Column-Metadata.csv`` file in that directory.

    Parameters
    ----------
    feature_name : str
        Column name to label.
    year : int or str
        Data vintage. 2024 (given as int or numeric string) is mapped to 2023.

    Returns
    -------
    str
        ``'<feature_name> | <Dataset Title> | <label parts joined by " | ">'``
        when the metadata is found; otherwise ``feature_name`` unchanged (not a
        census feature, no matching directory, missing metadata file, or the
        column is not listed in it).
    """
    # Imported here so the function works regardless of which notebook cell
    # performs the module-level imports.
    import glob
    import os

    # first letter -> (length of the table code, data-product prefix of the file name)
    table_kinds = {
        'S': (5, 'ACSST5Y'),
        'B': (6, 'ACSDT5Y'),
        'D': (4, 'ACSDP5Y'),
    }
    kind = table_kinds.get(feature_name[:1])
    if kind is None:
        # Not a census feature
        return feature_name
    code_length, data_type = kind
    feature_name_code = feature_name[:code_length].upper()

    # Accept the year as int or string; the comparison below needs a number.
    try:
        year = int(year)
    except (TypeError, ValueError):
        pass
    if year == 2024:  # No 2024 data yet.
        year = 2023

    # We only know the first part of the directory name (the table code).
    base_path = 'data/census/'
    pattern = os.path.join(base_path, glob.escape(feature_name_code.lower()) + '*')
    # Sorted so the choice is deterministic when several directories match.
    matching_dirs = sorted(path for path in glob.glob(pattern) if os.path.isdir(path))
    if not matching_dirs:
        # Many ordinary columns also start with S/B/D; fall back to the raw name.
        return feature_name
    target_dir = matching_dirs[0]

    # Dataset title = characters following the code in the directory name.
    dir_name = os.path.basename(os.path.normpath(target_dir))
    dataset_name = dir_name.partition('_')[2].replace('_', ' ').title()

    file_path = os.path.abspath(
        os.path.join(target_dir, f'{data_type}{year}.{feature_name_code}-Column-Metadata.csv')
    )
    df_columns = _CENSUS_METADATA_CACHE.get(file_path)
    if df_columns is None:
        if not os.path.isfile(file_path):
            return feature_name
        df_columns = pd.read_csv(file_path)
        _CENSUS_METADATA_CACHE[file_path] = df_columns

    if 'Column Name' not in df_columns.columns or df_columns.shape[1] < 2:
        return feature_name
    matches = df_columns.loc[df_columns['Column Name'] == feature_name]
    if matches.empty:
        return feature_name

    # The label is taken from the second column of the first matching row.
    label = matches.iloc[0, 1]
    if not isinstance(label, str):
        return feature_name
    short_label = ' | '.join(label.split('!!'))

    pieces = [feature_name]
    if dataset_name:
        pieces.append(dataset_name)
    pieces.append(short_label)
    return ' | '.join(pieces)

In [None]:
# Census data vintage. Passed to makeCensusFeatureLabels, which interpolates it
# into the name of the column-metadata file it reads.
year = '2022'

In [None]:
import numpy as np
import glob
import os

# Columns added by the clustering step itself (spatial helpers).
exclude_added_cols = ['latitude', 'longitude', 'centroid_x', 'centroid_y']

# Columns that must not be profiled as cluster-defining features. The
# commented-out names are deliberately NOT excluded, i.e. they stay in the
# analysis; uncomment one to exclude it.
exclude_cols = [
    # Administrative identifiers and descriptions
    'Michigan County Code',
    'City/Township Code',
    'Ward Number',
    'Precinct Number',
    'Precinct Label',
    'Office Code',
    'District Code',
    'Status Code',
    'Office Description',
    'County Name',
    'Census County Code',
    'Election Year',
    'Election Type',
    'City/Township Description',
    'standardized_id',
    'standardized_id_num',
    # Current-election results
    'dem_votes',
    'oth_votes',
    'rep_votes',
    'total_votes',
    'dem_share',
    'rep_share',
    'oth_share',
    'partisan_temp',
    'registered_voters',
    'turnout_pct',
    # Previous-election values and changes
    # 'dem_share_prev', 'rep_share_prev', 'oth_share_prev',
    # 'dem_share_change_prev', 'rep_share_change_prev', 'oth_share_change_prev',
    'dem_share_change_curr',
    'rep_share_change_curr',
    'oth_share_change_curr',
    # 'dem_votes_change_prev', 'rep_votes_change_prev', 'oth_votes_change_prev',
    'dem_votes_change_curr',
    'rep_votes_change_curr',
    'oth_votes_change_curr',
    # 'registered_voters_change_prev', 'turnout_pct_change_prev',
    'registered_voters_change_curr',
    'turnout_pct_change_curr',
    # 'partisan_temp_prev', 'partisan_temp_change_prev',
    'partisan_temp_change_curr',
    # Boundary joins and tract attributes
    'nearest_bound_school_district',
    'nearest_bound_census_tract',
    'nearest_bound_zipcode',
    'geometry',
    'tractce_tract',
    'geoid_tract',
    'geoidfq_tract',
    'name_tract',
    'aland_tract',
    'awater_tract',
    'geometry_tract',
    # Model labels
    'true_label',
    'predicted_label',
] + exclude_added_cols

# How many of the most distinctive features to keep per cluster.
TOP_N_FEATURES = 100

# Use the same features used for clustering (excluding spatial + excluded fields).
# 'cluster' is dropped too: a stale copy among the features would collide with
# the label column added below.
cluster_features = features.drop(
    columns=[col for col in [*exclude_cols, 'cluster'] if col in features.columns],
    errors='ignore',
)
feature_names = cluster_features.columns

# Combine cluster labels with features
clustered = pd.concat([gdf[['cluster']], cluster_features], axis=1)

# Overall mean and std for z-score reference
global_mean = cluster_features.mean()
global_std = cluster_features.std()

# A feature with zero spread cannot distinguish clusters; masking its std
# yields NaN instead of a 0/0 or x/0 score, and NaNs are dropped below.
nonzero_std = global_std.where(global_std > 0)

# Per-cluster feature means (feature columns only, so the 'cluster' label
# itself never shows up as a "feature"), expressed as the distance from the
# overall mean in units of the overall standard deviation.
cluster_means = clustered.groupby('cluster')[list(feature_names)].mean()
z_by_cluster = (cluster_means - global_mean) / nonzero_std

# Dictionary to hold top features per cluster
cluster_summary = {}

# Not populated in this cell; kept so any later cell that refers to it still runs.
top_features_names = []

# Each feature is labelled once, however many clusters it appears in.
label_by_code = {}

for cluster_id, z_scores in z_by_cluster.iterrows():
    z_scores = z_scores.dropna()

    # Get top N defining features (by absolute deviation)
    top_features = (
        z_scores.abs()
        .sort_values(ascending=False)
        .head(TOP_N_FEATURES)
        .index.tolist()
    )
    for code in top_features:
        if code not in label_by_code:
            label_by_code[code] = makeCensusFeatureLabels(str(code), year)

    cluster_summary[cluster_id] = {
        'top_features': [label_by_code[code] for code in top_features],
        'z_scores': z_scores[top_features].round(3).to_dict(),
    }

# Store summaries: one DataFrame per cluster with the coded name, the readable
# label and the rounded z-score of each top feature, in ranked order.
cluster_dfs = {}
for cluster_id, info in cluster_summary.items():
    cluster_dfs[cluster_id] = pd.DataFrame({
        'feature_code': list(info['z_scores'].keys()),
        'feature_label': info['top_features'],
        'z_score': list(info['z_scores'].values()),
    })

In [None]:
# for cluster_id, df in cluster_dfs.items():
#     print(f"\nCluster {cluster_id} — Top 10 Features:")
    
#     # Sort by absolute z-score descending, just in case it's not sorted already
#     top10 = df.sort_values(by='z_score', key=abs, ascending=False).head(10)
    
#     for i, row in top10.iterrows():
#         print(f"  {row['feature_label']} (z = {row['z_score']:.2f})\n")

In [None]:
# Number of rows kept per cluster.
TOP_K = 10

top10_dfs = {}

# The loop variable is deliberately not called `df`: that name holds the raw
# table loaded at the top of the notebook and must not be overwritten here.
for cluster_id, cluster_df in cluster_dfs.items():
    # Sort by absolute z-score and select the top rows
    ranked = cluster_df.sort_values(by='z_score', key=abs, ascending=False)

    # Keep the readable label and its z-score, renumbered from 0
    top10_dfs[cluster_id] = (
        ranked.head(TOP_K)[['feature_label', 'z_score']].reset_index(drop=True)
    )

In [None]:
# Display the top-10 feature table for cluster 0.
top10_dfs[0]

In [None]:
# Spot-check a randomly sampled row of cluster 0's feature table.
cluster_dfs[0].sample()

In [None]:
# 1. Combine all cluster DataFrames into one, tagging each row with its cluster
combined_df = pd.concat(
    [cluster_df.assign(cluster=cluster_id) for cluster_id, cluster_df in cluster_dfs.items()],
    ignore_index=True,
)

# 2. Count how many distinct clusters each feature_label appears in
#    (counting rows would over-count a label that occurs twice in one cluster)
label_counts = (
    combined_df.groupby('feature_label')['cluster']
    .nunique()
    .sort_values(ascending=False)
)

# 3. Only keep labels that appear in at least `min_cluster_count` clusters
#    (set it to len(cluster_dfs) to require a label to appear in *all* clusters)
min_cluster_count = 5  # or len(cluster_dfs)
common_labels = label_counts[label_counts >= min_cluster_count].index.tolist()

# 4. Filter to only common features
common_df = combined_df[combined_df['feature_label'].isin(common_labels)]

# 5. Group and average z_scores.
#    Note: this is a signed mean, so opposite deviations in different clusters
#    cancel each other out.
summary_df = (
    common_df.groupby('feature_label')['z_score']
    .mean()
    .reset_index()
)

# 6. Order by absolute z_score magnitude (descending)
summary_df = summary_df.sort_values(by='z_score', key=abs, ascending=False)

# Show the ten strongest features (last expression -> rich notebook display)
summary_df.head(10)

In [None]:
# Number of clusters that have a feature summary table.
len(cluster_dfs)