# Gower Distance for Mixed-Type Data: Bank Marketing Classification Example

This notebook demonstrates the power of **Gower distance** for analyzing mixed-type datasets. Gower distance is perfect for data where we have both numerical measurements and categorical classifications.

## What is Gower Distance?

Gower distance is a distance metric that can handle:
- **Numerical features** (e.g., age, balance, call duration)
- **Categorical features** (e.g., job, marital status, education)
- **Missing values** (common in real-world data)

Perfect for customer segmentation, marketing analytics, and similarity analysis!

## Dataset: Bank Marketing
The data is related to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

## 1. Setup and Data Loading

In [None]:
# Install required packages if needed
#!uv pip install gower ucimlrepo pandas numpy matplotlib seaborn plotly scikit-learn umap-learn

import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from sklearn.cluster import DBSCAN
from umap import UMAP

import gower_exp

# Seed NumPy's legacy global RNG so the np.random.* calls below are repeatable.
np.random.seed(42)

# Suppress every warning category to keep cell output readable.
warnings.filterwarnings("ignore")

# Plot appearance: matplotlib's default style first, then seaborn's "husl"
# palette on top (the style call comes first so it cannot reset the palette).
plt.style.use("default")
sns.set_palette("husl")

In [None]:
# Load the Bank Marketing dataset (UCI ML Repository id 222)
from ucimlrepo import fetch_ucirepo

bank = fetch_ucirepo(id=222)

# Keep the untouched full tables under their own names so this cell can be
# re-run safely and the sample below is always drawn from the original data.
X_full = bank.data.features
y_full = bank.data.targets

# Number of clients to analyse. The pairwise distance matrix grows
# quadratically with this value; the original author sampled for a laptop and
# suggests trying the full dataset with gpu=True on NVIDIA hardware.
num_rows_to_sample = 5000
random_row_indices = np.random.choice(
    y_full.shape[0],
    size=min(num_rows_to_sample, y_full.shape[0]),
    replace=False,
)

# X, y and df all describe the SAME sampled clients and share a fresh 0..n-1
# index. This guarantees that X.iloc[i], df.iloc[i] and row i of any distance
# matrix computed from them refer to the same client.
X = X_full.iloc[random_row_indices].reset_index(drop=True)
y = y_full.iloc[random_row_indices].reset_index(drop=True)

# Combine features and targets into a single DataFrame for easier analysis
df = pd.concat([X, y], axis=1)

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
df.head()

## 2. Data Exploration and Understanding

In [None]:
# Dataset information: size, class balance and missing-value count
section_rule = "=" * 50
class_counts = df["y"].value_counts().to_dict()
missing_total = df.isnull().sum().sum()

print("📊 DATASET OVERVIEW")
print(section_rule)
print(f"Total samples: {len(df):,}")
print(f"Features: {len(X.columns)}")
print(f"Classes: {df['y'].nunique()} ({class_counts})")
print(f"Missing values: {missing_total}")

print("\n📈 BASIC STATISTICS:")
print(section_rule)
# Last expression of the cell: rendered as a rich table of numeric summaries.
df.describe()

## 3. Basic Gower Distance Demonstrations

In [None]:
# Flag each feature column as categorical (1) or numerical (0), in X's column order.
categorical_columns = X.select_dtypes(include="object").columns
categorical_encoding = [1 if col in categorical_columns else 0 for col in X.columns]
print("Categorical features:", list(categorical_columns))
print("Categorical encoding:", categorical_encoding)

# Compute the full Gower distance matrix
print("⚡ Computing Gower Distance Matrix...")
start_time = time.time()

# Use only the feature columns of the sampled clients. Passing `df` itself
# would include the target `y` as a feature and leak the label into every
# distance, inflating all class-separation results further down.
feature_frame = df[X.columns]
gower_distances = gower_exp.gower_matrix(
    feature_frame, cat_features=categorical_encoding
)

computation_time = time.time() - start_time
print(
    f"✅ Computed {gower_distances.shape[0]:,} x {gower_distances.shape[1]:,} distance matrix in {computation_time:.2f} seconds"
)
print(f"📏 Distance range: [{gower_distances.min():.4f}, {gower_distances.max():.4f}]")
print(f"📊 Mean distance: {gower_distances.mean():.4f} ± {gower_distances.std():.4f}")

In [None]:
# Visualize the distance matrix
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: heatmap of a random subset of clients (the full matrix is too
# large to render usefully).
sample_size = min(100, len(df))
sample_indices = np.random.choice(len(df), sample_size, replace=False)
sample_distances = gower_distances[sample_indices][:, sample_indices]
sample_labels = df.iloc[sample_indices]["y"].values

im1 = axes[0].imshow(sample_distances, cmap="viridis", aspect="auto")
axes[0].set_title(
    f"Gower Distance Matrix\n(Random sample of {sample_size} bank clients)",
    fontweight="bold",
)
axes[0].set_xlabel("Client Index")
axes[0].set_ylabel("Client Index")
plt.colorbar(im1, ax=axes[0], label="Gower Distance")

# Right panel: distance distribution for same-class vs different-class pairs.
# Vectorised replacement for a nested Python loop over every (i, j) pair with
# df.iloc lookups, which needs millions of slow iterations at this sample size.
class_names = df["y"].unique()
label_codes = pd.Categorical(df["y"]).codes
same_class = label_codes[:, None] == label_codes[None, :]
# Strict upper triangle: each unordered pair once, self-distances excluded.
upper_triangle = np.triu(np.ones(same_class.shape, dtype=bool), k=1)
within_class_distances = gower_distances[upper_triangle & same_class]
between_class_distances = gower_distances[upper_triangle & ~same_class]

# Plot a random subset of pairs. Taking the first 5000 pairs would use almost
# exclusively pairs that involve client 0, which is not representative.
# A dedicated generator keeps the global np.random stream untouched.
max_hist_pairs = 5000
hist_rng = np.random.default_rng(42)
within_hist = hist_rng.choice(
    within_class_distances,
    size=min(max_hist_pairs, within_class_distances.size),
    replace=False,
)
between_hist = hist_rng.choice(
    between_class_distances,
    size=min(max_hist_pairs, between_class_distances.size),
    replace=False,
)

axes[1].hist(
    within_hist,
    bins=50,
    alpha=0.7,
    label="Within-class",
    density=True,
)
axes[1].hist(
    between_hist,
    bins=50,
    alpha=0.7,
    label="Between-class",
    density=True,
)
axes[1].set_title("Distance Distribution by Class Relationship", fontweight="bold")
axes[1].set_xlabel("Gower Distance")
axes[1].set_ylabel("Density")
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics use ALL pairs, not just the plotted subset.
print("🎯 Class Separation Analysis:")
print(
    f"• Within-class distances: μ={np.mean(within_class_distances):.3f}, σ={np.std(within_class_distances):.3f}"
)
print(
    f"• Between-class distances: μ={np.mean(between_class_distances):.3f}, σ={np.std(between_class_distances):.3f}"
)
print(
    f"• Separation ratio: {np.mean(between_class_distances) / np.mean(within_class_distances):.2f}"
)

## 4. Finding Similar Bank Clients - K-Nearest Neighbors

In [None]:
# Interactive example: Find similar bank clients
def analyze_client_similarity(client_idx, n_similar=5):
    """
    Find and report the bank clients most similar to a given client.

    Parameters
    ----------
    client_idx : int
        Position (0-based, as used by ``df.iloc``) of the target client in the
        sampled DataFrame ``df``.
    n_similar : int, default 5
        Number of neighbours to return, not counting the target itself.

    Returns
    -------
    similar_indices : numpy.ndarray
        Positions in ``df`` of the nearest neighbours, closest first.
    distances : numpy.ndarray
        Gower distances matching ``similar_indices``.
    """
    # Take features from df (not from X) so the rows are guaranteed to be the
    # same clients whose labels are read from df below.
    feature_frame = df[X.columns]
    client_pos = range(len(df))[client_idx]  # normalises negative positions
    target_client = df.iloc[[client_pos]]
    target_features = feature_frame.iloc[[client_pos]]
    target_class = target_client["y"].iloc[0]

    # Search the whole sample, asking for one extra hit because the target is
    # its own nearest neighbour. Searching the target against itself (as the
    # previous version did) can only ever return the target.
    result = gower_exp.gower_topn(
        target_features, feature_frame, n=min(n_similar + 1, len(df))
    )
    candidate_indices = np.asarray(result["index"])
    candidate_distances = np.asarray(result["values"])

    # Drop the target by position rather than assuming it is ranked first:
    # an exact duplicate client would tie with it at distance 0.
    not_self = candidate_indices != client_pos
    similar_indices = candidate_indices[not_self][:n_similar]
    distances = candidate_distances[not_self][:n_similar]

    print(f"🎯 TARGET CLIENT #{client_idx}")
    print(f"Subscription: {target_class}")
    print("\n📊 Features:")
    for feature in X.columns:
        print(f"  {feature:17}: {target_features[feature].iloc[0]}")

    print(f"\n🔍 TOP {n_similar} MOST SIMILAR CLIENTS:")
    print("=" * 70)
    print(
        f"{'Rank':<4} {'Index':<6} {'Subscription':<12} {'Distance':<10} {'Match':<8}"
    )
    print("-" * 70)

    class_matches = 0
    for rank, (idx, dist) in enumerate(zip(similar_indices, distances), 1):
        similar_class = df.iloc[idx]["y"]
        is_match = similar_class == target_class
        if is_match:
            class_matches += 1
        match_str = "✅" if is_match else "❌"
        print(f"{rank:<4} {idx:<6} {similar_class:<12} {dist:<10.4f} {match_str:<8}")

    # Report against the number of neighbours actually found, which can be
    # smaller than n_similar for very small samples.
    n_found = len(similar_indices)
    if n_found:
        print(
            f"\n🎯 Accuracy: {class_matches}/{n_found} ({100 * class_matches / n_found:.1f}%) correct subscription matches"
        )
    else:
        print("\n⚠️ No similar clients found.")

    return similar_indices, distances


# Analyze a few example clients
example_indices = [0, 50, 100, 200, 400]
# Only the first two are displayed to keep the output short.
shown_examples = example_indices[:2]
print("🏦 BANK CLIENT SIMILARITY ANALYSIS")
print("=" * 80)

for i, idx in enumerate(shown_examples):
    print(f"\nExample {i + 1}:")
    similar_indices, distances = analyze_client_similarity(idx, n_similar=5)
    # Separator between examples only. Compare against the list being
    # iterated, otherwise a rule is also printed after the last example.
    if i < len(shown_examples) - 1:
        print("\n" + "=" * 80)

In [None]:
# Visualize feature comparison for similar clients
def plot_similar_clients_comparison(target_idx, similar_indices, distances):
    """
    Draw a radar chart comparing a target client with up to three neighbours.

    Parameters
    ----------
    target_idx : int
        Position of the target client in ``df``.
    similar_indices : sequence of int
        Positions in ``df`` of the neighbours, closest first. Only the first
        three are plotted.
    distances : sequence of float
        Gower distances matching ``similar_indices``; shown in the legend.

    Only numerical features are drawn, each min-max scaled to [0, 1] over the
    sampled clients so that the axes are comparable.
    """
    # Select target and top 3 similar clients
    num_similar = min(3, len(similar_indices))
    indices_to_plot = [target_idx] + list(similar_indices[:num_similar])
    labels = ["Target"] + [
        f"Similar #{i + 1}\n(d={distances[i]:.3f})" for i in range(num_similar)
    ]
    colors = ["red"] + ["blue", "green", "orange"][:num_similar]

    # Take the features from df so that positional lookups below hit the same
    # clients that target_idx / similar_indices refer to.
    numerical_features = df[X.columns].select_dtypes(include=[np.number])
    feature_min = numerical_features.min()
    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than NaN.
    feature_range = (numerical_features.max() - feature_min).replace(0, 1)
    features_normalized = (numerical_features - feature_min) / feature_range

    axis_names = list(numerical_features.columns)
    closed_axis_names = axis_names + axis_names[:1]  # repeat first axis to close the polygon

    # Create radar chart
    fig = go.Figure()

    for position, (idx, label, color) in enumerate(
        zip(indices_to_plot, labels, colors)
    ):
        # The target is always the first trace. Identify it by position so a
        # neighbour can never be mistaken for it.
        is_target = position == 0
        values = features_normalized.iloc[idx].values.tolist()
        values.append(values[0])  # Close the radar chart

        fig.add_trace(
            go.Scatterpolar(
                r=values,
                theta=closed_axis_names,
                fill="toself" if is_target else None,
                name=f"{label} ({df.iloc[idx]['y']})",
                line=dict(color=color, width=3 if is_target else 2),
                fillcolor=color if is_target else None,
                opacity=0.3 if is_target else 0.8,
            )
        )

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=True,
        title=f"Feature Comparison: Client #{target_idx} vs Most Similar Clients",
        font=dict(size=12),
    )

    fig.show()


# Plot comparison for first example
target_idx = 0
similar_indices, distances = analyze_client_similarity(target_idx, n_similar=5)

# The chart shows up to three neighbours and works with fewer; it only has to
# be skipped when there are none at all.
n_available = len(similar_indices)
if n_available == 0:
    print("Cannot create comparison plot: no similar clients found.")
else:
    if n_available < 3:
        print(f"Not enough similar clients found. Only {n_available} available.")
    plot_similar_clients_comparison(target_idx, similar_indices, distances)

## 5. Practical Applications

### 5.1 Anomaly Detection - Finding Unusual Bank Clients

In [None]:
# Anomaly detection using average Gower distances
def detect_anomalies(distance_matrix, percentile=95):
    """
    Flag clients whose mean distance to the rest exceeds a percentile cut-off.

    Parameters
    ----------
    distance_matrix : array-like, 2-D
        Pairwise distance matrix; row ``i`` holds the distances from client
        ``i`` to every client.
    percentile : float, default 95
        Percentile of the per-client mean distances used as the cut-off.

    Returns
    -------
    outlier_indices : numpy.ndarray
        Row positions whose mean distance is strictly above the cut-off.
    avg_distances : row-wise mean of ``distance_matrix``.
    threshold : the cut-off value.
    """
    # One mean per row (the row's own diagonal entry is part of the mean).
    mean_per_client = np.mean(distance_matrix, axis=1)

    cutoff = np.percentile(mean_per_client, percentile)
    above_cutoff = mean_per_client > cutoff
    flagged_rows = np.where(above_cutoff)[0]

    return flagged_rows, mean_per_client, cutoff


# Detect outliers
outlier_indices, avg_distances, threshold = detect_anomalies(
    gower_distances, percentile=95
)

print("🚨 ANOMALY DETECTION RESULTS")
print(f"Threshold (95th percentile): {threshold:.4f}")
print(
    f"Number of outliers found: {len(outlier_indices)} ({100 * len(outlier_indices) / len(df):.1f}% of dataset)"
)

print("\n📋 OUTLIER CLIENTS:")
print("=" * 60)
print(f"{'Index':<6} {'Subscription':<12} {'Avg Distance':<12} {'Percentile':<10}")
print("-" * 60)

# Show the 10 most extreme outliers (largest average distance first) rather
# than whichever 10 happen to come first by position.
most_extreme_first = outlier_indices[np.argsort(avg_distances[outlier_indices])[::-1]]
for idx in most_extreme_first[:10]:
    pct_rank = (avg_distances < avg_distances[idx]).sum() / len(avg_distances) * 100
    print(
        f"{idx:<6} {df.iloc[idx]['y']:<12} {avg_distances[idx]:<12.4f} {pct_rank:<10.1f}%"
    )

# Visualize outliers
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Distribution of average distances
axes[0].hist(avg_distances, bins=50, alpha=0.7, color="skyblue", edgecolor="black")
axes[0].axvline(
    threshold,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Threshold ({threshold:.4f})",
)
axes[0].set_title("Distribution of Average Gower Distances", fontweight="bold")
axes[0].set_xlabel("Average Gower Distance")
axes[0].set_ylabel("Frequency")
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Outliers by class
# outlier_indices are row POSITIONS, so build a positional boolean mask
# instead of comparing them with df.index labels.
is_outlier = np.zeros(len(df), dtype=bool)
is_outlier[outlier_indices] = True

# Put both series on one fixed class order. Two independent value_counts()
# results can differ in order or length, which would pair the wrong bars.
class_order = df["y"].value_counts().index
outlier_classes = (
    df.loc[is_outlier, "y"].value_counts().reindex(class_order, fill_value=0)
)
normal_classes = (
    df.loc[~is_outlier, "y"].value_counts().reindex(class_order, fill_value=0)
)

x = np.arange(len(class_order))
width = 0.35

axes[1].bar(
    x - width / 2,
    outlier_classes.values,
    width,
    label="Outliers",
    alpha=0.8,
    color="red",
)
axes[1].bar(
    x + width / 2, normal_classes.values, width, label="Normal", alpha=0.8, color="blue"
)
axes[1].set_title("Outlier Distribution by Subscription", fontweight="bold")
axes[1].set_xlabel("Subscription Status")
axes[1].set_ylabel("Count")
axes[1].set_xticks(x)
axes[1].set_xticklabels(class_order)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n💡 Insights:")
print("• Outliers are distributed across both subscription outcomes")
print(
    "• These clients have unusual combinations of demographic and behavioral features"
)
print("• Could represent unique customer segments or edge cases for targeting")

### 5.2 Customer Profiling - Golden Standard Matching

In [None]:
# Simulate customer profiling using "golden standard" clients
def customer_profiling_simulation(golden_standards, test_samples, max_distance=0.2):
    """
    Match each test client to its nearest "golden standard" client.

    Parameters
    ----------
    golden_standards : sequence of int
        Positions in ``df`` of the reference clients.
    test_samples : sequence of int
        Positions in ``df`` of the clients to profile.
    max_distance : float, default 0.2
        A test client counts as "similar" when its distance to the nearest
        standard is at most this value.

    Returns
    -------
    pandas.DataFrame
        One row per test client with columns ``test_idx``, ``test_class``,
        ``closest_standard``, ``standard_class``, ``distance``, ``similar``
        and ``class_match``.

    Raises
    ------
    ValueError
        If ``golden_standards`` is empty while there are clients to profile.
    """
    result_columns = [
        "test_idx",
        "test_class",
        "closest_standard",
        "standard_class",
        "distance",
        "similar",
        "class_match",
    ]
    test_positions = list(test_samples)
    standard_positions = list(golden_standards)

    if not test_positions:
        return pd.DataFrame(columns=result_columns)
    if not standard_positions:
        raise ValueError("golden_standards must contain at least one client position")

    # Features and labels both come from df so that a position always refers
    # to the same client in both.
    feature_frame = df[X.columns]
    class_labels = df["y"].to_numpy()

    # One call against the whole sample instead of one call per client pair.
    # NOTE(review): Gower scales numeric features by their range; a two-row
    # call would presumably derive that range from just those two rows, making
    # every differing numeric feature count as maximally different. Confirm
    # against the gower_exp implementation.
    distances_to_all = gower_exp.gower_matrix(
        feature_frame.iloc[test_positions],
        feature_frame,
        cat_features=categorical_encoding,
    )
    distances_to_standards = distances_to_all[:, standard_positions]

    results = []
    for row, test_idx in enumerate(test_positions):
        # argmin keeps the first standard on ties.
        nearest = int(np.argmin(distances_to_standards[row]))
        closest_standard = standard_positions[nearest]
        min_distance = float(distances_to_standards[row, nearest])

        test_class = class_labels[test_idx]
        standard_class = class_labels[closest_standard]

        results.append(
            {
                "test_idx": test_idx,
                "test_class": test_class,
                "closest_standard": closest_standard,
                "standard_class": standard_class,
                "distance": min_distance,
                "similar": min_distance <= max_distance,
                "class_match": bool(test_class == standard_class),
            }
        )

    return pd.DataFrame(results, columns=result_columns)


# Tunable settings for this experiment
SIMILARITY_THRESHOLD = 0.3  # max Gower distance to count as "similar"; also drawn on the plot
N_TEST_CLIENTS = 50  # number of randomly chosen clients to profile

# Select golden standards (best representatives of each class)
# All indices below are row POSITIONS in df / gower_distances. Using df.index
# labels here would address the wrong rows (or fall outside the matrix)
# whenever df does not carry a 0..n-1 index.
class_labels = df["y"].to_numpy()
golden_standards = []
for class_name in df["y"].unique():
    class_samples = np.flatnonzero(class_labels == class_name)

    # Find the most "central" sample in each class: the member with the
    # smallest mean distance to the other members of its class.
    class_distances = gower_distances[np.ix_(class_samples, class_samples)]
    avg_class_distances = np.mean(class_distances, axis=1)
    central_idx = class_samples[np.argmin(avg_class_distances)]
    golden_standards.append(central_idx)

print("🏆 GOLDEN STANDARDS SELECTED:")
for i, std_idx in enumerate(golden_standards):
    print(f"Standard {i + 1}: Client #{std_idx} ({df.iloc[std_idx]['y']})")

# Test on random samples (positions, capped at the number of available clients)
test_samples = np.random.choice(
    len(df), min(N_TEST_CLIENTS, len(df)), replace=False
)
profiling_results = customer_profiling_simulation(
    golden_standards, test_samples, max_distance=SIMILARITY_THRESHOLD
)
is_similar = profiling_results["similar"].astype(bool)

print("\n🔍 CUSTOMER PROFILING RESULTS:")
print(f"Total tested: {len(profiling_results)}")
print(
    f"Similar to standards: {is_similar.sum()} ({100 * is_similar.mean():.1f}%)"
)
print(
    f"Different from standards: {(~is_similar).sum()} ({100 * (~is_similar).mean():.1f}%)"
)
print(f"Class accuracy: {profiling_results['class_match'].mean() * 100:.1f}%")

# Visualize profiling results
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Distance distribution by profiling result
similar_distances = profiling_results.loc[is_similar, "distance"]
different_distances = profiling_results.loc[~is_similar, "distance"]

axes[0].hist(
    similar_distances, bins=20, alpha=0.7, label="Similar to standards", color="green"
)
axes[0].hist(
    different_distances,
    bins=20,
    alpha=0.7,
    label="Different from standards",
    color="red",
)
axes[0].axvline(
    SIMILARITY_THRESHOLD, color="black", linestyle="--", label="Similarity Threshold"
)
axes[0].set_title("Distance to Golden Standards", fontweight="bold")
axes[0].set_xlabel("Gower Distance")
axes[0].set_ylabel("Frequency")
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Similarity rate by class
# Force both outcome columns to exist, in a fixed order. Otherwise, when every
# tested client lands on one side of the threshold, the single column would
# get the wrong colour and legend entry.
profiling_by_class = (
    profiling_results.assign(similar=is_similar)
    .groupby(["test_class", "similar"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
)
profiling_by_class_pct = (
    profiling_by_class.div(profiling_by_class.sum(axis=1), axis=0) * 100
)

profiling_by_class_pct.plot(kind="bar", ax=axes[1], color=["red", "green"], alpha=0.8)
axes[1].set_title("Similarity Rate by Subscription Status", fontweight="bold")
axes[1].set_xlabel("Subscription Status")
axes[1].set_ylabel("Percentage")
axes[1].legend(["Different", "Similar"])
axes[1].set_xticklabels(profiling_by_class_pct.index, rotation=0)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n💼 Customer Profiling Applications:")
print("• Identify clients similar to high-value customer archetypes")
print("• Segment customers based on similarity to representative profiles")
print("• Personalize marketing strategies based on customer similarity")

## 6. Clustering Integration

In [None]:
# Use Gower distances for clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score

print("🔗 CLUSTERING WITH GOWER DISTANCES")
print("=" * 60)

# The Gower matrix is already a pairwise distance matrix, so both estimators
# below consume it directly through metric="precomputed".
distance_matrix = gower_distances

clustering_results = {}

# Integer-coded subscription labels, the reference for the ARI scores.
true_labels = pd.Categorical(df["y"]).codes

# Average-linkage agglomerative clustering into two groups
hierarchical = AgglomerativeClustering(
    linkage="average", metric="precomputed", n_clusters=2
)
hierarchical_labels = hierarchical.fit_predict(distance_matrix)
hierarchical_ari = adjusted_rand_score(true_labels, hierarchical_labels)

# Density-based clustering; label -1 marks noise points
dbscan = DBSCAN(metric="precomputed", eps=0.4, min_samples=5)
dbscan_labels = dbscan.fit_predict(distance_matrix)
dbscan_ari = adjusted_rand_score(true_labels, dbscan_labels)

# Report both methods
n_hierarchical_clusters = len(np.unique(hierarchical_labels))
is_noise = dbscan_labels == -1
n_dbscan_clusters = len(np.unique(dbscan_labels[~is_noise & (dbscan_labels >= 0)]))
n_noise_points = np.count_nonzero(is_noise)

print("Hierarchical Clustering:")
print(f"  Adjusted Rand Index: {hierarchical_ari:.4f}")
print(f"  Clusters found: {n_hierarchical_clusters}")

print("\nDBSCAN Clustering:")
print(f"  Adjusted Rand Index: {dbscan_ari:.4f}")
print(f"  Clusters found: {n_dbscan_clusters}")
print(f"  Noise points: {n_noise_points}")

# Create visualization using dimensionality reduction
# UMAP accepts a precomputed distance matrix, so the Gower distances are
# embedded directly into 2-D for plotting.
reducer = UMAP(n_components=2, metric="precomputed", random_state=42)
embedding = reducer.fit_transform(distance_matrix)

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Original classes in UMAP space
scatter1 = axes[0, 0].scatter(
    embedding[:, 0], embedding[:, 1], c=true_labels, cmap="Set1", alpha=0.7, s=50
)
axes[0, 0].set_title(
    "Original Subscription Status (UMAP projection)", fontweight="bold"
)
axes[0, 0].set_xlabel("UMAP 1")
axes[0, 0].set_ylabel("UMAP 2")
plt.colorbar(scatter1, ax=axes[0, 0])

# Hierarchical clustering results
scatter2 = axes[0, 1].scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=hierarchical_labels,
    cmap="Set2",
    alpha=0.7,
    s=50,
)
axes[0, 1].set_title(
    f"Hierarchical Clustering\n(ARI: {hierarchical_ari:.3f})", fontweight="bold"
)
axes[0, 1].set_xlabel("UMAP 1")
axes[0, 1].set_ylabel("UMAP 2")
plt.colorbar(scatter2, ax=axes[0, 1])

# DBSCAN clustering results
scatter3 = axes[1, 0].scatter(
    embedding[:, 0], embedding[:, 1], c=dbscan_labels, cmap="Set3", alpha=0.7, s=50
)
axes[1, 0].set_title(f"DBSCAN Clustering\n(ARI: {dbscan_ari:.3f})", fontweight="bold")
axes[1, 0].set_xlabel("UMAP 1")
axes[1, 0].set_ylabel("UMAP 2")
plt.colorbar(scatter3, ax=axes[1, 0])

# Class-vs-cluster table for the best clustering
# confusion_matrix is no longer used in this cell; the import is retained in
# case later cells rely on it.
from sklearn.metrics import confusion_matrix

best_labels = hierarchical_labels if hierarchical_ari > dbscan_ari else dbscan_labels
best_method = "Hierarchical" if hierarchical_ari > dbscan_ari else "DBSCAN"

# Cluster ids are arbitrary and need not match the class codes (DBSCAN may
# produce any number of clusters plus the noise label -1). A contingency table
# keeps true classes on the rows and cluster ids on the columns, whereas
# confusion_matrix would merge both label sets into one square matrix.
cm = pd.crosstab(
    pd.Series(df["y"].to_numpy(), name="True Subscription Status"),
    pd.Series(np.asarray(best_labels), name="Predicted Cluster"),
)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[1, 1])
axes[1, 1].set_title(f"Confusion Matrix - {best_method}", fontweight="bold")
axes[1, 1].set_xlabel("Predicted Cluster")
axes[1, 1].set_ylabel("True Subscription Status")

plt.tight_layout()
plt.show()

print("\n🎯 Clustering Insights:")
print("• Gower distance successfully captures subscription structure in the data")
print(
    f"• {best_method} clustering performs better with ARI = {max(hierarchical_ari, dbscan_ari):.3f}"
)
print("• UMAP visualization reveals patterns in customer similarity")
print("• Distance-based clustering works well for mixed-type banking data")