<a href="https://colab.research.google.com/github/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.7%20Clustering%20Model%20Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.7 Clustering Model Training**

In [None]:
import itertools
import numpy as np
import pandas as pd
import time
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import warnings
from IPython.display import display, HTML

print("\n\n")
warnings.filterwarnings("ignore")

# Candidate columns; every clustering run below uses a subset of this list.
features = [
    'OCC_YEAR',
    'OCC_DOY',
    'OCC_HOUR',
    'LONG_WGS84',
    'LAT_WGS84',
    'OCC_Month_Encoded',
    'OCC_DOW_Encoded',
    'Hood_158_Encoded',
    'Division_Encoded',
    'Location_Engineered_Other',
    'Location_Engineered_Public',
    'Location_Engineered_Residential',
]

# Enumerate every subset of 4, 5 and 6 features (all 4-subsets first, then 5, then 6),
# each stored as a plain list of column names.
all_combinations = list(
    map(
        list,
        itertools.chain.from_iterable(
            itertools.combinations(features, subset_size) for subset_size in range(4, 7)
        ),
    )
)

# Write the generated feature sets to disk, one combination per row, and confirm in the output.
pd.DataFrame(all_combinations).to_csv('Feature_Combo_Current.csv', index=False)
display(
    HTML(
        "<p style='color: green; font-size:16px;'><b> Feature sets saved to 'Feature_Combo_Current.csv'</b></p>"
    )
)

# Fetch the encoded dataset from the project repository.
url = "https://raw.githubusercontent.com/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/FE_Encoded.csv"
data = pd.read_csv(url)

# Work on a reproducible 10% random sample (fixed seed).
sample_data = data.sample(frac=0.1, random_state=42)

# Bookkeeping for the training loop.
set_counters = {}
total_models = 0

# HTML Styling
def color_score(value, reverse=False):
    """Return *value* as a bold, color-coded HTML ``<span>`` (2 decimal places).

    Green marks a good score, yellow a medium one, red a poor one. The value is
    bucketed at fixed thresholds: below 0.3, from 0.3 up to (not including) 0.6,
    and 0.6 or above.

    Args:
        value: Numeric score to render.
        reverse: ``False`` (default) when higher scores are better, so the low
            bucket is red and the high bucket is green. ``True`` when lower
            scores are better, so the low bucket is green and the high bucket
            is red.

    Returns:
        An HTML string of the form
        ``<span style='color: ...; font-weight: bold;'>0.00</span>``.
    """
    red, yellow, green = "#ff6347", "#ffcc00", "#32cd32"

    # Palette is ordered (low bucket, middle bucket, high bucket).
    if reverse:
        colors = (green, yellow, red)  # lower is better
    else:
        colors = (red, yellow, green)  # higher is better

    if value < 0.3:
        color = colors[0]
    elif value < 0.6:
        color = colors[1]
    else:
        color = colors[2]

    return f"<span style='color: {color}; font-weight: bold;'>{value:.2f}</span>"

# Fit one KMeans and one DBSCAN model per feature combination and report their scores.
for i, feature_set in enumerate(all_combinations):
    # Keep only the requested columns that are present in the sampled data.
    valid_features = [f for f in feature_set if f in sample_data.columns]
    if not valid_features:
        # No usable column for this combination: skip it instead of failing in the scaler.
        continue
    total_models += 1
    data_for_clustering = sample_data[valid_features]

    # Standardize the int64/float64 columns. The sampled index is carried over so the
    # scaled frame stays row-aligned with `data_for_clustering`; with a fresh RangeIndex
    # the concat below would align on mismatched labels and introduce NaN rows.
    numerical_cols = data_for_clustering.select_dtypes(include=['int64', 'float64']).columns.tolist()
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(
        scaler.fit_transform(data_for_clustering[numerical_cols]),
        columns=numerical_cols,
        index=data_for_clustering.index,
    )

    # Columns of any other dtype are appended unscaled.
    categorical_cols = [col for col in valid_features if col not in numerical_cols]
    if categorical_cols:
        data_scaled = pd.concat([data_scaled, data_for_clustering[categorical_cols]], axis=1)

    # KMeans Clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    kmeans_labels = kmeans.fit_predict(data_scaled)

    # DBSCAN Clustering
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(data_scaled)

    # KMeans metrics
    silhouette_kmeans = silhouette_score(data_scaled, kmeans_labels)
    dbi_kmeans = davies_bouldin_score(data_scaled, kmeans_labels)
    ch_kmeans = calinski_harabasz_score(data_scaled, kmeans_labels)

    # DBSCAN metrics need at least two distinct labels. `None` marks "not available";
    # -1 cannot be used as the marker because it is itself a valid silhouette value.
    # NOTE(review): the DBSCAN noise label (-1) is counted as a label here, as before —
    # confirm whether noise points should be excluded before scoring.
    dbscan_scorable = len(set(dbscan_labels)) > 1
    silhouette_dbscan = silhouette_score(data_scaled, dbscan_labels) if dbscan_scorable else None
    dbi_dbscan = davies_bouldin_score(data_scaled, dbscan_labels) if dbscan_scorable else None

    silhouette_dbscan_html = color_score(silhouette_dbscan) if silhouette_dbscan is not None else 'N/A'
    dbi_dbscan_html = color_score(dbi_dbscan, reverse=True) if dbi_dbscan is not None else 'N/A'
    # Calinski-Harabasz has no fixed upper bound, so the 0.3/0.6 color thresholds
    # do not apply to it; show it in bold without a color.
    ch_kmeans_html = f"<span style='font-weight: bold;'>{ch_kmeans:.2f}</span>"

    # Render the results card for this feature set.
    results_html = f"""
    <div style="border: 2px solid #ddd; padding: 10px; margin: 5px 0; background-color: #f9f9f9;">
        <p><b>Feature Set {i+1}:</b> {', '.join(valid_features)}</p>
        <p>KMeans Silhouette Score: {color_score(silhouette_kmeans)}</p>
        <p>KMeans Davies-Bouldin Index: {color_score(dbi_kmeans, reverse=True)}</p>
        <p>KMeans Calinski-Harabasz Score: {ch_kmeans_html}</p>
        <p>DBSCAN Silhouette Score: {silhouette_dbscan_html}</p>
        <p>DBSCAN Davies-Bouldin Index: {dbi_dbscan_html}</p>
    </div>
    """
    display(HTML(results_html))
    time.sleep(0.1)  # Small delay for real-time effect

# Display total models trained. `total_models` counts processed feature sets;
# each one is fitted with both a KMeans and a DBSCAN model.
display(HTML(f"<h2 style='color: navy; font-size:18px;'><b> Total Models Trained: {total_models}</b></h2>"))

print("\n\n")

display(HTML(f"""
<div style="font-family: Arial, sans-serif; font-size: 18px; padding: 15px; border-radius: 10px;
             background: #282c34; color: #61dafb; text-align: center; width: 60%; margin: 20px auto;
             box-shadow: 2px 2px 10px rgba(0,0,0,0.2);">
    <strong>Total K-Means & DBSCAN Clustering models trained:</strong> <span style="color: #ffcc00; font-size: 22px;">{total_models}</span>
</div>
"""))










