<a href="https://colab.research.google.com/github/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.9%20Best%20Model%20Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.9 Best Model Training**

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
import ast  # For safely evaluating strings

# Load the feature-engineered, encoded dataset
url = "https://raw.githubusercontent.com/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/FE_Encoded.csv"
original_data = pd.read_csv(url)

# Keep each row's index label in an explicit `_id` column so that cluster
# labels can be written back to the matching row after clustering
original_data['_id'] = original_data.index

# Cluster on the full dataset: this is a plain copy, no sub-sampling is applied
sample_data = original_data.copy()

# Load feature combinations
url1 = "https://raw.githubusercontent.com/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
feature_combos = pd.read_csv(url1)

# Names of the feature sets to train on
set_names = ['4_Set_165','4_Set_369', '4_Set_490', '4_Set_494', '4_Set_495']

# Resolve each set name to its list of feature names. The feature list is
# stored in the CSV as the string form of a Python list, so it is parsed with
# ast.literal_eval (literals only, no arbitrary code execution).
feature_sets = []
for set_name in set_names:
    matched_features = feature_combos.loc[
        feature_combos['Feature Set'] == set_name, 'Feature_Names_String'
    ]
    if matched_features.empty:
        # Make the skip visible: a missing set shifts the numbering of the
        # result columns for every set that follows it.
        print(f"Feature set '{set_name}' not found in feature combinations; skipping.")
        continue
    feature_sets.append(ast.literal_eval(matched_features.values[0]))

# Create a copy of the original data to store clustering results
clustering_results = original_data.copy()

# Add placeholder columns for every feature set that will be clustered, so all
# result columns share the same defaults (-1 for labels, NaN for metrics) and
# dtypes regardless of how many sets are configured.
for i in range(1, len(feature_sets) + 1):
    clustering_results[f'KMeans{i}_Cluster'] = -1
    clustering_results[f'KMeans{i}_Silhouette_Score'] = np.nan
    clustering_results[f'KMeans{i}_Davies_Bouldin_Index'] = np.nan
    clustering_results[f'KMeans{i}_Calinski_Harabasz_Score'] = np.nan
    clustering_results[f'KMeans{i}_Prediction_Accuracy'] = np.nan
    clustering_results[f'DBSCAN{i}_Cluster'] = -1
    clustering_results[f'DBSCAN{i}_Silhouette_Score'] = np.nan
    clustering_results[f'DBSCAN{i}_Davies_Bouldin_Index'] = np.nan
    clustering_results[f'DBSCAN{i}_Prediction_Accuracy'] = np.nan

# Row identifiers used to map labels back; identical for every feature set,
# so they are computed once outside the loop.
sample_ids = sample_data['_id'].values

# Perform clustering on each feature set
for i, features in enumerate(feature_sets, start=1):
    # Ignore feature names that are not present in the dataset
    valid_features = [f for f in features if f in sample_data.columns]
    data_for_clustering = sample_data[valid_features].copy()

    # Only int64/float64 columns are scaled and clustered
    numerical_cols = data_for_clustering.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if not numerical_cols:
        # Nothing to cluster on; leave the placeholder values for this set
        print(f"Feature set {i} has no numeric columns in the dataset; skipping.")
        continue

    scaler = StandardScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(data_for_clustering[numerical_cols]), columns=numerical_cols)

    # KMeans Clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    kmeans_labels = kmeans.fit_predict(data_scaled)
    silhouette_score_kmeans = silhouette_score(data_scaled, kmeans_labels)
    davies_bouldin_score_kmeans = davies_bouldin_score(data_scaled, kmeans_labels)
    calinski_harabasz_score_kmeans = calinski_harabasz_score(data_scaled, kmeans_labels)
    # "Prediction accuracy" here is the silhouette score clipped at 0 and
    # expressed as a percentage; it is not a supervised accuracy.
    kmeans_accuracy = max(0, silhouette_score_kmeans) * 100

    # DBSCAN Clustering
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(data_scaled)
    # The metrics need at least two distinct labels; otherwise -1 is stored
    # as a sentinel.
    # NOTE(review): DBSCAN marks noise points with label -1, and that label is
    # counted as a distinct label here - confirm this is intended.
    n_dbscan_labels = len(set(dbscan_labels))
    if n_dbscan_labels <= 1:
        silhouette_score_dbscan = -1
        davies_bouldin_score_dbscan = -1
    else:
        silhouette_score_dbscan = silhouette_score(data_scaled, dbscan_labels)
        davies_bouldin_score_dbscan = davies_bouldin_score(data_scaled, dbscan_labels)
    dbscan_accuracy = max(0, silhouette_score_dbscan) * 100

    # Write the results back in one vectorized assignment per column.
    # Per-row labels are aligned positionally with `sample_ids`; the metric
    # scalars are broadcast to every clustered row.
    set_results = {
        f'KMeans{i}_Cluster': kmeans_labels,
        f'KMeans{i}_Silhouette_Score': silhouette_score_kmeans,
        f'KMeans{i}_Davies_Bouldin_Index': davies_bouldin_score_kmeans,
        f'KMeans{i}_Calinski_Harabasz_Score': calinski_harabasz_score_kmeans,
        f'KMeans{i}_Prediction_Accuracy': kmeans_accuracy,
        f'DBSCAN{i}_Cluster': dbscan_labels,
        f'DBSCAN{i}_Silhouette_Score': silhouette_score_dbscan,
        f'DBSCAN{i}_Davies_Bouldin_Index': davies_bouldin_score_dbscan,
        f'DBSCAN{i}_Prediction_Accuracy': dbscan_accuracy,
    }
    for column_name, column_values in set_results.items():
        clustering_results.loc[sample_ids, column_name] = column_values

# Save clustering results to CSV
clustering_results.to_csv('Best_clustering_Models.csv', index=False)
# Display formatted message for saved file
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Clustering results saved as <span style="color: green;">Best_clustering_Models.csv </span>.
    </p>
"""))