<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/6.0%20Descriptive%20Statistics%20Best%20Clustering%20Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **6.0 Descriptive Statistics - Best Clustering Models**

In [None]:
import pandas as pd
import ast
from IPython.display import display, HTML
from google.colab import files

# Clustering metrics: one column per model/set/metric (e.g. KMeans1_Silhouette_Score).
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Best_Clustering_Results.csv"
clustering_results = pd.read_csv(url)

# Feature combinations: looked up below via the 'Feature Set' and
# 'Feature_Names_String' columns.
url1 = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
feature_combos = pd.read_csv(url1)

# Feature-set identifiers to report on. Their order matters: the k-th name
# (1-based) is paired with the "Set k" row and the KMeans{k}_*/DBSCAN{k}_*
# columns when the summary records are built.
set_names = ['4_Set_165', '4_Set_369', '4_Set_490', '4_Set_494', '4_Set_495']

# feature_sets[k] must always describe set_names[k]. The list is indexed by
# position later on, so a set that cannot be found gets an empty placeholder
# instead of being skipped -- skipping would shift every following set onto
# the wrong row (or run the list short and raise IndexError).
feature_sets = []
for set_name in set_names:
    matched_features = feature_combos.loc[
        feature_combos['Feature Set'] == set_name, 'Feature_Names_String'
    ]
    if matched_features.empty:
        print(
            f"Warning: feature set '{set_name}' was not found in the feature "
            "combinations; its feature list is left empty."
        )
        feature_sets.append([])
        continue
    # The column holds the string form of a Python list; literal_eval parses
    # literals only, so no arbitrary code from the CSV is executed.
    feature_sets.append(ast.literal_eval(matched_features.iloc[0]))

def _first_value(column):
    """Return the value in the first row of the given clustering_results column."""
    return clustering_results[column].iloc[0]


# Assemble one summary record per set; the metric columns are numbered 1..5.
combined_results = []
for i in range(1, 6):
    # KMeans metrics
    km_silhouette = _first_value(f'KMeans{i}_Silhouette_Score')
    km_davies_bouldin = _first_value(f'KMeans{i}_Davies_Bouldin_Index')
    km_calinski_harabasz = _first_value(f'KMeans{i}_Calinski_Harabasz_Score')
    km_accuracy = _first_value(f'KMeans{i}_Prediction_Accuracy')

    # DBSCAN metrics (no Calinski-Harabasz score is read for DBSCAN)
    db_silhouette = _first_value(f'DBSCAN{i}_Silhouette_Score')
    db_davies_bouldin = _first_value(f'DBSCAN{i}_Davies_Bouldin_Index')
    db_accuracy = _first_value(f'DBSCAN{i}_Prediction_Accuracy')

    # Keys are inserted in display order; accuracies are rendered as bold
    # percentages with two decimals, the other scores as plain numbers.
    record = {}
    record["Set"] = f"Set {i}"
    record["Features"] = ', '.join(feature_sets[i - 1])
    record["KMeans Silhouette Score"] = f"{km_silhouette:.3f}"
    record["KMeans Davies-Bouldin Index"] = f"{km_davies_bouldin:.3f}"
    record["KMeans Calinski-Harabasz Score"] = f"{km_calinski_harabasz:.0f}"
    record["KMeans Prediction Accuracy"] = f"<strong>{km_accuracy:.2f}%</strong>"
    record["DBSCAN Silhouette Score"] = f"{db_silhouette:.3f}"
    record["DBSCAN Davies-Bouldin Index"] = f"{db_davies_bouldin:.3f}"
    record["DBSCAN Prediction Accuracy"] = f"<strong>{db_accuracy:.2f}%</strong>"
    # Raw numeric copy of the DBSCAN accuracy, kept only as the sort key.
    record["DBSCAN Accuracy Float"] = db_accuracy
    combined_results.append(record)

# One row per set, ordered from highest to lowest DBSCAN prediction accuracy.
df_combined = pd.DataFrame(combined_results)
df_sorted = df_combined.sort_values(by="DBSCAN Accuracy Float", ascending=False)

# Static opening of the table: title row, column headings and the <tbody> tag.
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
  <thead style='background-color: #4CAF50; color: white;'>
    <tr>
      <th colspan="9" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
        <strong>Clustering Summary Table</strong>
      </th>
    </tr>
    <tr>
      <th>Set</th>
      <th>Features</th>
      <th>KMeans Silhouette Score</th>
      <th>Davies-Bouldin Index</th>
      <th>Calinski-Harabasz Score</th>
      <th>KMeans Prediction Accuracy (%)</th>
      <th>DBSCAN Silhouette Score</th>
      <th>Davies-Bouldin Index</th>
      <th>DBSCAN Prediction Accuracy (%)</th>
    </tr>
  </thead>
  <tbody>
"""

# Columns rendered in each body row, in display order. The numeric
# "DBSCAN Accuracy Float" sort key is intentionally not shown.
display_columns = [
    'Set',
    'Features',
    'KMeans Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'KMeans Calinski-Harabasz Score',
    'KMeans Prediction Accuracy',
    'DBSCAN Silhouette Score',
    'DBSCAN Davies-Bouldin Index',
    'DBSCAN Prediction Accuracy',
]
row_open = "\n    <tr style='border: 1px solid #dddddd;'>\n"
cell_template = "      <td style='border: 1px solid #dddddd; padding: 8px;'>{}</td>\n"
row_close = "    </tr>\n    "

# Render every sorted row, then append all rows and the closing tags at once.
body_rows = []
for values in df_sorted[display_columns].itertuples(index=False, name=None):
    cells = "".join(cell_template.format(value) for value in values)
    body_rows.append(row_open + cells + row_close)
html_table += "".join(body_rows) + "</tbody></table>"

# Save the HTML table to a file. The download is started only after the
# `with` block has closed the file, so all buffered content is on disk
# before Colab reads it (downloading from inside the block could deliver an
# empty or truncated file).
output_filename = "Best_Clusters_Summary_Results.html"
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(html_table)
files.download(output_filename)

# Display the HTML table in Google Colab
display(HTML(html_table))

# Display formatted message for saved file
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
        Clustering Model summary results saved  as <span style="color: green;">Best_Clusters_Summary_Results.html</span>.
    </p>
"""))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table,Clustering Summary Table
Set,Features,KMeans Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Score,KMeans Prediction Accuracy (%),DBSCAN Silhouette Score,Davies-Bouldin Index.1,DBSCAN Prediction Accuracy (%)
Set 4,"Hood_158_Encoded, Location_Engineered_Other, Location_Engineered_Public, Location_Engineered_Residential",0.862,0.387,220520,86.25%,0.892,0.115,89.19%
Set 5,"Division_Encoded, Location_Engineered_Other, Location_Engineered_Public, Location_Engineered_Residential",0.7,0.408,126299,70.01%,0.779,0.34,77.94%
Set 2,"OCC_HOUR, Location_Engineered_Other, Location_Engineered_Public, Location_Engineered_Residential",0.64,0.463,90424,63.99%,0.664,0.555,66.42%
Set 1,"OCC_YEAR, Location_Engineered_Other, Location_Engineered_Public, Location_Engineered_Residential",0.654,0.539,58296,65.37%,0.661,0.518,66.05%
Set 3,"OCC_DOW_Encoded, Location_Engineered_Other, Location_Engineered_Public, Location_Engineered_Residential",0.645,0.445,103447,64.47%,0.65,0.532,65.03%
