<a href="https://colab.research.google.com/github/mohammadbadi/CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.8%20Summary%20Table%20of%20Clustering%20Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Summary Table of Clustering Models - Approach_3**

In [11]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import asyncio
!pip install dataframe_image -qqq
import dataframe_image as dfi
from IPython.display import display, HTML
from google.colab import files

print("\n\n")
# Silence deprecation and future-change notices so they do not clutter the cell output
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Load the per-feature-set clustering results from the CSV hosted on GitHub
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
data = pd.read_csv(url)

# Metric columns used for ranking; the list order is also the column order of the tables built later
metrics = [
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'DBSCAN Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Davies-Bouldin Index'
]

top_results = {}            # metric name -> the 100 best rows of `data` for that metric
feature_set_summary = {}    # feature set -> {'Count': number of top-100 hits, 'Found In': [metric, ...]}

# For every metric, take its 100 best rows and tally which feature sets appear in them
for metric in metrics:
    if metric not in data.columns:
        continue                                                                  # Metric absent from the CSV: nothing to rank
    # Davies-Bouldin is a lower-is-better index; every other metric is higher-is-better
    pick_best = data.nsmallest if 'Davies-Bouldin' in metric else data.nlargest
    top_rows = pick_best(100, metric)
    top_results[metric] = top_rows
    for feature_set in top_rows['Feature Set']:
        entry = feature_set_summary.setdefault(feature_set, {'Count': 0, 'Found In': []})
        entry['Count'] += 1
        entry['Found In'].append(metric)

# Turn the tally into a table (one row per feature set), most frequently ranked feature sets first
summary_df = pd.DataFrame.from_dict(feature_set_summary, orient='index').reset_index()
summary_df.columns = ['Feature Set', 'Count', 'Found In']
summary_df = summary_df.sort_values(by='Count', ascending=False)

# Prepare the final metrics DataFrame with an extra 'Features' column.
# One row per ranked feature set: its feature names, its top-100 hit count and its metric values,
# taken from the first row of `data` carrying that feature set name.
# Rows are collected in a list and converted to a DataFrame once, instead of growing the frame with
# pd.concat on every iteration (which re-copies all previous rows each time and, in recent pandas
# versions, warns about concatenating onto an empty frame).
final_metrics_columns = ['Feature Set', 'Features', 'Count', *metrics]
final_metrics_rows = []
for feature_set_name, feature_set_count in zip(summary_df['Feature Set'], summary_df['Count']):
    metrics_row = data[data['Feature Set'] == feature_set_name]
    if metrics_row.empty:
        continue                                                                  # No matching row in the source data: skip
    final_metrics_rows.append({
        'Feature Set': feature_set_name,
        'Features': metrics_row['Feature_Names_String'].values[0],  # Get value from Feature_Names_String column
        'Count': feature_set_count,
        **{metric: metrics_row[metric].values[0] for metric in metrics}
    })

# Passing `columns` keeps the expected schema even when no row was collected
final_metrics_df = pd.DataFrame(final_metrics_rows, columns=final_metrics_columns)

# Highest count first; the original index labels are kept, so they no longer run 0..n-1 after sorting
final_metrics_df.sort_values(by='Count', ascending=False, inplace=True)

# Define a function to highlight the top 5 unique values for a given metric
def highlight_best_top5(s, metric):
    """Return per-cell CSS that marks the five best distinct values of a column.

    Args:
        s: The column (pandas Series) being styled.
        metric: Name of the metric held in ``s``. A name containing
            'Davies-Bouldin' is ranked lowest-first; any other name is
            ranked highest-first.

    Returns:
        A list with one entry per element of ``s``:
        'background-color: lightgreen' when the value is among the first five
        distinct values in ranked order, otherwise an empty string. Duplicated
        values share a rank, so more than five cells can be highlighted.
    """
    lower_is_better = 'Davies-Bouldin' in metric
    ranked = s.sort_values(ascending=lower_is_better)
    best_values = ranked.unique()[:5]

    css = []
    for value in s:
        if value in best_values:
            css.append('background-color: lightgreen')
        else:
            css.append('')
    return css

# Style the full metrics summary table Approach_3:
#   - highlight, per metric column, the five best distinct values (see highlight_best_top5)
#   - show metric values with two decimals
#   - green header, centred padded cells, 1px black cell borders and a caption
styled_table = final_metrics_df.style
for highlighted_metric in (
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Silhouette Score',
    'DBSCAN Davies-Bouldin Index',
):
    # Each call styles only its own column, so the order of the calls does not affect the result
    styled_table = styled_table.apply(highlight_best_top5, metric=highlighted_metric, subset=[highlighted_metric])

styled_table = (
    styled_table
    .format({metric: '{:.2f}' for metric in metrics})
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4CAF50'),
                                     ('color', 'white'),
                                     ('font-weight', 'bold'),
                                     ('text-align', 'center')]},
        {'selector': 'td', 'props': [('padding', '10px'),
                                     ('text-align', 'center')]},
        # Styler emits plain <tr> rows (cells carry classes such as "row0"), so the previous
        # '.row:hover' selector matched nothing; 'tr:hover' is the selector that targets a hovered row
        {'selector': 'tr:hover', 'props': [('background-color', '#f1f1f1')]}
    ])
    .set_properties(**{'border': '1px solid black'})
    .set_caption("<h3 style='color: navy; text-align: center;'>📊 Metrics Summary Table Approach_3</h3>")
)

# Export the summary table to Excel and PNG, then trigger browser downloads of both files.
# Note: `.data` is the DataFrame underneath the Styler, so the exported files contain the raw
# values without the highlighting/formatting defined on `styled_table`.
summary_excel_path = '5.8 metrics_summary_table_Approach_3.xlsx'
summary_png_path = '5.8 metrics_summary_table_Approach_3.png'
summary_data = styled_table.data

summary_data.to_excel(summary_excel_path, index=False)
dfi.export(summary_data, summary_png_path, table_conversion='matplotlib', max_rows=-1)
for exported_path in (summary_png_path, summary_excel_path):
    files.download(exported_path)

# Extra table: the first five rows of the count-sorted summary, styled like the full table.
# With at most five rows, every value of a metric column is within its five best distinct values,
# so the highlight applies to effectively every metric cell here.
top5_df = final_metrics_df.head(5).copy()

top5_header_props = [('background-color', '#4CAF50'),
                     ('color', 'white'),
                     ('font-weight', 'bold'),
                     ('text-align', 'center'),
                     ('border', '1px solid black')]
top5_cell_props = [('padding', '10px'),
                   ('text-align', 'center'),
                   ('border', '1px solid black')]

styled_top5 = top5_df.style
for top5_metric in (
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Silhouette Score',
    'DBSCAN Davies-Bouldin Index',
):
    # Each call styles only its own column
    styled_top5 = styled_top5.apply(highlight_best_top5, metric=top5_metric, subset=[top5_metric])

styled_top5 = (
    styled_top5
    .format({metric: '{:.2f}' for metric in metrics})
    .set_table_styles([
        {'selector': 'th', 'props': top5_header_props},
        {'selector': 'td', 'props': top5_cell_props},
        {'selector': 'table', 'props': [('border-collapse', 'collapse')]}
    ])
    .set_properties(**{'border': '1px solid black'})
    .set_caption("<h3 style='color: navy; text-align: center;'>Top 5 Models in Approach_3</h3>")
)

# Render both styled tables in the cell output: the top-5 table first, then the full summary
for rendered_table in (styled_top5, styled_table):
    display(rendered_table)

# Export the top-5 table's underlying (unstyled) data as a PNG file and download it
top5_png_path = 'Top 5 models in Approach_3.png'
dfi.export(styled_top5.data, top5_png_path, table_conversion='matplotlib', max_rows=-1)
files.download(top5_png_path)

# Confirmation message naming the two exported summary files
# (fixes the misspelling "Appraoch_3" in the displayed text)
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Metrics summary table Approach_3 has been saved to <span style="color: green;">'5.8 metrics_summary_table_Approach_3.xlsx'</span>
         and <span style="color: green;">'5.8 metrics_summary_table_Approach_3.png'</span>.
    </p>
"""))







<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Feature Set,Features,Count,KMeans Silhouette Score,KMeans Calinski-Harabasz Score,DBSCAN Silhouette Score,KMeans Davies-Bouldin Index,DBSCAN Davies-Bouldin Index
0,4_Set_494,"['Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.86,23082.3,0.89,0.38,0.11
18,5_Set_665,"['OCC_HOUR', 'Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.49,3653.75,0.62,0.81,0.59
20,5_Set_631,"['OCC_HOUR', 'LAT_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,4210.35,0.5,0.71,0.8
21,5_Set_596,"['OCC_HOUR', 'LONG_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,4138.07,0.48,0.75,0.74
22,4_Set_492,"['Hood_158_Encoded', 'Division_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Residential']",5,0.51,3771.5,0.63,0.88,0.5


Unnamed: 0,Feature Set,Features,Count,KMeans Silhouette Score,KMeans Calinski-Harabasz Score,DBSCAN Silhouette Score,KMeans Davies-Bouldin Index,DBSCAN Davies-Bouldin Index
0,4_Set_494,"['Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.86,23082.3,0.89,0.38,0.11
18,5_Set_665,"['OCC_HOUR', 'Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.49,3653.75,0.62,0.81,0.59
20,5_Set_631,"['OCC_HOUR', 'LAT_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,4210.35,0.5,0.71,0.8
21,5_Set_596,"['OCC_HOUR', 'LONG_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,4138.07,0.48,0.75,0.74
22,4_Set_492,"['Hood_158_Encoded', 'Division_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Residential']",5,0.51,3771.5,0.63,0.88,0.5
24,5_Set_785,"['OCC_Month_Encoded', 'Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,3887.97,0.61,0.79,0.59
25,5_Set_539,"['OCC_DOY', 'Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,3895.49,0.61,0.79,0.59
26,5_Set_790,"['OCC_DOW_Encoded', 'Hood_158_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.5,3886.83,0.61,0.77,0.59
27,5_Set_701,"['LONG_WGS84', 'LAT_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.45,4298.27,0.5,0.76,0.79
28,5_Set_295,"['OCC_YEAR', 'LAT_WGS84', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential']",5,0.45,4136.08,0.48,0.84,0.87


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>