<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.8%20Summary%20Table%20of%20Clustering%20Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Summary Table of Clustering Models**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import asyncio
!pip install dataframe_image -qqq
import dataframe_image as dfi
from IPython.display import display, HTML
from google.colab import files

print("\n\n")

# Silence deprecation / future-change notices so they do not clutter the cell output.
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Load the results table directly from its raw GitHub URL.
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
data = pd.read_csv(url)

# Metric columns that are ranked further down; the order here is also the
# column order of the final summary table.
metrics = [
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'DBSCAN Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Davies-Bouldin Index',
]

TOP_N = 100                                                                       # Number of best rows taken for each metric

top_results = {}                                                                  # metric name -> DataFrame holding its TOP_N best rows
feature_set_summary = {}                                                          # feature set -> {'Count': int, 'Found In': [metric, ...]}

for metric in metrics:
    if metric not in data.columns:                                                # Metrics missing from the CSV are skipped
        continue

    # Davies-Bouldin columns are ranked lowest-first; every other metric is ranked highest-first.
    if 'Davies-Bouldin' in metric:
        top_rows = data.nsmallest(TOP_N, metric)
    else:
        top_rows = data.nlargest(TOP_N, metric)
    top_results[metric] = top_rows

    # Only the 'Feature Set' column is needed for the tally, so iterate that
    # column directly instead of materialising a Series per row with iterrows().
    for feature_set in top_rows['Feature Set']:
        entry = feature_set_summary.setdefault(feature_set, {'Count': 0, 'Found In': []})
        entry['Count'] += 1                                                       # One more top-N list this feature set appears in
        entry['Found In'].append(metric)                                          # Remember which metric's list it was

# Build one record per feature set. Naming the columns explicitly keeps the
# frame well-formed (three named, empty columns) even when no feature set was
# tallied; from_dict(orient='index') + reset_index() yields a single column in
# that case, so assigning three column names raises a length-mismatch error.
summary_df = pd.DataFrame(
    [
        {'Feature Set': name, 'Count': info['Count'], 'Found In': info['Found In']}
        for name, info in feature_set_summary.items()
    ],
    columns=['Feature Set', 'Count', 'Found In'],
)
summary_df.sort_values(by='Count', ascending=False, inplace=True)                 # Most frequently top-ranked feature sets first
final_metrics_df = pd.DataFrame(columns=['Feature Set', 'Count', *metrics])       # Empty frame with the final table's column layout

# Collect one record per feature set and build the frame in a single call.
# Growing a DataFrame with pd.concat inside the loop recopies every row gathered
# so far on each iteration and needs a special case for the initially empty
# frame; a plain list of dicts avoids both.
collected_rows = []
for feature_set_name, count in zip(summary_df['Feature Set'], summary_df['Count']):
    metrics_row = data[data['Feature Set'] == feature_set_name]
    if metrics_row.empty:                                                         # No matching row in the source data: nothing to report
        continue
    collected_rows.append({
        'Feature Set': feature_set_name,
        'Count': count,
        # When several rows share a feature set name, the first one supplies the values.
        **{metric: metrics_row[metric].values[0] for metric in metrics},
    })

# Explicit columns keep the layout stable even if no rows were collected.
final_metrics_df = pd.DataFrame(collected_rows, columns=['Feature Set', 'Count', *metrics])
final_metrics_df.sort_values(by='Count', ascending=False, inplace=True)           # Sort the final metrics DataFrame by Count

def highlight_best_top5(s, metric):
    """Return one CSS string per element of *s*, marking the five best distinct values.

    Args:
        s: Series of values for a single metric column.
        metric: Name of the metric; names containing 'Davies-Bouldin' are
            ranked lowest-first, all others highest-first.

    Returns:
        A list as long as *s* holding 'background-color: lightgreen' for every
        element equal to one of the five best distinct values and '' otherwise.
    """
    lower_is_better = 'Davies-Bouldin' in metric
    ranked = s.sort_values(ascending=lower_is_better)
    best_values = ranked.unique()[:5]                                             # Distinct values, best first, capped at five

    styles = []
    for value in s:
        if value in best_values:
            styles.append('background-color: lightgreen')
        else:
            styles.append('')
    return styles
                                                                                  # Apply highlighting for all metrics
# Highlight the five best distinct values in every metric column. Looping over
# `metrics` keeps the highlighted columns in step with the metric list instead
# of repeating one hand-written .apply() call per column.
styled_table = final_metrics_df.style
for metric in metrics:
    styled_table = styled_table.apply(highlight_best_top5, metric=metric, subset=[metric])

styled_table = (
    styled_table
    .format({metric: '{:.2f}' for metric in metrics})                             # Two decimals for every metric column
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white'), ('font-weight', 'bold')]},
        {'selector': 'td', 'props': [('padding', '10px'), ('text-align', 'center')]},
        # The Styler output carries classes such as "row0"/"row_heading" but never a
        # bare "row", so '.row:hover' matched nothing; target the <tr> elements instead.
        {'selector': 'tr:hover', 'props': [('background-color', '#f1f1f1')]},
    ])
    .set_properties(**{'border': '1px solid black'})
    .set_caption("<h3 style='color: navy; text-align: center;'>📊 Metrics Summary Table</h3>")
)

# Output file names, defined once so the saved files, the on-screen message and
# the downloads can never disagree.
excel_path = '5.8 metrics_summary_table.xlsx'
png_path = '5.8 metrics_summary_table.png'

display(HTML("<h2 style='color: green; font-size: 20px;'><b> Styled Metrics Summary Table:</b></h2>"))  # Display styled table with formatted title
display(styled_table)

# styled_table.data is the plain DataFrame behind the Styler, so the workbook
# contains the table values without the highlighting/CSS shown above.
styled_table.data.to_excel(excel_path, index=False)

# Confirmation message; it interpolates the real file name (the previous text
# omitted the "5.8 " prefix and so named a file that was never written).
display(HTML(f"""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Metrics summary table has been saved to <span style="color: green;">'{excel_path}'</span>.
    </p>
"""))
dfi.export(styled_table.data, png_path, table_conversion='matplotlib', max_rows=-1)  # Render the plain table to a PNG image
files.download(png_path)                                                          # Download the results in PNG file
files.download(excel_path)                                                        # Download the results in Excel File

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/6.7 MB[0m [31m14.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/6.7 MB[0m [31m47.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.7/6.7 MB[0m [31m73.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.7/385.7 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h




Unnamed: 0,Feature Set,Count,KMeans Silhouette Score,KMeans Calinski-Harabasz Score,DBSCAN Silhouette Score,KMeans Davies-Bouldin Index,DBSCAN Davies-Bouldin Index
0,4_Set_494,5,0.86,23082.3,0.89,0.38,0.11
1,4_Set_369,5,0.73,12051.47,0.66,0.34,0.55
2,4_Set_495,5,0.7,12466.24,0.78,0.4,0.35
3,4_Set_165,5,0.68,11537.69,0.66,0.42,0.52
4,4_Set_490,5,0.65,10295.54,0.65,0.44,0.53
5,5_Set_735,5,0.63,6087.93,0.65,0.55,0.39
6,4_Set_480,5,0.63,8676.9,0.65,0.45,0.53
7,4_Set_285,5,0.63,8741.96,0.64,0.45,0.53
8,4_Set_460,5,0.62,10161.53,0.65,0.45,0.5
9,4_Set_425,5,0.61,9328.52,0.65,0.48,0.52


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>