<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_FE_MCA/blob/main/Code%20Sections/5.8%20Summary%20Table%20of%20Clustering%20Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.8 Summary Table of Clustering Models - Approach 3**

In [7]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import asyncio
!pip install dataframe_image -qqq
import dataframe_image as dfi
from IPython.display import display, HTML
from google.colab import files

print("\n\n")

# Silence the two warning categories that would otherwise clutter the cell output.
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

# Results of all clustering models from Approach 3, read straight from the raw CSV URL.
url = "https://raw.githubusercontent.com/mohammadbadi/Clustering_FE_MCA/refs/heads/main/Output_CSV/Feature_Combo_New_Results.csv"
data = pd.read_csv(url)

# Metrics whose 100 best rows are tallied below.
metrics = [
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'DBSCAN Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Davies-Bouldin Index',
]

top_results = {}            # metric name -> DataFrame holding that metric's 100 best rows
feature_set_summary = {}    # feature set -> {'Count': hits, 'Found In': [metric names]}

for metric in metrics:
    # Metrics that are not present in the loaded CSV are skipped silently.
    if metric not in data.columns:
        continue

    # Davies-Bouldin is "lower is better"; every other metric is "higher is better".
    select_best = data.nsmallest if 'Davies-Bouldin' in metric else data.nlargest
    top_rows = select_best(100, metric)
    top_results[metric] = top_rows

    # Record one hit per appearance of a feature set in this metric's top rows,
    # remembering which metric produced the hit.
    for feature_set in top_rows['Features']:
        entry = feature_set_summary.setdefault(feature_set, {'Count': 0, 'Found In': []})
        entry['Count'] += 1
        entry['Found In'].append(metric)

# One row per feature set: how many top-100 lists it appeared in and which ones.
summary_df = pd.DataFrame.from_dict(feature_set_summary, orient='index')
summary_df.reset_index(inplace=True)
summary_df.columns = ['Features', 'Count', 'Found In']
summary_df.sort_values(by='Count', ascending=False, inplace=True)                 # Most frequently ranked feature sets first

# Attach every metric value (taken from the first matching CSV row) to each
# feature set. Rows are collected in a plain list and turned into a DataFrame
# once: growing a DataFrame with pd.concat inside the loop re-copies the whole
# frame on every iteration and needed a special case for the empty first step.
final_columns = ['Set Number', 'Features', 'Count', *metrics]
final_rows = []
for feature_set_name, count in zip(summary_df['Features'], summary_df['Count']):
    metrics_row = data[data['Features'] == feature_set_name]
    if metrics_row.empty:                                                         # No matching CSV row: nothing to report
        continue
    final_rows.append({
        'Set Number': metrics_row['Set Number'].values[0],                        # Set Number as stored in the CSV
        'Features': feature_set_name,
        'Count': count,
        **{metric: metrics_row[metric].values[0] for metric in metrics},
    })

# Passing the column list keeps the expected columns even when no rows were collected.
final_metrics_df = pd.DataFrame(final_rows, columns=final_columns)
final_metrics_df.sort_values(by='Count', ascending=False, inplace=True)           # Sort the final metrics DataFrame by Count

# Column order used by the tables below: identifiers, KMeans metrics, DBSCAN metrics.
desired_order = ['Set Number', 'Features', 'Count',
                 'KMeans Silhouette Score', 'KMeans Calinski-Harabasz Score', 'KMeans Davies-Bouldin Index',
                 'DBSCAN Silhouette Score', 'DBSCAN Davies-Bouldin Index']
final_metrics_df = final_metrics_df[desired_order]

                                                                                  # Create Table: Top 7 Models in Approach 3
                                                                                  # Top 3 models based on highest KMeans Silhouette Score
# Three best rows by KMeans silhouette and four best rows by DBSCAN silhouette.
top3_kmeans = final_metrics_df.nlargest(3, 'KMeans Silhouette Score')
top4_dbscan = final_metrics_df.nlargest(4, 'DBSCAN Silhouette Score')

# Stack both selections; a row picked by both rankings is kept only once.
top7_df = pd.concat([top3_kmeans, top4_dbscan])
top7_df = top7_df.drop_duplicates()
top7_df = top7_df.reset_index(drop=True)
top7_df = top7_df[desired_order]

# Display formats: two decimals for silhouette / Davies-Bouldin, none for Calinski-Harabasz.
top7_number_formats = {
    'KMeans Silhouette Score': '{:.2f}',
    'DBSCAN Silhouette Score': '{:.2f}',
    'KMeans Davies-Bouldin Index': '{:.2f}',
    'DBSCAN Davies-Bouldin Index': '{:.2f}',
    'KMeans Calinski-Harabasz Score': '{:.0f}',
}

# CSS rules for header cells, body cells and the table element itself.
top7_table_styles = [
    {
        'selector': 'th',
        'props': [
            ('background-color', '#4CAF50'),
            ('color', 'white'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('border', '1px solid black'),
        ],
    },
    {
        'selector': 'td',
        'props': [
            ('padding', '10px'),
            ('text-align', 'center'),
            ('border', '1px solid black'),
        ],
    },
    {
        'selector': 'table',
        'props': [('border-collapse', 'collapse')],
    },
]

styled_top7 = (
    top7_df.style
    .set_caption("<h3 style='color: navy; text-align: center;'>Top 7 Models in Approach 3</h3>")
    .format(top7_number_formats)
    .set_table_styles(top7_table_styles)
    .set_properties(**{'border': '1px solid black'})
)

display(HTML("<h2 style='color: green; font-size: 20px;'><b>Top 7 Models in Approach 3:</b></h2>"))
display(styled_top7)

# Render the plain (unstyled) Top 7 frame to a PNG and hand it to the browser for download.
dfi.export(top7_df, 'Top 7 models in Approach 3.png', table_conversion='matplotlib', max_rows=-1)
files.download('Top 7 models in Approach 3.png')

                                                                                  #Create Table - display Outputs of ALL Clustering Models in Approach 3
                                                                                  # Define a helper function to highlight the top 5 values correctly
def highlight_top5(s, ascending=True):
    """Return per-cell CSS that highlights the five best distinct values of a column.

    Args:
        s: Column of values (a pandas Series) to be styled.
        ascending: ``True`` when smaller values are better (e.g. Davies-Bouldin
            Index), ``False`` when larger values are better.

    Returns:
        A list with one CSS string per element of ``s``:
        ``'background-color: lightgreen'`` for every cell whose value is one of
        the five best distinct values, ``''`` for all other cells. Missing
        values are never highlighted.
    """
    # De-duplicate BEFORE taking the first five: truncating first lets tied
    # values occupy several of the five slots, so fewer than five distinct
    # values would end up highlighted.
    best_values = (
        s.dropna()
        .drop_duplicates()
        .sort_values(ascending=ascending)
        .head(5)
    )
    is_best = s.isin(best_values)                                                 # NaN never matches: best_values holds no NaN
    return ['background-color: lightgreen' if flag else '' for flag in is_best]

                                                                                  # Apply highlighting with correct order:
                                                                                  # For silhouette and Calinski-Harabasz, higher is better (descending sort)
                                                                                  # For Davies-Bouldin Index, lower is better (ascending sort)
# (column, ascending) pairs: ascending=False highlights the largest values,
# ascending=True highlights the smallest ones.
highlight_rules = [
    ('KMeans Silhouette Score', False),
    ('DBSCAN Silhouette Score', False),
    ('KMeans Davies-Bouldin Index', True),
    ('DBSCAN Davies-Bouldin Index', True),
    ('KMeans Calinski-Harabasz Score', False),
]

# Display formats: two decimals for silhouette / Davies-Bouldin, none for Calinski-Harabasz.
summary_number_formats = {
    'KMeans Silhouette Score': '{:.2f}',
    'DBSCAN Silhouette Score': '{:.2f}',
    'KMeans Davies-Bouldin Index': '{:.2f}',
    'DBSCAN Davies-Bouldin Index': '{:.2f}',
    'KMeans Calinski-Harabasz Score': '{:.0f}',
}

# CSS rules for header cells, body cells and the table element itself.
summary_table_styles = [
    {
        'selector': 'th',
        'props': [
            ('background-color', '#4CAF50'),
            ('color', 'white'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('border', '1px solid black'),
        ],
    },
    {
        'selector': 'td',
        'props': [
            ('padding', '10px'),
            ('text-align', 'center'),
            ('border', '1px solid black'),
        ],
    },
    {
        'selector': 'table',
        'props': [('border-collapse', 'collapse')],
    },
]

styled_table = final_metrics_df.style
for column_name, sort_ascending in highlight_rules:
    # The direction is handed over as a keyword argument, which the Styler
    # forwards to highlight_top5 when the table is rendered.
    styled_table = styled_table.apply(highlight_top5, ascending=sort_ascending, subset=[column_name])

styled_table = styled_table.format(summary_number_formats)
styled_table = styled_table.set_table_styles(summary_table_styles)
styled_table = styled_table.set_properties(**{'border': '1px solid black'})
styled_table = styled_table.set_caption("<h3 style='color: navy; text-align: center;'>📊 Metrics Summary Table</h3>")

display(HTML("<h2 style='color: green; font-size: 20px;'><b>Styled Metrics Summary Table:</b></h2>"))
display(styled_table)

# Write the underlying data (the frame behind the Styler, without styling) to Excel.
styled_table.data.to_excel('metrics_summary_table.xlsx', index=False)

display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Metrics summary table has been saved to <span style="color: green;">'metrics_summary_table.xlsx'</span>.
    </p>
"""))

# Render the same underlying data to a PNG, then offer both files for download.
dfi.export(styled_table.data, 'metrics_summary_table.png', table_conversion='matplotlib', max_rows=-1)
files.download('metrics_summary_table.png')
files.download('metrics_summary_table.xlsx')







Unnamed: 0,Set Number,Features,Count,KMeans Silhouette Score,KMeans Calinski-Harabasz Score,KMeans Davies-Bouldin Index,DBSCAN Silhouette Score,DBSCAN Davies-Bouldin Index
0,557,"['OCC_DOY', 'OCC_MONTH_Num', 'HOOD_Freq', 'LOCATION_Freq']",4,0.49,4409,0.72,0.51,0.85
1,562,"['OCC_DOY', 'OCC_MONTH_Num', 'DIV_HOOD_Hier', 'Loca_Premi_Freq']",4,0.49,4369,0.73,0.52,0.85
2,561,"['OCC_DOY', 'OCC_MONTH_Num', 'DIV_HOOD_Hier', 'PREMISES_Freq']",4,0.48,4388,0.73,0.52,0.85
3,7,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'LOCATION_Freq']",5,0.37,3318,0.96,0.53,0.62
4,9,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'Loca_Premi_Freq']",5,0.4,3252,0.89,0.53,0.63
5,8,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'PREMISES_Freq']",5,0.39,3251,0.89,0.52,0.63
6,4,"['LONG_WGS84', 'LAT_WGS84', 'HOOD_Freq', 'LOCATION_Freq']",5,0.39,3285,0.87,0.52,0.63


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Set Number,Features,Count,KMeans Silhouette Score,KMeans Calinski-Harabasz Score,KMeans Davies-Bouldin Index,DBSCAN Silhouette Score,DBSCAN Davies-Bouldin Index
0,574,"['OCC_DOY', 'OCC_MONTH_Num', 'LONG_LAT_PCA', 'DIV_HOOD_Hier']",5,0.4,4062,0.84,0.51,0.73
1,6,"['LONG_WGS84', 'LAT_WGS84', 'HOOD_Freq', 'Loca_Premi_Freq']",5,0.39,3188,0.9,0.52,0.64
2,7,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'LOCATION_Freq']",5,0.37,3318,0.96,0.53,0.62
3,8,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'PREMISES_Freq']",5,0.39,3251,0.89,0.52,0.63
4,150,"['DOW_Category_Freq', 'LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier']",5,0.38,3495,0.9,0.51,0.66
5,146,"['DOW_Category_Freq', 'LONG_WGS84', 'LAT_WGS84', 'HOOD_Freq']",5,0.38,3422,0.91,0.51,0.67
6,9,"['LONG_WGS84', 'LAT_WGS84', 'DIV_HOOD_Hier', 'Loca_Premi_Freq']",5,0.4,3252,0.89,0.53,0.63
7,4,"['LONG_WGS84', 'LAT_WGS84', 'HOOD_Freq', 'LOCATION_Freq']",5,0.39,3285,0.87,0.52,0.63
8,570,"['OCC_DOY', 'OCC_MONTH_Num', 'LONG_LAT_PCA', 'HOOD_Freq']",5,0.39,3975,0.85,0.51,0.74
9,13,"['OCC_YEAR', 'LONG_LAT_PCA', 'HOOD_Freq', 'LOCATION_Freq']",5,0.34,2409,1.04,0.5,0.76


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>