<a href="https://colab.research.google.com/github/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/blob/main/Code%20for%20Approach%203%20of%20CrimeAnalytics%20Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### **5.0 Loading Libraries and Major Crime Indicator Dataset from TPS**

In [None]:
                                                                                  # Import necessary libraries
import itertools
import os
import pandas as pd
import time
import kagglehub
import warnings
from IPython.display import display, HTML
from kagglehub import KaggleDatasetAdapter
from google.colab import files

# Quietly install the optional packages used below:
# openpyxl (Excel support) and tabulate (cleaner table output).
for _package in ("openpyxl", "tabulate"):
    os.system(f'pip install {_package} -qqq')

# Silence deprecation and future warnings.
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

file_path = "major-crime-indicators.csv"                                          # File name (with extension) inside the Kaggle dataset

# Load the dataset file from Kaggle into a pandas DataFrame.
crime_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "mohammadbadi/crimes-in-toronto",                                             # Dataset handle on Kaggle
    file_path,
)

def format_message(message):
    """Wrap *message* in the bold, 18px <div> used for the notebook's status banners.

    The message is interpolated as-is (no HTML escaping), so it may itself contain markup.
    Returns the HTML snippet as a string with a leading and trailing newline.
    """
    opening_tag = '<div style="font-size: 18px; color: #333; font-weight: bold; padding: 10px;">'
    pieces = ["", opening_tag, f"    {message}", "</div>", ""]
    return "\n".join(pieces)

# Banner confirming that the dataset was loaded.
load_message = format_message(
    "Dataset <span style='color: blue;'>major-crime-indicators.csv</span> "
    "by <span style='color: slategray;'>Mohammad Badi</span> "
    "from Kaggle website is <span style='color: green;'>Successfully</span> loaded!"
)
display(HTML(load_message))

# Keep a CSV copy of the loaded dataset in the current working directory.
crime_df.to_csv("major-crime-indicators.csv", index=False)

# Banner confirming that the copy was written.
save_message = format_message(
    "Dataset saved in <span style='color: blue;'>current workspace</span> "
    "<span style='color: green;'>Successfully!</span>"
)
display(HTML(save_message))

# Second name for the same DataFrame object (no copy is made).
major_crime_df = crime_df
def save_data(data_df, filename_base):
    """Write *data_df* to ``<filename_base>.csv`` and ``<filename_base>.xlsx``.

    Both files are written without the index; the Excel file uses the openpyxl engine.
    After each write a confirmation banner naming the file is displayed.
    """
    # (label shown in the banner, file extension, writer) -- CSV first, then Excel.
    writers = (
        ("CSV", "csv", lambda path: data_df.to_csv(path, index=False)),
        ("Excel", "xlsx", lambda path: data_df.to_excel(path, index=False, engine='openpyxl')),
    )
    for label, extension, write in writers:
        target = f"{filename_base}.{extension}"
        write(target)
        banner = format_message(f"Data saved as {label}: <span style='color: blue;'>{target}</span>")
        display(HTML(banner))

# Write the CSV and Excel copies that the read-time comparison below is run against.
save_data(data_df=major_crime_df, filename_base="Checking_Load_Time")

def measure_read_time(file_path, file_type):
    """Return the seconds pandas takes to read *file_path* once.

    Args:
        file_path: Path of the file to read.
        file_type: ``"csv"`` (read with ``pd.read_csv``) or ``"excel"``
            (read with ``pd.read_excel``).

    Returns:
        Elapsed time in seconds as a float.

    Raises:
        ValueError: If *file_type* is neither ``"csv"`` nor ``"excel"``. Without this
            check nothing would be read and a near-zero "read time" would be returned.
    """
    readers = {"csv": pd.read_csv, "excel": pd.read_excel}
    if file_type not in readers:
        raise ValueError(f"Unsupported file_type {file_type!r}; expected 'csv' or 'excel'.")
    reader = readers[file_type]

    # perf_counter is monotonic and high-resolution, unlike time.time(), which can
    # jump when the system clock is adjusted.
    start_time = time.perf_counter()
    reader(file_path)
    return time.perf_counter() - start_time

# Time one read of each saved copy.
csv_time = measure_read_time('Checking_Load_Time.csv', "csv")
excel_time = measure_read_time('Checking_Load_Time.xlsx', "excel")

# The strictly faster format is shown in green; the other one (or both, on a tie) in red.
csv_time_color = ("red", "green")[csv_time < excel_time]
excel_time_color = ("red", "green")[excel_time < csv_time]

# Build and show one timing banner per format.
csv_time_message = format_message(
    "Time taken to read <span style='color: blue;'>Checking_Load_Time CSV file</span>: "
    f"<span style='color: {csv_time_color};'>{csv_time:.2f} seconds</span>"
)
excel_time_message = format_message(
    "Time taken to read <span style='color: blue;'>Checking_Load_Time Excel file</span>: "
    f"<span style='color: {excel_time_color};'>{excel_time:.2f} seconds</span>"
)
for _timing_banner in (csv_time_message, excel_time_message):
    display(HTML(_timing_banner))

# Recommend the faster format; on a tie the Excel wording is used.
if csv_time < excel_time:
    _faster_format, _slower_format = "CSV", "Excel"
    speed_factor = excel_time / csv_time
else:
    _faster_format, _slower_format = "Excel", "CSV"
    speed_factor = csv_time / excel_time

recommendation = (
    f"Recommendation: Load the data from <span style='color: green;'>{_faster_format}</span> as it is approximately "
    f"<span style='color: green;'>{speed_factor:.2f} times faster</span> than loading from {_slower_format}."
)

recommendation_message = format_message(recommendation)
display(HTML(recommendation_message))

completion_message = format_message("Dataset has been analyzed, and recommendation has been provided!")
display(HTML(completion_message))


### **5.1 Learning Application Domain: Summary Table**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import os
import contextlib
from google.colab import files
from IPython.display import display, HTML
from kagglehub import KaggleDatasetAdapter

print("\n\n")

# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)                    # Ignore future warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)                    # Ignore deprecation warnings

file_path = "major-crime-indicators.csv"                                          # File name inside the Kaggle dataset

# Load the dataset while anything written to stdout is sent to os.devnull.
# NOTE(review): only stdout is redirected; the recorded cell output still shows a
# "Downloading from ..." line and a progress bar, so that output is presumably
# written elsewhere (e.g. stderr) -- confirm if it should be hidden as well.
with open(os.devnull, 'w') as fnull, contextlib.redirect_stdout(fnull):
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "mohammadbadi/crimes-in-toronto",                                         # Dataset handle on Kaggle
        file_path,
    )

def count_leading_trailing_spaces(column):
    """Count the values in *column* that start with, and that end with, a space.

    The column is converted to strings first, so non-string values are tested on their
    string form. Returns a ``(leading_count, trailing_count)`` tuple.
    """
    as_text = column.astype(str)
    leading_total = as_text.str.startswith(' ').sum()
    trailing_total = as_text.str.endswith(' ').sum()
    return leading_total, trailing_total

def prepare_summary_table(df):
    """Build a per-column summary of *df*.

    For every column the summary holds its dtype, total number of values (non-null plus
    null), number of unique values, number and percentage of nulls, and how many values
    start or end with a space.

    Returns:
        A ``(summary_table, null_counts, null_percentages)`` tuple: the summary
        DataFrame, the per-column null counts, and the unrounded null percentages.
    """
    null_counts = df.isnull().sum()
    non_null_counts = df.count()
    total_values = non_null_counts + null_counts                                  # Non-null plus null values per column
    unique_values = df.nunique()

    null_percentages = (null_counts / total_values) * 100                         # Share of nulls per column, in percent

    # One (leading, trailing) pair per column, split into two parallel tuples.
    space_counts = [count_leading_trailing_spaces(df[name]) for name in df.columns]
    leading_spaces, trailing_spaces = zip(*space_counts)

    table_columns = {
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": unique_values,
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),                                      # Rounded for display only
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces
    }
    summary_table = pd.DataFrame(table_columns)

    return summary_table, null_counts, null_percentages

def generate_summary_html(summary_table):
    """Render *summary_table* as a styled HTML table and return it as a string.

    One ``<tr>`` is emitted per row of *summary_table*. The "Null Values",
    "Leading Spaces" and "Trailing Spaces" cells are coloured red when the count is
    above zero and green otherwise. Values are inserted without HTML escaping.
    """
    def flag_color(count):
        # A positive count marks a data-quality issue.
        return "red" if count > 0 else "green"

    header_html = """
    <style>
        table {
            border-collapse: collapse;
            width: 100%;
            font-family: Arial, sans-serif;
        }
        table th, table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        table th {
            background-color: #4CAF50;
            color: white;
            font-size: 1.1em;
        }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """

    # Collect the fragments and join once at the end.
    fragments = [header_html]
    for _, record in summary_table.iterrows():
        null_count = record['Null Values']
        leading_count = record['Leading Spaces']
        trailing_count = record['Trailing Spaces']
        fragments.append(f"""
        <tr>
            <td>{record['Column']}</td>
            <td>{record['Data Type']}</td>
            <td>{record['Total Values']}</td>
            <td>{record['Unique Values']}</td>
            <td style='color:{flag_color(null_count)};'>{null_count}</td>
            <td>{record['Null %']}</td>
            <td style='color:{flag_color(leading_count)};'>{leading_count}</td>
            <td style='color:{flag_color(trailing_count)};'>{trailing_count}</td>
        </tr>
        """)
    fragments.append("</table>")
    return "".join(fragments)

def plot_missing_percentage(df, dataset_name):
    """Save a horizontal bar chart of the per-column missing-value percentage.

    Only columns that contain missing values are plotted. When no column has missing
    values a message is printed and an empty string is returned; otherwise the chart is
    saved as a PNG and its path is returned. *dataset_name* is used in the printed
    message and in the chart title; the output path is fixed.
    """
    null_share = df.isnull().mean() * 100                                         # Percentage of missing values per column
    columns_with_gaps = null_share[null_share > 0]

    if columns_with_gaps.empty:                                                   # Nothing to plot
        print(f"No missing data in {dataset_name}. Skipping missing percentage plot.")
        return ""

    missing_plot_path = '/content/5.1 Original Data Missing PercentPlot.png'      # Fixed output location

    plt.figure(figsize=(10, 6))
    ordered = columns_with_gaps.sort_values()                                     # Ascending order of missing percentage
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(missing_plot_path)
    plt.close()                                                                   # Close the figure after saving it

    return missing_plot_path

def get_suggestions(null_counts):
    """Turn per-column null counts into HTML suggestion snippets.

    *null_counts* is a Series indexed by column name. The result holds at most two
    ``<div>`` strings: a green one listing the columns without nulls (if any), followed
    by a red one listing the columns with nulls (if any).
    """
    clean_columns = null_counts[null_counts == 0].index.tolist()
    columns_with_null_data = null_counts[null_counts > 0].index.tolist()

    # (columns, message template) in display order; groups with no columns are skipped.
    groups = (
        (clean_columns,
         "<div style='color:green;'><strong>Columns {names}</strong> have no Null values. <em>No action is needed.</em></div>"),
        (columns_with_null_data,
         "<div style='color:red;'><strong>Columns {names}</strong> have Null values. <em>Actions should be taken.</em></div>"),
    )
    return [template.format(names=', '.join(columns)) for columns, template in groups if columns]

def display_summary_for_file(df, title):
    """Write the summary-table report for *df* and save its missing-data plot.

    The report (summary table plus suggestions) is written to a fixed HTML path;
    *title* appears in the page title and in the table heading. The missing-data plot
    is produced separately and is not embedded in the HTML.

    Returns:
        ``(summary_html_path, missing_plot_path)``; the second item is whatever
        ``plot_missing_percentage`` returned.
    """
    summary_table, null_counts, _ = prepare_summary_table(df)                     # Unrounded null percentages are not needed here
    summary_html = generate_summary_html(summary_table)

    suggestions = get_suggestions(null_counts)
    suggestions_html = "".join(suggestions)

    missing_plot_path = plot_missing_percentage(df, title)                        # Plot is saved to its own file

    # Only the table and the suggestions go into the HTML page.
    complete_html = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2>Summary Table for {title}</h2>
        {summary_html}
        <h2>Suggestions</h2>
        {suggestions_html}
    </body>
    </html>
    """

    summary_html_path = '/content/5.1 Learning Application Domain.html'           # Fixed output location
    with open(summary_html_path, 'w', encoding='utf-8') as report_file:
        report_file.write(complete_html)

    return summary_html_path, missing_plot_path

# Build the summary report and the missing-data plot for the loaded dataset.
html_file_path, missing_plot_path = display_summary_for_file(df, "Analysis on Original Data")

# Read the report back with the encoding it was written with (UTF-8) and close the
# file handle promptly instead of leaving it to the garbage collector.
with open(html_file_path, encoding='utf-8') as report_file:
    report_html = report_file.read()

# Show the summary table and suggestions inline.
display(HTML(f"<h2>Summary Table and Suggestions for Major Crime Indicators Dataset</h2><br>{report_html}"))

files.download(html_file_path)                                                    # Download the Summary Table + Suggestions as HTML
if missing_plot_path:                                                             # Empty string means no plot was produced
    files.download(missing_plot_path)                                             # Download the Missing Data Percentage plot as PNG
print("\n\n")




Downloading from https://www.kaggle.com/api/v1/datasets/download/mohammadbadi/crimes-in-toronto?dataset_version_number=1&file_name=major-crime-indicators.csv...


100%|██████████| 117M/117M [00:03<00:00, 33.2MB/s]


### **5.2 Creating Target Dataset**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import os
import contextlib
from google.colab import files
from IPython.display import display, HTML
from kagglehub import KaggleDatasetAdapter

print("\n\n")
warnings.filterwarnings("ignore", category=DeprecationWarning)                    # Ignore Deprecation Warnings
warnings.filterwarnings("ignore", category=FutureWarning)                         # Ignore future warnings

file_path = "major-crime-indicators.csv"                                          # File name (with extension) inside the Kaggle dataset

crime_df = kagglehub.load_dataset(                                                # Load the dataset file from Kaggle
    kagglehub.KaggleDatasetAdapter.PANDAS,
    "mohammadbadi/crimes-in-toronto",                                             # Dataset handle on Kaggle
    file_path,
)

# All filtering below runs on the frame loaded in this cell (crime_df), not on a `df`
# left over from another cell, so this cell also works when it is run on its own.
initial_count = crime_df.shape[0]

filter1_df = crime_df[(crime_df['UCR_CODE'] == 2135) & (crime_df['UCR_EXT'] == 210)].copy()   # Filter 1: UCR Code 2135 with UCR Extension 210
count1 = filter1_df.shape[0]

filter2_df = crime_df[(crime_df['UCR_CODE'] == 1610) & (crime_df['UCR_EXT'] == 140)].copy()   # Filter 2: UCR Code 1610 with UCR Extension 140
count2 = filter2_df.shape[0]

# The two filters test different UCR_CODE values, so no row can appear in both subsets.
final_df = pd.concat([filter1_df, filter2_df]).copy()                             # Final dataset: rows of filter 1 followed by rows of filter 2
final_count = final_df.shape[0]

final_df.to_csv('Target_Dataset.csv', index=False)                                # Save the final dataset as Target_Dataset.csv
files.download('Target_Dataset.csv')                                              # Download the saved dataset csv

# One dictionary per row of the report table, in display order.
steps_summary = [
    {
        "Step Taken": "Filter 1: UCR Code 2135 with UCR Extension 210",
        "Before Action": initial_count,
        "Affected by Action": count1,
        "After Action": count1,
        "Unit": "Rows"
    },
    {
        "Step Taken": "Filter 2: UCR Code 1610 with UCR Extension 140",
        "Before Action": count1,                                                  # Carries over the 'After Action' of Filter 1
        "Affected by Action": count2,
        "After Action": final_count,
        "Unit": "Rows"
    },
    {
        # Totals row: cells are pre-formatted HTML rather than plain numbers.
        "Step Taken": "Rows Affected in <strong>UCR Filtering</strong>",
        "Before Action": f"Initial Load:<br><strong>{initial_count}</strong>",
        "Affected by Action": f"Rows Filtered:<br><strong>{initial_count - final_count}</strong>",
        "After Action": f"Final Count:<br><strong>{final_count}</strong>",
        "Unit": "Rows"
    },
]

html_output_filename = '/content/5.2 Target Dataset.html'                         # Where the HTML report is written

# Collect the table fragments in a list and join them once at the end.
_html_parts = ["""
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="5" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
                5.2 Creating Target Dataset
            </th>
        </tr>
        <tr>
            <th>Step Taken</th>
            <th>Before Action</th>
            <th>Affected by Action</th>
            <th>After Action</th>
            <th>Unit</th>
        </tr>
    </thead>
    <tbody>
"""]

# One table row per recorded step.
for step in steps_summary:
    _html_parts.append(f"""
    <tr style='border: 1px solid #dddddd;'>
        <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Step Taken']}</td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Before Action']}</td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Affected by Action']}</td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>{step['After Action']}</td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Unit']}</td>
    </tr>
    """)

# Closing note, rendered as a final row spanning all five columns.
note_text = "".join([
    "<strong>Note: The dataset contains ",
    "<span style='color: darkred; '>ALL CRIMES</span>, but our research focuses on ",
    "<span style='color: green; '>MOTOR VEHICLE THEFTS</span>. ",
    "Therefore, we applied two filters: <br>",
    "• Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and <br>",
    "• Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking.<br>",
    "The target dataset has been saved as <span style='color: blue;'>'Target_Dataset.csv'</span> for further analysis. </strong>",
])
_html_parts.append(f"""
    <tr style='border: 1px solid #dddddd;'>
        <td colspan="5" style='border: 1px solid #dddddd; padding: 8px;'>{note_text}</td>
    </tr>
""")
_html_parts.append("</tbody></table>")

html_table = "".join(_html_parts)

print("\n\n")
display(HTML(html_table))                                                         # Show the table inline

with open(html_output_filename, 'w', encoding='utf-8') as report_file:            # Save the same HTML to disk
    report_file.write(html_table)
files.download(html_output_filename)                                              # Download the saved report
print("\n\n")

### **5.3 Data Cleaning**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
from IPython.display import display, HTML
from google.colab import files

# Silence deprecation and future warnings.
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

# The target dataset is read from the copy hosted in the project's GitHub repository.
url = (
    "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/"
    "refs/heads/main/Output_CSV/Target_Dataset.csv"
)
Data_Preparing_df = pd.read_csv(url, low_memory=False).copy()

print("\n\n")

html_output_filename = '/content/5.3 Data Cleaning.html'                          # Where the HTML summary of this section is written

# Running log of the cleaning steps; each entry becomes one row of the report table.
before_step_1 = len(Data_Preparing_df)                                            # Step 1: Dataset Loading
steps_summary = [{
    "Step Taken": "Step 1: Load Dataset",
    "Before Action": before_step_1,
    "Affected by Action": 0,
    "After Action": before_step_1,
    "Unit": "Rows"
}]

# Step 2: Remove true duplicates, i.e. rows equal in every column except '_id'.
columns_to_check = [name for name in Data_Preparing_df.columns if name != '_id']
rows_before_dedup = len(Data_Preparing_df)
duplicate_count = Data_Preparing_df.duplicated(subset=columns_to_check).sum()     # Counted before the rows are dropped
Data_Preparing_df = Data_Preparing_df.drop_duplicates(subset=columns_to_check, keep='first').copy()
rows_after_dedup = len(Data_Preparing_df)
steps_summary.append({
    "Step Taken": "Step 2: Remove TRUE DUPLICATE Records",
    "Before Action": rows_before_dedup,
    "Affected by Action": duplicate_count,
    "After Action": rows_after_dedup,
    "Unit": "Rows"
})

# Step 3: Drop every row that has a missing value in any column.
before_drop_rows = len(Data_Preparing_df)
Data_Preparing_df = Data_Preparing_df.dropna().copy()
after_drop_rows = len(Data_Preparing_df)
steps_summary.append({
    "Step Taken": "Step 3: Drop Rows with Missing Data",
    "Before Action": before_drop_rows,
    "Affected by Action": before_drop_rows - after_drop_rows,
    "After Action": after_drop_rows,
    "Unit": "Rows"
})

# Step 4: Strip leading and trailing whitespace in all object-dtype columns.
obj_cols = Data_Preparing_df.select_dtypes(include="object").columns
orig_step4 = Data_Preparing_df[obj_cols].copy()                                   # Snapshot used to count changed rows
Data_Preparing_df[obj_cols] = orig_step4.apply(lambda text_col: text_col.str.strip())
affected_step4 = orig_step4.ne(Data_Preparing_df[obj_cols]).any(axis=1).sum()     # Rows where at least one cell changed
steps_summary.append({
    "Step Taken": "Step 4: Strip Leading/Trailing Spaces",
    "Before Action": after_drop_rows,
    "Affected by Action": affected_step4,
    "After Action": after_drop_rows,
    "Unit": "Rows"
})

# Step 5: Remove leading apostrophes in the same object-dtype columns.
orig_step5 = Data_Preparing_df[obj_cols].copy()
Data_Preparing_df[obj_cols] = orig_step5.apply(lambda text_col: text_col.str.lstrip("'"))
affected_step5 = orig_step5.ne(Data_Preparing_df[obj_cols]).any(axis=1).sum()
steps_summary.append({
    "Step Taken": "Step 5: Remove Leading Apostrophes",
    "Before Action": after_drop_rows,
    "Affected by Action": affected_step5,
    "After Action": after_drop_rows,
    "Unit": "Rows"
})

# Step 6: For rows whose 'HOOD_158' is 'NSA', borrow the neighbourhood of another row
# that has exactly the same coordinates and a known (non-'NSA') 'HOOD_158'.
coordinate_columns = ['LONG_WGS84', 'LAT_WGS84']
nsa_mask = Data_Preparing_df['HOOD_158'] == 'NSA'

# Build the lookup once: coordinates -> (HOOD_158, NEIGHBOURHOOD_158) of the FIRST known
# row at that location, instead of re-scanning the whole frame for every 'NSA' row.
known_by_coordinates = {}
known_rows = Data_Preparing_df.loc[~nsa_mask, coordinate_columns + ['HOOD_158', 'NEIGHBOURHOOD_158']]
for longitude, latitude, hood, neighbourhood in known_rows.itertuples(index=False, name=None):
    known_by_coordinates.setdefault((longitude, latitude), (hood, neighbourhood))

nsa_replaced_count = 0
nsa_rows = Data_Preparing_df.loc[nsa_mask, coordinate_columns]
for i, longitude, latitude in nsa_rows.itertuples(index=True, name=None):
    known = known_by_coordinates.get((longitude, latitude))
    if known is None:
        continue                                                                  # No known row at these coordinates
    Data_Preparing_df.loc[i, 'HOOD_158'] = known[0]
    # Fill the neighbourhood name too: Step 7 drops any row whose 'NEIGHBOURHOOD_158'
    # is still 'NSA', which would otherwise discard the row that was just repaired.
    if Data_Preparing_df.loc[i, 'NEIGHBOURHOOD_158'] == 'NSA':
        Data_Preparing_df.loc[i, 'NEIGHBOURHOOD_158'] = known[1]
    nsa_replaced_count += 1
steps_summary.append({
    "Step Taken": "Step 6: Match & Replace 'NSA' Values",
    "Before Action": after_drop_rows,
    "Affected by Action": nsa_replaced_count,
    "After Action": after_drop_rows,
    "Unit": "Rows"
})

# Step 7: Remove the rows where 'HOOD_158' or 'NEIGHBOURHOOD_158' is still 'NSA'.
mask_remaining_nsa = (Data_Preparing_df['HOOD_158'] == 'NSA') | (Data_Preparing_df['NEIGHBOURHOOD_158'] == 'NSA')
remaining_nsa_count = mask_remaining_nsa.sum()
before_removal = Data_Preparing_df.shape[0]
Data_Preparing_df = Data_Preparing_df[~mask_remaining_nsa].copy()
after_removal = Data_Preparing_df.shape[0]
steps_summary.append({
    "Step Taken": "Step 7: Remove Remaining 'NSA' Rows",
    "Before Action": before_removal,
    "Affected by Action": remaining_nsa_count,
    "After Action": after_removal,
    "Unit": "Rows"
})

# Step 8: Format Longitude & Latitude as text with exactly 7 decimal places.
orig_long = Data_Preparing_df['LONG_WGS84'].copy()
orig_lat = Data_Preparing_df['LAT_WGS84'].copy()
formatted_long = orig_long.astype(float).map(lambda x: f"{x:.7f}")
formatted_lat = orig_lat.astype(float).map(lambda x: f"{x:.7f}")

# Replace the columns outright. Writing strings into the existing float columns through
# `.loc[:, col] = ...` is an incompatible-dtype set that pandas has deprecated.
Data_Preparing_df['LONG_WGS84'] = formatted_long
Data_Preparing_df['LAT_WGS84'] = formatted_lat

# Count the rows whose coordinate VALUE changed. Comparing the original floats with the
# formatted strings would be unequal for every row, so compare them numerically.
affected_step8 = (
    (orig_long.astype(float) != formatted_long.astype(float))
    | (orig_lat.astype(float) != formatted_lat.astype(float))
).sum()
steps_summary.append({
    "Step Taken": "Step 8: Format Longitude & Latitude to 7 Decimals",
    "Before Action": after_removal,
    "Affected by Action": affected_step8,
    "After Action": after_removal,
    "Unit": "Rows"
})

# Final row: overall effect of section 5.3 (cells are pre-formatted HTML).
final_row_count = Data_Preparing_df.shape[0]
steps_summary.append({
    "Step Taken": "Rows Affected in <strong>5.3 Data Cleaning </strong>",
    "Before Action": "Initial Load:<br><strong>" + str(before_step_1) + "</strong>",
    "Affected by Action": "Overall Reduction:<br><strong>" + str(before_step_1 - final_row_count) + "</strong>",
    "After Action": "Final Count:<br><strong>" + str(final_row_count) + "</strong>",
    "Unit": "Rows"
})
                                                                                  # Build HTML Table with styling
# Collect the table fragments in a list and join them once at the end.
_html_parts = ["""
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="5" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
                5.3 Data Cleaning
            </th>
        </tr>
        <tr>
            <th>Step Taken</th>
            <th>Before Action</th>
            <th>Affected by Action</th>
            <th>After Action</th>
            <th>Unit</th>
        </tr>
    </thead>
    <tbody>
"""]

# One table row per recorded cleaning step.
for step in steps_summary:
    _html_parts.append(f"""
        <tr style='border: 1px solid #dddddd;'>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Step Taken']}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Before Action']}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Affected by Action']}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step['After Action']}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step['Unit']}</td>
        </tr>
    """)

# Closing note, rendered as a final row spanning all five columns.
note_text = "".join([
    "<strong>Note: ",
    "Longitude and Latitude were reduced to 7 Decimal Places as,<br>",
    "• 7 decimal places offer precision of <span style='color: green;'>1.1 cm</span>, precise enough for GPS devices.<br>",
    "• Further granularity adds processing time and energy consumption without real-world benefits.<br>",
    "The final cleaned data has been saved as <span style='color: blue;'> 'Cleaned_Data.csv' </span> for further analysis.</strong>",
])
_html_parts.append(f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="5" style='border: 1px solid #dddddd; padding: 8px;'>{note_text}</td>
        </tr>
    </tbody>
</table>
""")

html_table = "".join(_html_parts)

Data_Preparing_df.to_csv('Cleaned_Data.csv', index=False)                         # Write the cleaned data next to the notebook

display(HTML(html_table))                                                         # Show the table inline

with open(html_output_filename, 'w', encoding='utf-8') as report_file:            # Save the same HTML to disk
    report_file.write(html_table)

# Download the report first, then the cleaned CSV.
for _download_target in (html_output_filename, 'Cleaned_Data.csv'):
    files.download(_download_target)
print("\n\n")

### **5.4 Data Reduction and Projection - a) Feature Engineering, b) EDA, c) Feature Engineering & d) Data Reduction**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from IPython.display import display, HTML
from google.colab import files

print("\n\n")
warnings.filterwarnings("ignore", category=DeprecationWarning)                    # Ignore Deprecation Warnings
warnings.filterwarnings("ignore", category=FutureWarning)                         # Ignore future warnings

# Cleaned dataset published to the repository; both date columns are parsed
# on load so they can be subtracted directly.
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Cleaned_Data.csv"
df = pd.read_csv(url, parse_dates=['REPORT_DATE', 'OCC_DATE'], low_memory=False)

# Whole-day gap between occurrence and report; keep only delays of 0..365 days.
df['DAYS_DIFFERENCE'] = (df['REPORT_DATE'] - df['OCC_DATE']).dt.days
valid_delay = (df['DAYS_DIFFERENCE'] >= 0) & (df['DAYS_DIFFERENCE'] <= 365)
df = df[valid_delay]

# Number of incidents per delay value, ordered by delay.
day_diff_counts = df['DAYS_DIFFERENCE'].value_counts().reset_index()
day_diff_counts.columns = ['Day #', 'Reported number']
day_diff_counts = day_diff_counts.sort_values(by='Day #')

# The table shows days 0..89 (90 days -> 30 rows x 3 column groups); days
# with no incidents are filled with 0.
days_range = pd.DataFrame({'Day #': range(0, 90)})
table_df = pd.merge(days_range, day_diff_counts, on='Day #', how='left')
table_df['Reported number'] = table_df['Reported number'].fillna(0).astype(int)

# Percentages are taken against ALL retained incidents (0..365 days), not
# just the 90 displayed days. Using the displayed subtotal made the
# cumulative column reach 100% at day 89 and disagree with the chart below,
# which uses the full total.
total_incidents = day_diff_counts['Reported number'].sum()
table_df['% Report'] = table_df['Reported number'] / total_incidents * 100
table_df['Total reported until now'] = table_df['Reported number'].cumsum()
table_df['Total % Reported'] = table_df['Total reported until now'] / total_incidents * 100

# Convert every numeric column to right-justified display text.
table_df['Reported number'] = table_df['Reported number'].map(lambda x: f"{x:>10d}")
table_df['% Report'] = table_df['% Report'].map(lambda x: f"{x:>6.2f}%")
table_df['Total reported until now'] = table_df['Total reported until now'].map(lambda x: f"{x:>10d}")
table_df['Total % Reported'] = table_df['Total % Reported'].map(lambda x: f"{x:>6.2f}%")

# Lay the per-day records out column-major: column group 0 holds the first
# n_rows days, group 1 the next n_rows, and so on (30 rows when table_df has
# 90 rows).
n_cols = 3
n_rows = len(table_df) // n_cols

records = table_df.to_dict('records')
column_groups = [records[start * n_rows:(start + 1) * n_rows] for start in range(n_cols)]
# Transpose so each grid entry is one display row with n_cols records.
grid = [list(display_row) for display_row in zip(*column_groups)]

                                                                                  # Build the HTML table.
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
  <thead>
    <tr>
      <th colspan="15" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white; padding: 8px;">
        Distribution of Reporting Delays (30 rows x 3 columns)
      </th>
    </tr>
    <tr style='background-color: #4CAF50; color: white;'>
      <th><strong>Day #</strong></th>
      <th>Reported number</th>
      <th>% Report</th>
      <th>Total reported until now</th>
      <th>Total % Reported</th>
      <th style='border-left: 4px solid #dddddd;'><strong>Day #</strong></th>
      <th>Reported number</th>
      <th>% Report</th>
      <th>Total reported until now</th>
      <th>Total % Reported</th>
      <th style='border-left: 4px solid #dddddd;'><strong>Day #</strong></th>
      <th>Reported number</th>
      <th>% Report</th>
      <th>Total reported until now</th>
      <th>Total % Reported</th>
    </tr>
  </thead>
  <tbody>
"""

# Shared cell styling; the "Day #" cell of the 2nd and 3rd column groups gets
# an extra thick left border to separate the groups visually.
base_style = "border: 1px solid #dddddd; padding: 8px; text-align: right;"
value_keys = ('Reported number', '% Report', 'Total reported until now', 'Total % Reported')

body_parts = []
for row in grid:
    body_parts.append("<tr>")
    for j, record in enumerate(row):
        day_style = base_style + (" border-left: 4px solid #dddddd; " if j > 0 else " ")
        # Day number is rendered in bold, followed by its four value cells.
        body_parts.append(f"<td style='{day_style}'><strong>{record['Day #']}</strong></td>")
        body_parts.extend(f"<td style='{base_style}'>{record[key]}</td>" for key in value_keys)
    body_parts.append("</tr>")

html_table += "".join(body_parts)
html_table += """
  </tbody>
</table>
"""

display(HTML(html_table))


# Recount incidents per reporting delay under chart-specific column names.
day_diff_counts = df['DAYS_DIFFERENCE'].value_counts().reset_index()
day_diff_counts.columns = ['Days Difference', 'Occurrences']
day_diff_counts = day_diff_counts.sort_values(by='Days Difference')

# Only delays up to 50 days are drawn in the first chart.
day_diff_filtered = day_diff_counts[day_diff_counts['Days Difference'] <= 50].copy()

# First chart: bar plot of the first 50 days.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Days Difference', y='Occurrences', data=day_diff_filtered, palette='viridis', ax=ax)
ax.set_title('Distribution of Days Difference (First 50 Days)', fontsize=16)
ax.set_xlabel('Days Difference', fontsize=14)
ax.set_ylabel('Occurrences', fontsize=14)

# barplot treats x as categorical and draws the bars at positions 0..n-1, so
# ticks are placed on those positions rather than on the day values
# themselves (which would misalign if any delay value had no incidents).
# The x-limits span every bar so the last one is not clipped.
bar_positions = list(range(len(day_diff_filtered)))
ax.set_xlim(-1, len(bar_positions))
ax.set_xticks(bar_positions)
ax.set_xticklabels(day_diff_filtered["Days Difference"], rotation=45)

# Same-day / next-day totals are taken from the complete (0..365 day) counts.
# Summing an empty selection yields 0, so no separate emptiness check is needed.
total_incidents = day_diff_counts['Occurrences'].sum()
zero_day_count = day_diff_counts.loc[day_diff_counts['Days Difference'] == 0, 'Occurrences'].sum()
one_day_count = day_diff_counts.loc[day_diff_counts['Days Difference'] == 1, 'Occurrences'].sum()
total_zero_one = zero_day_count + one_day_count

# Percentages of the complete total, guarded against an empty dataset.
percent_zero = (zero_day_count / total_incidents) * 100 if total_incidents > 0 else 0
percent_one = (one_day_count / total_incidents) * 100 if total_incidents > 0 else 0
percent_total = (total_zero_one / total_incidents) * 100 if total_incidents > 0 else 0

# Three-line annotation; the percentages are wrapped in mathtext separately
# so that only those values are rendered bold.
explanation_text = (
    f"crimes reported by same Day: {zero_day_count} which is " +
    "$\\mathbf{" + f"{percent_zero:.2f}" + "\\%}$" + "\n\n" +
    f"crimes reported within 1 Day: {one_day_count} which is " +
    "$\\mathbf{" + f"{percent_one:.2f}" + "\\%}$" + "\n\n" +
    f"Total crimes reported within 1 day: {total_zero_one} which is " +
    "$\\mathbf{" + f"{percent_total:.2f}" + "\\%}$"
)

# Place the annotation inside the axes, top-right, in axes coordinates.
ax.text(0.97, 0.96, explanation_text, transform=ax.transAxes, ha='right', va='top',
        fontsize=12, bbox=dict(facecolor='white', alpha=0.7))

# Save, show and download the chart.
bar_plot_filename = '/content/5.4 EDA_First_50days.png'
plt.savefig(bar_plot_filename, bbox_inches='tight', dpi=300)
plt.show()
files.download(bar_plot_filename)

print("\n\n")
# Fresh, unfiltered load of the cleaned data for the reduction pipeline.
Data_Processing_df = pd.read_csv(url, low_memory=False)

html_output_filename = "/content/5.4 Data Reduction and Projection.html"

# Step 1: record the dimensions of the dataset as loaded.
before_step_1, before_columns = Data_Processing_df.shape
steps_summary = [{
    "Step Taken": "Step 1: Load Dataset",
    "Before Action": before_step_1,
    "Affected by Action": "",
    "After Action": before_step_1,
    "Unit": "Rows"
}]

# Row count after loading; reference value for the following steps.
current_rows = len(Data_Processing_df)

# Step 2: combine each date column with its hour column into one datetime
# (REPORT_DATETIME first, then OCC_DATETIME).
for prefix in ('REPORT', 'OCC'):
    event_date = pd.to_datetime(Data_Processing_df[f'{prefix}_DATE'])
    hour_offset = pd.to_timedelta(Data_Processing_df[f'{prefix}_HOUR'], unit='h')
    Data_Processing_df[f'{prefix}_DATETIME'] = event_date + hour_offset

rows_after_datetime = len(Data_Processing_df)                                     # Adding columns leaves the row count unchanged
steps_summary.append({
    "Step Taken": "Step 2: Feature Engineering - Join the Date & Time",
    "Before Action": current_rows,
    "Affected by Action": "New Columns Added",
    "After Action": rows_after_datetime,
    "Unit": "Rows"
})

# Step 3: reporting delay split into whole days plus the remaining full hours.
reporting_timedelta = Data_Processing_df['REPORT_DATETIME'] - Data_Processing_df['OCC_DATETIME']
Data_Processing_df['reporting_delay_days'] = reporting_timedelta.dt.days
Data_Processing_df['reporting_delay_hours'] = reporting_timedelta.dt.seconds // 3600

rows_after_delay_calc = len(Data_Processing_df)                                   # Still no rows removed at this point
steps_summary.append({
    "Step Taken": "Step 3: Feature Engineering: Compute Reporting Delay in Days & Hours",
    "Before Action": rows_after_datetime,
    "Affected by Action": "New Columns Added",
    "After Action": rows_after_delay_calc,
    "Unit": "Rows"
})

# Step 4: keep only records reported within MAX_REPORTING_DELAY_DAYS whole
# days of the occurrence. The step label is built from the same constant so
# the report always states the window that was actually applied (it
# previously said "0-60 Days" while the filter kept 0-1 days).
MAX_REPORTING_DELAY_DAYS = 1

before_delay_filter = Data_Processing_df.shape[0]
Data_Processing_df = Data_Processing_df[
    (Data_Processing_df['reporting_delay_days'] >= 0) &
    (Data_Processing_df['reporting_delay_days'] <= MAX_REPORTING_DELAY_DAYS)
].copy()
after_delay_filter = Data_Processing_df.shape[0]
delay_filtered_rows = before_delay_filter - after_delay_filter                    # Rows removed by the delay filter

steps_summary.append({
    "Step Taken": f"Step 4: Only Keep Complaints with 0-{MAX_REPORTING_DELAY_DAYS} Days Delay",
    "Before Action": before_delay_filter,
    "Affected by Action": delay_filtered_rows,
    "After Action": after_delay_filter,
    "Unit": "Rows"
})

# Step 5: drop columns that are not needed downstream.
columns_to_drop = [
    'REPORT_YEAR', 'REPORT_MONTH', 'REPORT_DAY', 'REPORT_DOY', 'REPORT_DOW',
    'HOOD_140', 'NEIGHBOURHOOD_140', 'UCR_CODE', 'UCR_EXT', 'OFFENCE', 'MCI_CATEGORY',
    'REPORT_DATE', 'OCC_DATE', 'REPORT_HOUR', 'REPORT_DATETIME',
]
columns_before_drop = Data_Processing_df.shape[1]

# Only names actually present are dropped, so a missing column is not an error.
available_columns = Data_Processing_df.columns
dropped_column_names = [name for name in columns_to_drop if name in available_columns]

Data_Processing_df = Data_Processing_df.drop(columns=dropped_column_names)
columns_after_drop = Data_Processing_df.shape[1]
dropped_columns = columns_before_drop - columns_after_drop

# Save the reduced dataset for the following sections.
Data_Processing_df.to_csv('Final_Data.csv', index=False)

steps_summary.extend([
    {
        "Step Taken": "Step 5: Dropped Unnecessary Columns",
        "Before Action": columns_before_drop,
        "Affected by Action": dropped_columns,
        "After Action": columns_after_drop,
        "Unit": "Columns"
    },
    {   # Overall summary: rows from initial load to final count
        "Step Taken": "Rows Affected in <strong>5.4 Data Reduction and Projection</strong>",
        "Before Action": f"Initial Load:<br><strong>{before_step_1}</strong>",
        "Affected by Action": f"Rows Filtered:<br><strong>{before_step_1 - after_delay_filter}</strong>",
        "After Action": f"Final Count: <br><strong>{after_delay_filter}</strong>",
        "Unit": "Rows"
    },
    {   # Overall summary: columns from initial load to final count
        "Step Taken": "Columns Affected in <strong>5.4 Data Reduction and Projection</strong>",
        "Before Action": f"Initial Load:<br><strong>{before_columns}</strong>",
        "Affected by Action": f"Columns Dropped:<br><strong>{dropped_columns}</strong>",
        "After Action": f"Final Count: <br><strong>{columns_after_drop}</strong>",
        "Unit": "Columns"
    },
])
                                                                                  # Create HTML Table with styling
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="5" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
                <strong>5.4 Data Reduction and Projection</strong>
            </th>
        </tr>
        <tr>
            <th>Step Taken</th>
            <th>Before Action</th>
            <th>Affected by Action</th>
            <th>After Action</th>
            <th>Unit</th>
        </tr>
    </thead>
    <tbody>
"""

# One table row per recorded step; the cells follow the header column order.
column_keys = ("Step Taken", "Before Action", "Affected by Action", "After Action", "Unit")
cell_template = "        <td style='border: 1px solid #dddddd; padding: 8px;'>{}</td>\n"
row_chunks = []
for step in steps_summary:
    cells = "".join(cell_template.format(step[key]) for key in column_keys)
    row_chunks.append("\n    <tr style='border: 1px solid #dddddd;'>\n" + cells + "    </tr>\n    ")
html_table += "".join(row_chunks)
html_table += "</tbody></table>"

# Closing message naming the CSV produced by this section.
final_message = """
<div style="font-size: 18px; color: #333; font-weight: bold; padding: 10px;">
    The final data after Data Reduction and Projection has been saved as <span style="color: blue;">'Final_Data.csv'</span> for further analysis.
</div>
"""

# Show the table and the message inline.
for fragment in (html_table, final_message):
    display(HTML(fragment))

# Write both fragments, table first, to the HTML report.
with open(html_output_filename, 'w', encoding='utf-8') as f:
    f.write(html_table + final_message)

# Download the HTML report, then the CSV.
for downloadable in (html_output_filename, 'Final_Data.csv'):
    files.download(downloadable)
print("\n\n")

### **5.4 Data Reduction and Projection - e) EDA**

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from scipy import stats
from IPython.display import display, HTML
from google.colab import files
import os

                                                                                  # Ignore specific warnings
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Reduced dataset produced by the previous section, read from the repository.
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Final_Data.csv"
original_df = pd.read_csv(url, low_memory=False)

# Each visualisation works on its own frame so no plot can affect another.
# Identifier-like columns are removed where they would only add noise.
identifier_columns = ['_id', 'EVENT_UNIQUE_ID', 'HOOD_158']
df_box = original_df.drop(columns=identifier_columns)                             # Box plots of numerical features
df_line = original_df.copy()                                                      # Missing-values line chart
df_bar = original_df.drop(columns=identifier_columns)                             # Incidents per year / month / weekday
df_cat = df_bar.copy()                                                            # Categorical distributions
df_geo = original_df.copy()                                                       # Geographic scatter
df_corr = original_df.copy()                                                      # Correlation heatmap
                                                                                  # 1) Box Plot of Numerical Features
print("Generating Box Plot for Numerical Features...")
numerical_columns = df_box.select_dtypes(include=['float64', 'int64']).columns

# Four plots per row; add one more row when the columns do not divide evenly.
ncols = 4
full_rows, leftover = divmod(len(numerical_columns), ncols)
nrows = full_rows + (1 if leftover else 0)

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 2.5 * nrows))
axes = axes.flatten()

# One horizontal box plot per numerical column.
for panel, col in zip(axes, numerical_columns):
    sns.boxplot(x=df_box[col], ax=panel, color="#8A0000")
    panel.set_title(f"Box Plot: {col}")
    panel.tick_params(axis='x', rotation=45)

# Remove the grid cells that did not receive a plot.
for spare_panel in axes[len(numerical_columns):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
box_plot_filepath = "EDA_Boxplot_Numerical_Features.png"
plt.savefig(box_plot_filepath)
plt.show()
files.download(box_plot_filepath)
                                                                                  # 2) Line Chart: Percentage of Missing Values per Column
print("Generating Line Chart for Missing Values per Column...")
# Share of null entries in every column, expressed as a percentage.
null_share = df_line.isnull().mean()
missing_percent = null_share * 100

plt.figure(figsize=(10, 3))
missing_percent.plot(kind='line', color='b', marker='o', linestyle='-', linewidth=2)
plt.axhline(0, color='black', linestyle='--')                                     # Dashed baseline at 0 %

# Titles, axis labels and grid.
plt.title("Percentage of Missing Values per Column", fontsize=16)
plt.xlabel("Columns", fontsize=14)
plt.ylabel("Missing Value (%)", fontsize=12)
plt.xticks(rotation=40)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()

# Save, show and download the chart.
missing_values_chart_path = "EDA_Missing_Values.png"
plt.savefig(missing_values_chart_path)
plt.show()
files.download(missing_values_chart_path)
                                                                                  # 3) Bar Plot: Incidents per Year, Month, Day of the Week
print("Generating Bar Plots for Incidents per Year, Month, and Day of the Week...")
fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))

# Years as integers; month and weekday names shortened to their first three
# letters and made ordered categoricals so the bars appear in calendar order.
df_bar['OCC_YEAR'] = df_bar['OCC_YEAR'].astype(int)
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
dow_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for column_name, ordering in (('OCC_MONTH', month_order), ('OCC_DOW', dow_order)):
    abbreviated = df_bar[column_name].str[:3]
    df_bar[column_name] = pd.Categorical(abbreviated, categories=ordering, ordered=True)

# One count plot per time dimension, left to right.
panels = (
    ('OCC_YEAR', "Incidents Per Year"),
    ('OCC_MONTH', "Incidents Per Month"),
    ('OCC_DOW', "Incidents Per Day of the Week"),
)
for ax, (column_name, panel_title) in zip(axes, panels):
    sns.countplot(x=column_name, data=df_bar, palette='coolwarm', ax=ax).set_title(panel_title)
    ax.tick_params(axis='x', rotation=45)

incidents_path = "EDA_Incidents_Per_Year_Month_DayOfWeek.png"
plt.tight_layout()
plt.savefig(incidents_path)
plt.show()
files.download(incidents_path)
                                                                                  # 4) Bar Plot: Distribution of Categorical Features
                                                                                  # (DIVISION, LOCATION_TYPE, PREMISES_TYPE, NEIGHBOURHOOD_158)
print("Generating Distribution Plots for Categorical Features...")
categorical_columns = ['DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE', 'NEIGHBOURHOOD_158']

# Two plots per row.
panel_rows = len(categorical_columns) // 2
fig, axes = plt.subplots(panel_rows, 2, figsize=(15, 5 * panel_rows))
axes = axes.flatten()

for col, panel in zip(categorical_columns, axes):
    category_counts = df_cat[col].value_counts()

    # Full distribution goes to the text output ...
    print(f"\n===== {col} Distribution =====\n")
    print(category_counts.to_string())
    print("\n" + "="*40 + "\n")

    # ... while the chart shows only the ten most frequent categories.
    top_categories = category_counts.nlargest(10)
    sns.countplot(y=df_cat[col], order=top_categories.index, palette="viridis", ax=panel)
    panel.set_title(f"Distribution of {col}")

    # Truncate long category names to 15 characters on the axis.
    labels = [tick_label.get_text()[:15] for tick_label in panel.get_yticklabels()]
    panel.set_yticklabels(labels)

plt.tight_layout()
categorical_dist_path = "EDA_Categorical_Distribution.png"
plt.savefig(categorical_dist_path)
plt.show()
files.download(categorical_dist_path)
                                                                                  # 5) Colour Coded Scatter Plot (Geographic Distribution)
print("Generating Colour Coded Scatter Plot for Geographic Distribution...")

# Text key "<longitude>, <latitude>" (7 decimals) identifying each location.
longitude_text = df_geo['LONG_WGS84'].round(7).astype(str)
latitude_text = df_geo['LAT_WGS84'].round(7).astype(str)
df_geo['Geo_Location'] = longitude_text + ", " + latitude_text

# Number of incidents sharing each location key, attached to every row.
location_counts = df_geo['Geo_Location'].value_counts()
df_geo['Location_Frequency'] = df_geo['Geo_Location'].map(location_counts)

# Green -> yellow -> red colour map, binned at the frequency boundaries below.
boundaries = [0, 30, 250, 350]
colors = ['#006400', '#FFFF00', '#FF6666', '#8B0000']
cmap = LinearSegmentedColormap.from_list("custom_green_yellow_red", colors, N=256)
norm = BoundaryNorm(boundaries, cmap.N)

# Points are coloured and sized by how often their location occurs.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='LONG_WGS84', y='LAT_WGS84', data=df_geo,
    hue='Location_Frequency', palette=cmap, hue_norm=norm,
    size='Location_Frequency', sizes=(20, 200),
    alpha=0.6, legend=None, ax=ax,
)
ax.set_title("Geographic Distribution of Incidents (Color Coded by Frequency)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# The legend is disabled above; a colour bar built from the same cmap/norm
# is shown instead.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
fig.colorbar(sm, ax=ax, label="Frequency of Incidents")
plt.tight_layout()

geo_plot_filepath = "Geographic_Scatter_Plot.png"
plt.savefig(geo_plot_filepath, dpi=300, bbox_inches='tight')
plt.show()
files.download(geo_plot_filepath)

# Text listing of the ten busiest locations.
print("\n===== Top 10 Most Frequent Incident Locations =====\n")
print(location_counts.head(10).to_string())
print("="*40)

                                                                                  # 6) Correlation Heatmap
print("Generating Correlation Heatmap...")
df_corr_clean = df_corr.drop(columns=['_id', 'EVENT_UNIQUE_ID', 'reporting_delay_days', 'HOOD_158'])

# Cast month and weekday names to (unordered) categoricals with fixed levels.
category_levels = {
    'OCC_MONTH': ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                  'September', 'October', 'November', 'December'],
    'OCC_DOW': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
}
for column_name, levels in category_levels.items():
    df_corr_clean[column_name] = pd.Categorical(df_corr_clean[column_name], categories=levels, ordered=False)

# NOTE(review): the selection below keeps numeric dtypes only, so the two
# categorical columns above do not appear in the correlation matrix —
# confirm whether they were meant to be included (e.g. via their codes).
df_numerical = df_corr_clean.select_dtypes(include=[np.number])
correlation_matrix = df_numerical.corr()

print("\n===== Correlation Matrix =====")
print(correlation_matrix)
print("="*40)

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, cbar=True)
plt.title("Correlation Matrix (Excluding Specific Columns)")
plt.tight_layout()

corr_plot_filepath = "Correlation_Heatmap.png"
plt.savefig(corr_plot_filepath, dpi=300, bbox_inches='tight')
plt.show()
files.download(corr_plot_filepath)


### **5.4.3 Feature Engineering - Approach_3**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from google.colab import files
from IPython.display import display, HTML

                                                                                  # URL of the Dataset
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Final_Data.csv"

# Download the dataset once (it was previously fetched twice, the second time
# outside the error handler). On failure the error is shown and re-raised so
# the cell stops here: under a notebook kernel exit() only requests a kernel
# shutdown and does not halt the cell, which could let the code below run
# against a stale Data_Preparing_df left over from an earlier section.
try:
    Data_Preparing_df = pd.read_csv(url)
except Exception as e:
    display(HTML(f"<p style='color: red; font-size: 16px; font-weight: bold;'>Error loading data: {e}</p>"))
    raise

display(HTML("<p style='color: green; font-size: 16px; font-weight: bold;'>Data loaded successfully.</p>"))

# -------------------- Feature Engineering --------------------
# Column count before any engineered column is added
initial_cols_count = Data_Preparing_df.shape[1]

# LOCATION_TYPE values treated as residential
residential_types = [
    'Apartment (Rooming House, Condo)',
    'Single Home, House (Attach Garage, Cottage, Mobile)',
    'Group Homes (Non-Profit, Halfway House, Social Agency)',
    'Community Group Home', 'Retirement Home', 'Nursing Home',
    'Private Property Structure (Pool, Shed, Detached Garage)'
]
# LOCATION_TYPE values treated as public space
public_types = [
    'Streets, Roads, Highways (Bicycle Path, Private Road)',
    'Open Areas (Lakes, Parks, Rivers)',
    "Other Non Commercial / Corporate Places (Non-Profit, Gov'T, Firehall)",
    'Parking Lots (Apt., Commercial Or Non-Commercial)'
]

# Feature engineering is done on a copy; the loaded frame stays untouched
df = Data_Preparing_df.copy()

# Each rule adds one column holding its label where the rule matches and
# None elsewhere; "Other" matches whatever is in neither list above.
group_rules = (
    ('Location_Engineered_Residential', 'Residential',
     lambda value: value in residential_types),
    ('Location_Engineered_Public', 'Public',
     lambda value: value in public_types),
    ('Location_Engineered_Other', 'Other',
     lambda value: value not in residential_types and value not in public_types),
)
for new_column, label, belongs in group_rules:
    df[new_column] = df['LOCATION_TYPE'].apply(
        lambda value, label=label, belongs=belongs: label if belongs(value) else None
    )

# Column count after the engineered columns were added
final_cols_count = df.shape[1]

# Summary rows: one per engineered column, then a final row with the
# before/after column counts.
engineered_columns = (
    ("Location_Engineered_Residential", "Captures dwelling types in residential areas."),
    ("Location_Engineered_Public", "Groups public and community space locations."),
    ("Location_Engineered_Other", "Identifies location types that do not fit the primary groups."),
)
steps_summary = [
    {
        "Original Feature": "LOCATION_TYPE",
        "Action Taken": f"Engineered new column {column_name}",
        "Rationale": rationale,
    }
    for column_name, rationale in engineered_columns
]
steps_summary.append({
    "Original Feature": "Columns affected in <br><strong>5.4.3 Feature Engineering - Approach_3</strong>",
    "Action Taken": f"Initial Columns: <strong><br>{initial_cols_count}</strong>",
    "Rationale": f"Final Columns: <strong><br>{final_cols_count}</strong>",
})

# HTML summary table for the feature engineering phase
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="3" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
                5.4.3 Feature Engineering Phase - Approach_3
            </th>
        </tr>
        <tr>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Original Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Action Taken</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Rationale</th>
        </tr>
    </thead>
    <tbody>
"""

# Body rows: even rows shaded light grey, odd rows white
row_shades = ("#f2f2f2", "white")
column_keys = ("Original Feature", "Action Taken", "Rationale")
cell_template = "            <td style='border: 1px solid #dddddd; padding: 8px;'>{}</td>\n"
row_chunks = []
for i, step in enumerate(steps_summary):
    bg_color = row_shades[i % 2]
    cells = "".join(cell_template.format(step[key]) for key in column_keys)
    row_chunks.append(
        f"\n        <tr style='border: 1px solid #dddddd; background-color: {bg_color};'>\n"
        + cells
        + "        </tr>\n    "
    )
html_table += "".join(row_chunks)

# Footer note as a full-width row, followed by the closing table markup
note_text = (
    "Feature Engineering completed and saved as <span style='color: green;'>FEngineered_New.csv</span> for further analysis."
)
html_table += f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="3" style='border: 1px solid #dddddd; padding: 8px; background-color: #f8f8f8;'>
                <strong>{note_text}</strong>
            </td>
        </tr>
    </tbody>
</table>
"""

display(HTML(html_table))

# Save the engineered data and offer it for download.
# NOTE(review): section 5.4.4 loads "FEngineered_Data.csv" from the
# repository rather than this file name — confirm which one is intended.
engineered_csv = "FEngineered_New.csv"
df.to_csv(engineered_csv, index=False)
files.download(engineered_csv)


### **5.4.4 Feature Encoding - Approach_3**

In [None]:
import warnings
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from IPython.display import display, HTML
from google.colab import files

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the engineered dataset (output of step 5.4.3) from the project repository.
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/FEngineered_Data.csv"
try:
    df_encoded = pd.read_csv(url)
except Exception as e:
    # Show a readable message in the notebook, then re-raise so this cell stops
    # right here with the original traceback. The previous exit() call asks the
    # notebook kernel to shut down (discarding all session state) instead of
    # simply aborting the cell.
    display(HTML(f"<p style='color: red; font-size: 16px; font-weight: bold;'>Error loading engineered data: {e}</p>"))
    raise
else:
    display(HTML("<p style='color: green; font-size: 16px; font-weight: bold;'>Previously engineered data loaded successfully.</p>"))

# Column count before any encoded columns are appended (reported in the summary table)
initial_column_count = len(df_encoded.columns)
                                                                                  # Build a list to store encoding steps details
# Encoding plan shown in the summary table, one entry per encoded column:
# (original feature, action taken, rationale)
encoding_plan = (
    ("Location_Engineered",
     "One-Hot Encoding applied to create binary features",
     "Separates categories for better clustering"),
    ("HOOD_158",
     "Frequency encoded to 'Hood_158_Encoded'",
     "Represents neighborhood distribution as normalized frequencies"),
    ("DIVISION",
     "Frequency encoded to 'Division_Encoded'",
     "Represents division distribution as normalized frequencies"),
    ("OCC_MONTH",
     "Manual mapping to 'OCC_Month_Encoded'",
     "Converts month names to numerical values"),
    ("OCC_DOW",
     "Label Encoding applied to create 'OCC_DOW_Encoded'",
     "Transforms day names to numeric representations"),
)
steps_summary = [
    {"Original Feature": original, "Action Taken": action, "Rationale": rationale}
    for original, action, rationale in encoding_plan
]
                                                                                  # 1. Frequency Encoding for HOOD_158
# Steps 1-2: frequency encoding - each category is replaced by its share of all rows
hood_counts = df_encoded['HOOD_158'].value_counts(normalize=True)
division_counts = df_encoded['DIVISION'].value_counts(normalize=True)
for source_col, target_col, shares in (
    ('HOOD_158', 'Hood_158_Encoded', hood_counts),
    ('DIVISION', 'Division_Encoded', division_counts),
):
    df_encoded[target_col] = df_encoded[source_col].map(shares)

# Step 3: one-hot encode Location_Engineered and append the indicator columns
encoder = OneHotEncoder(sparse_output=False)
location_encoded = encoder.fit_transform(df_encoded[['Location_Engineered']])
location_columns = encoder.get_feature_names_out(['Location_Engineered'])
location_encoded_df = pd.DataFrame(
    location_encoded,
    columns=location_columns,
    index=df_encoded.index,                    # keep rows aligned with df_encoded for the concat
)
df_encoded = pd.concat([df_encoded, location_encoded_df], axis=1)

# Step 4: month name -> calendar number (names missing from the mapping become NaN)
month_names = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}
df_encoded['OCC_Month_Encoded'] = df_encoded['OCC_MONTH'].map(month_mapping)

# Step 5: label-encode the day-of-week strings
dow_encoder = LabelEncoder()
df_encoded['OCC_DOW_Encoded'] = dow_encoder.fit_transform(df_encoded['OCC_DOW'])

# Column count after all encoded columns have been appended
final_column_count = df_encoded.shape[1]
                                                                                  # Build HTML Table for Feature Encoding Phase
# Summary table for the encoding phase: title row, column headings, one row per
# encoding step, a column-count row and a footer note.
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="3" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">5.4.4 Feature Encoding Phase - Approach_3</th>
        </tr>
        <tr>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Original Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Action Taken</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Rationale</th>
        </tr>
    </thead>
    <tbody>
"""

# One row per encoding step; even positions are shaded grey, odd ones white.
# Fragments are collected and joined once instead of growing the string in a loop.
step_rows = []
for i, step in enumerate(steps_summary):
    bg_color = "#f2f2f2" if i % 2 == 0 else "white"
    step_rows.append(f"""
        <tr style='border: 1px solid #dddddd; background-color: {bg_color};'>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Original Feature"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Action Taken"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Rationale"]}</td>
        </tr>
    """)
html_table += "".join(step_rows)

# Column-count row; its shade continues the alternation after the last step row
bg_color = "#f2f2f2" if len(steps_summary) % 2 == 0 else "white"
html_table += f"""
    <tr style='border: 1px solid #dddddd; background-color: {bg_color};'>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Columns affected in <br> <strong>5.4.4. Feature Encoding - Approach_3</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'><strong>:</strong> Initial columns: <br> <strong>{initial_column_count}</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Final Columns: <br> <strong>{final_column_count}</strong></td>
    </tr>
"""

# Footer note. The file name comes from the same variable that is used for the
# save/download below, so the message cannot drift from the real output file
# (it previously announced 'Encoded_Features.csv' while 'FE_Encoded.csv' was written).
encoded_csv_name = "FE_Encoded.csv"
note_text = (
    f"Feature Encoding completed and saved as <span style='color: green;'>{encoded_csv_name}</span> "
    "for further analysis."
)
html_table += f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="3" style='border: 1px solid #dddddd; padding: 8px; background-color: #f8f8f8;'><strong>{note_text}</strong></td>
        </tr>
    </tbody>
</table>
"""
display(HTML(html_table))

# Save the encoded dataset to CSV and offer it as a browser download
df_encoded.to_csv(encoded_csv_name, index=False)
files.download(encoded_csv_name)


### **5.4 Data Reduction and Projection - f) Feature Engineering, g) Feature Encoding and h) Descriptive Statistics - Approach_3**


In [None]:
import warnings                                                                   # Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from IPython.display import display, HTML
from google.colab import files

print("\n\n")

# Silence deprecation / future warnings for the rest of this cell
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Encoded dataset (output of step 5.4.4) read straight from the project repository
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/FE_Encoded.csv"
df_summary = pd.read_csv(url)

def count_leading_trailing_spaces(column):
    """Return ``(leading, trailing)``: how many values start / end with a space.

    Every value is converted with ``astype(str)`` first, so non-string entries
    are tested on their string form.
    """
    text = column.astype(str).str
    leading_total = text.startswith(' ').sum()
    trailing_total = text.endswith(' ').sum()
    return leading_total, trailing_total

def prepare_summary_table(df):
    """Build a per-column data-quality summary of *df*.

    Args:
        df: DataFrame to profile.

    Returns:
        tuple: ``(summary_table, null_counts, null_percentages)`` where
        ``summary_table`` has one row per column of *df* (name, dtype, total,
        unique, null / NaN counts, null percentage and leading / trailing
        space counts), and the other two are the per-column Series the
        table's null columns were built from.
    """
    # isna() is an alias of isnull(), so one scan serves the "Null Values" and
    # "NaN Values" columns as well as the row total below.
    null_counts = df.isnull().sum()
    total_values = df.count() + null_counts            # non-null + null == number of rows
    null_percentages = (null_counts / total_values) * 100

    # Collect the (leading, trailing) pairs first and split them afterwards;
    # unpacking zip(*pairs) directly fails when df has no columns.
    space_counts = [count_leading_trailing_spaces(df[col]) for col in df.columns]
    leading_spaces = [leading for leading, _ in space_counts]
    trailing_spaces = [trailing for _, trailing in space_counts]

    summary_table = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": df.nunique(),
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),
        "NaN Values": null_counts,
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces
    })
    return summary_table, null_counts, null_percentages

def generate_summary_html(summary_table):
    """Render *summary_table* as a styled HTML table and return the markup.

    One ``<tr>`` is emitted per row of *summary_table*. The null, leading-space
    and trailing-space counts are coloured red when greater than zero and
    green otherwise.
    """
    def _flag_color(count):
        # red marks a non-zero count, green a zero count
        return "red" if count > 0 else "green"

    # Stylesheet plus the heading row; data rows and the closing tag are appended below
    fragments = ["""
    <style>
        table { border-collapse: collapse; width: 100%; font-family: Arial, sans-serif; font-size: 16px; }
        table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        table th { background-color: #4CAF50; color: white; font-size: 16px; }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>NaN Values</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """]
    for _, record in summary_table.iterrows():
        null_color = _flag_color(record['Null Values'])
        leading_color = _flag_color(record['Leading Spaces'])
        trailing_color = _flag_color(record['Trailing Spaces'])
        fragments.append(f"""
        <tr>
            <td>{record['Column']}</td>
            <td>{record['Data Type']}</td>
            <td>{record['Total Values']}</td>
            <td>{record['Unique Values']}</td>
            <td style='color:{null_color};'>{record['Null Values']}</td>
            <td>{record['Null %']}</td>
            <td>{record['NaN Values']}</td>
            <td style='color:{leading_color};'>{record['Leading Spaces']}</td>
            <td style='color:{trailing_color};'>{record['Trailing Spaces']}</td>
        </tr>
        """)
    fragments.append("</table>")
    return "".join(fragments)

def plot_missing_percentage(df, dataset_name):
    """Plot the percentage of missing values per column as a horizontal bar chart.

    Only columns with at least one missing value are plotted. Returns the path
    of the saved PNG, or an empty string when *df* has no missing values (in
    which case only a notice is displayed).
    """
    pct_missing = df.isnull().mean() * 100
    pct_missing = pct_missing[pct_missing > 0]

    # Nothing to plot: tell the reader and bail out early
    if pct_missing.empty:
        display(HTML(f"<p style='color: black; font-size: 16px; font-weight: bold;'>No missing data in {dataset_name}. Skipping missing percentage plot.</p>"))
        return ""

    plt.figure(figsize=(10, 6))
    ordered = pct_missing.sort_values()
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Fixed output location; the figure is closed after saving instead of being shown inline
    missing_plot_path = '/content/Encoded_Data_Missing_PercentPlot.png'
    plt.savefig(missing_plot_path)
    plt.close()

    saved_notice = f"<p style='color: black; font-size: 16px; font-weight: bold;'>Missing data percentage plot saved to <span style='color: darkgreen; font-weight: bold;'>{missing_plot_path}</span>.</p>"
    display(HTML(saved_notice))
    return missing_plot_path

def display_summary_for_file(df, title):
    """Write the HTML summary report for *df* and create its missing-data plot.

    Returns ``(summary_html_path, missing_plot_path)``: the report file that
    was written and whatever ``plot_missing_percentage`` returned.
    """
    # Only the table itself is needed here; the two null Series are discarded
    summary_table, _, _ = prepare_summary_table(df)
    summary_html = generate_summary_html(summary_table)
    missing_plot_path = plot_missing_percentage(df, title)

    # Wrap the table in a minimal standalone HTML document
    complete_html = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2 style="color: black; font-size: 16px; font-weight: bold;">Summary Table for {title}</h2>
        {summary_html}
    </body>
    </html>
    """

    summary_html_path = '5.4.3 Summary_Encoded_Data.html'
    with open(summary_html_path, 'w', encoding='utf-8') as report:
        report.write(complete_html)

    saved_notice = f"<p style='color: black; font-size: 16px; font-weight: bold;'>HTML summary report for {title} saved to <span style='color: darkgreen; font-weight: bold;'>{summary_html_path}</span>.</p>"
    display(HTML(saved_notice))
    return summary_html_path, missing_plot_path

# Generate the summary report and missing-data plot for the encoded dataset
html_file_path, missing_plot_path = display_summary_for_file(df_summary, "Analysis on Encoded Data")

# Show the saved report inline. The report is written as UTF-8, so it is read
# back with the same encoding, and the context manager closes the handle
# (the bare open(...).read() left it open and used the locale default encoding).
with open(html_file_path, encoding="utf-8") as report_file:
    display(HTML(report_file.read()))

files.download(html_file_path)                                                    # Download Summary table as HTML

display(HTML("""
<p style="color: black; font-size: 16px; font-weight: bold;">
    Files: <span style="color: darkblue; font-weight: bold;">HTML summary file</span>, <span style="color: darkblue; font-weight: bold;">FEngineered_Data.csv</span> and <span style="color: darkblue; font-weight: bold;">FE_Encoded.csv</span> have been <span style="color: darkgreen; font-weight: bold;">downloaded</span>.
</p>
"""))


### **5.5 KMeans Feature Importance & 5.6 K-Elbow - Approach_3**

In [None]:
import warnings                                                                   # Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from IPython.display import display, HTML
from google.colab import files

print("\n\n")

# Silence deprecation / future warnings for the rest of this cell
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Encoded dataset (output of step 5.4.4) read straight from the project repository
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/FE_Encoded.csv"
df = pd.read_csv(url)

# Feature groups fed to KMeans
continuous_features = ['OCC_YEAR', 'OCC_DOY', 'OCC_HOUR', 'LONG_WGS84', 'LAT_WGS84']  # Define Continuous Features
encoded_features = ['OCC_Month_Encoded', 'OCC_DOW_Encoded', 'Hood_158_Encoded',   # Define Encoded Features
                    'Division_Encoded', 'Location_Engineered_Other',
                    'Location_Engineered_Public', 'Location_Engineered_Residential']
features = continuous_features + encoded_features

# Preprocessor + KMeans pipeline: every selected feature is standardized, then clustered
scaler = StandardScaler()                                                         # Standardization
preprocessor = ColumnTransformer([('num', scaler, features)])
pipeline = Pipeline([                                                             # KMeans Clustering Pipeline (n_clusters=4 based on K-Elbow)
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=4, random_state=42, n_init=10))
])
pipeline.fit(df)                                                                  # Fit the Pipeline

# Standardized feature matrix the model was fitted on. Only the preprocessor step
# is applied here: pipeline.transform(df) would also run KMeans.transform(), which
# returns each sample's distances to the 4 cluster centres, so the validity scores
# below and the elbow analysis would be computed in cluster-distance space instead
# of feature space.
df_processed = pipeline.named_steps['preprocessor'].transform(df)

# Fitted model, per-row cluster labels and centroid-based feature importance
kmeans = pipeline.named_steps['kmeans']
df['Cluster'] = kmeans.labels_
centroids = kmeans.cluster_centers_
centroid_variance = np.var(centroids, axis=0)                                     # spread of the centroids along each (scaled) feature
importance_df = pd.DataFrame({'Feature': features, 'Centroid Variance': centroid_variance})\
                  .sort_values(by='Centroid Variance', ascending=False)

# Cluster-validity metrics, all evaluated on the standardized features
sil_score = silhouette_score(df_processed, kmeans.labels_)
db_index = davies_bouldin_score(df_processed, kmeans.labels_)
ch_index = calinski_harabasz_score(df_processed, kmeans.labels_)
inertia_value = kmeans.inertia_
                                                                                  # Plot: K-Elbow Method
# Fit one KMeans model per candidate K (2..9) and record its inertia
K_range = range(2, 10)
inertia_values = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_processed).inertia_
    for k in K_range
]

# Elbow curve: inertia against K
elbow_png = "5.5 K-Elbow.png"
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertia_values, marker='o', linestyle='-')
plt.title("Elbow Method for Optimal K - Approach_3")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Within-Cluster Sum of Squares)")
plt.grid(True)
plt.savefig(elbow_png, dpi=300, bbox_inches='tight')                              # Save Elbow plot
plt.show()
files.download(elbow_png)                                                         # Download the Elbow plot
display(HTML("<br><br>"))

# Text output: one "<br>"-separated line per K with its inertia
inertia_lines = [
    f"<br>For K = <span style='color: blue;'>{k}</span>: <span style='color: blue;'>{inertia:.2f}</span>"
    for k, inertia in zip(K_range, inertia_values)
]
inertia_html = """
<p style="color: darkblue; font-size: 18px; font-weight: bold;">
    Inertia values for different K:
""" + "".join(inertia_lines) + "</p>"
display(HTML(inertia_html))
display(HTML("<br><br>"))
                                                                                  #  Plot: K-Means Feature Importance
def _finish_figure(title, x_label, y_label, png_name):
    """Label the current figure, then save, show and download it, followed by a spacer."""
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.savefig(png_name, dpi=300, bbox_inches='tight')
    plt.show()
    files.download(png_name)
    display(HTML("<br><br>"))


# Plot: centroid variance per feature (the feature-importance proxy)
plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df, x='Centroid Variance', y='Feature', palette='viridis')
_finish_figure(
    "Feature Importance in KMeans Clustering - Approach_3",
    "Centroid Variance",
    "Feature",
    "5.6 K-Means Feature Importance.png",
)

# Plot: number of rows assigned to each cluster
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='Cluster', hue='Cluster', palette="Set2", dodge=False)
_finish_figure(
    "Number of Points in Each Cluster - Approach_3",
    "Cluster Label",
    "Number of Data Points",
    "5.6 K-Means Cluster Distribution.png",
)

# Text output: the same cluster sizes, listed in cluster-label order
cluster_counts = df['Cluster'].value_counts().sort_index()
cluster_lines = [
    f"<br><span style='color: blue;'>Cluster {cluster}</span>: <span style='color: green;'>{count}</span> data points"
    for cluster, count in cluster_counts.items()
]
cluster_html = """
<p style="color: darkblue; font-size: 18px; font-weight: bold;">
    Number of data points in each cluster:
""" + "".join(cluster_lines) + "</p>"
display(HTML(cluster_html))
display(HTML("<br><br>"))
                                                                                  # Build HTML Table for K-Means Clustering Analysis
# Table head: title, a one-cell summary of the clustering metrics, and the column headings
html_table = f"""
<table style='border-collapse: collapse; font-size: 18px; width: 100%; max-width: 900px; table-layout: fixed;'>
    <thead>
        <tr style='background-color: #2f4f4f; color: white;'>
            <th colspan="3" style="text-align: center; font-size: 24px; padding: 8px;">
                KMEANS Clustering Analysis - Approach_3
            </th>
        </tr>
        <tr>
            <td colspan="3" style="border: 1px solid #dddddd; padding: 8px; text-align: center; white-space: normal; word-wrap: break-word;">
                Based on the Optimal K value from the K-Elbow method, KMeans was performed with 4 clusters (n=4), which produced the following clustering statistics:
                <strong>Silhouette Score:</strong> {sil_score:.2f},
                <strong>Davies-Bouldin Index:</strong> {db_index:.2f},
                <strong>Calinski-Harabasz Index:</strong> {ch_index:.2f}, and
                <strong>Inertia:</strong> {inertia_value:.2f}.
                Centroid Variance represents the variance of the cluster centroids for each feature, reflecting its contribution to the clustering structure.
            </td>
        </tr>
        <tr style='background-color: #4CAF50; color: white;'>
            <th style='border: 1px solid #dddddd; padding: 8px;'>S/N</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Centroid Variance</th>
        </tr>
    </thead>
    <tbody>
"""

# One numbered row per feature, in importance_df's (already sorted) order
feature_rows = [
    f"""
        <tr style='border: 1px solid #dddddd;'>
            <td style='border: 1px solid #dddddd; padding: 8px; text-align: center;'>{rank}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{feature_name}</td>
            <td style='border: 1px solid #dddddd; padding: 8px; text-align: right;'>{variance:.4f}</td>
        </tr>
    """
    for rank, (feature_name, variance) in enumerate(
        zip(importance_df['Feature'], importance_df['Centroid Variance']), start=1
    )
]
html_table += "".join(feature_rows) + """
    </tbody>
</table>
"""

display(HTML(html_table))                                                         # Display the HTML table
display(HTML("<br><br>"))

# Write the table to disk and download it
html_table_path = "5.6 K-Means Feature Importance.html"
with open(html_table_path, "w", encoding="utf-8") as table_file:
    table_file.write(html_table)
files.download(html_table_path)

# Save and download the dataset, which now carries the 'Cluster' label column
clustered_csv = "FE_Encoded_with_Clusters.csv"
df.to_csv(clustered_csv, index=False)
files.download(clustered_csv)
                                                                                  # Rationale Explanation for Choosing n=4
def _percent_drop(before, after):
    """Relative decrease from *before* to *after*, in percent."""
    return ((before - after) / before) * 100


# inertia_values[0] belongs to K=2, so indices 1, 2, 3 are K=3, 4, 5
enough_k_values = len(inertia_values) >= 5
inertia_reduction_k3_k4 = _percent_drop(inertia_values[1], inertia_values[2]) if enough_k_values else 0
inertia_reduction_k4_k5 = _percent_drop(inertia_values[2], inertia_values[3]) if enough_k_values else 0

# Narrative justification for n=4, filled in with the measured inertias and cluster sizes
explanation = f"""
<p style="color: black; font-size: 18px; font-weight: bold;">
    The K-Elbow method shows that the inertia value decreases notably as K increases from <span style="color: blue;">2</span> to <span style="color: blue;">4</span>. For example, when K increases from <span style="color: blue;">2</span> to <span style="color: blue;">3</span>, inertia decreases from <span style="color: blue;">{inertia_values[0]:.2f}</span> to <span style="color: blue;">{inertia_values[1]:.2f}</span>, and from K = <span style="color: blue;">3</span> to K = <span style="color: blue;">4</span> it decreases to <span style="color: blue;">{inertia_values[2]:.2f}</span> (a reduction of <span style="color: blue;">{inertia_reduction_k3_k4:.1f}%</span>). Beyond K = <span style="color: blue;">4</span>, the reduction in inertia becomes less dramatic (e.g. a further reduction of <span style="color: blue;">{inertia_reduction_k4_k5:.1f}%</span> from K = <span style="color: blue;">4</span> to K = <span style="color: blue;">5</span>).
</p>
<p style="color: black; font-size: 18px; font-weight: bold;">
    The cluster distribution is also reasonable:
    <br><span style="color: blue;">Cluster 0</span>: <span style="color: green;">{cluster_counts.get(0, 0)}</span> data points
    <br><span style="color: blue;">Cluster 1</span>: <span style="color: green;">{cluster_counts.get(1, 0)}</span> data points
    <br><span style="color: blue;">Cluster 2</span>: <span style="color: green;">{cluster_counts.get(2, 0)}</span> data points
    <br><span style="color: blue;">Cluster 3</span>: <span style="color: green;">{cluster_counts.get(3, 0)}</span> data points
</p>
<p style="color: black; font-size: 18px; font-weight: bold;">
    Based on the significant drop in inertia up to K = <span style="color: blue;">4</span> and a balanced cluster distribution, using <span style="color: blue;">4</span> clusters (n=4) for KMeans clustering is a good choice.
</p>
"""
display(HTML(explanation))

### **5.7 Clustering Model Training - Approach_3**

In [None]:
import warnings                                                                   # Import necessary libraries
import itertools
import numpy as np
import pandas as pd
import time
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML

print("\n\n")

# Silence deprecation / future warnings for the rest of this cell
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Candidate clustering features: raw time/location columns followed by the encoded ones
features = [
    'OCC_YEAR', 'OCC_DOY', 'OCC_HOUR', 'LONG_WGS84', 'LAT_WGS84',
    'OCC_Month_Encoded', 'OCC_DOW_Encoded', 'Hood_158_Encoded', 'Division_Encoded',
    'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential'
]

# Every feature subset of size 4, 5, 6 and 7, smallest sizes first
all_combinations = []
for subset_size in (4, 5, 6, 7):
    all_combinations.extend(
        list(subset) for subset in itertools.combinations(features, subset_size)
    )

# Persist the generated feature sets (one combination per CSV row)
combo_csv = 'Feature_Combo_Current.csv'
formatted_combinations_df = pd.DataFrame(all_combinations)
formatted_combinations_df.to_csv(combo_csv, index=False)
display(HTML(f"<p style='color: green; font-size:16px;'><b>Feature sets saved to '{combo_csv}'</b></p>"))

# Encoded dataset read from the project repository
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/FE_Encoded.csv"
data = pd.read_csv(url)

# Work on a reproducible 10% sample of the rows
sample_data = data.sample(frac=0.1, random_state=42)

# Bookkeeping: metrics per feature set, counts per number of features, models evaluated
results, set_counters, total_models = {}, {}, 0

def color_silhouette(value):
    """Wrap a Silhouette Score in a colored, bold HTML span (higher is better).

    Green from 0.6 upwards, yellow from 0.3 upwards, red otherwise; the value
    is shown with two decimals.
    """
    color = "#ff6347"                                                             # red unless a threshold below is met
    for floor, shade in ((0.6, "#32cd32"), (0.3, "#ffcc00")):                     # green, then yellow
        if value >= floor:
            color = shade
            break
    return f"<span style='color: {color}; font-weight: bold;'>{value:.2f}</span>"

def color_dbi(value):
    """Wrap a Davies-Bouldin Index in a colored, bold HTML span (lower is better).

    Green up to 1.0, yellow up to 1.5, red otherwise; the value is shown with
    two decimals.
    """
    color = "#ff6347"                                                             # red unless a threshold below is met
    for ceiling, shade in ((1.0, "#32cd32"), (1.5, "#ffcc00")):                   # green, then yellow
        if value <= ceiling:
            color = shade
            break
    return f"<span style='color: {color}; font-weight: bold;'>{value:.2f}</span>"

def color_accuracy(value):                                                        # HTML helper: colour-coded accuracy percentage
    """Wrap an accuracy percentage in a bold, colour-coded HTML span.

    Values strictly below 50.1 are red; everything else is green. The value
    is printed with two decimal places followed by a percent sign.
    """
    shade = "#ff6347" if value < 50.1 else "#32cd32"                              # red below the cut-off, green otherwise
    return f"<span style='color: {shade}; font-weight: bold;'>{value:.2f}%</span>"

# Cluster the sampled data once per feature combination with KMeans and DBSCAN,
# record the quality metrics in `results`, and render one HTML summary per set.
for feature_set in all_combinations:                                              # Iterate through feature sets
    valid_features = [f for f in feature_set if f in sample_data.columns]         # Keep only the features that exist in the dataset
    if len(valid_features) != len(feature_set):
        print(f"Warning: Some features in {feature_set} are missing. Using only: {valid_features}")
    if not valid_features:                                                        # Nothing to cluster on; scaling zero columns would raise
        print(f"Warning: Skipping {feature_set} because none of its features exist in the dataset.")
        continue
    total_models += 1                                                             # Count only the feature sets that are actually clustered

    data_for_clustering = sample_data[valid_features]
    numerical_cols = data_for_clustering.select_dtypes(include=['int64', 'float64']).columns.tolist()     # Standardize numerical columns
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(
        scaler.fit_transform(data_for_clustering[numerical_cols]),
        columns=numerical_cols,
        index=data_for_clustering.index,                                          # Keep the sampled row labels so the concat below aligns row-for-row
    )
    categorical_cols = [col for col in valid_features if col not in numerical_cols]       # Columns that were not scaled (any dtype other than int64/float64)
    if categorical_cols:
        # Without the shared index above, concat would align on mismatched labels and introduce NaN rows
        data_scaled = pd.concat([data_scaled, data_for_clustering[categorical_cols]], axis=1)

    kmeans = KMeans(n_clusters=4, random_state=42)                                # Perform KMeans clustering
    kmeans_labels = kmeans.fit_predict(data_scaled)
    dbscan = DBSCAN(eps=0.5, min_samples=5)                                       # DBSCAN Clustering
    dbscan_labels = dbscan.fit_predict(data_scaled)

    silhouette_kmeans = silhouette_score(data_scaled, kmeans_labels)              # Calculate KMeans metrics
    dbi_kmeans = davies_bouldin_score(data_scaled, kmeans_labels)
    ch_kmeans = calinski_harabasz_score(data_scaled, kmeans_labels)
    kmeans_accuracy = max(0, silhouette_kmeans) * 100                             # "Accuracy" here is the silhouette score clipped at 0, as a percentage

    dbscan_has_clusters = len(set(dbscan_labels)) > 1                             # The metrics need at least two distinct labels
    silhouette_dbscan = silhouette_score(data_scaled, dbscan_labels) if dbscan_has_clusters else -1      # -1 is the "not available" sentinel
    dbi_dbscan = davies_bouldin_score(data_scaled, dbscan_labels) if dbscan_has_clusters else -1
    dbscan_accuracy = max(0, silhouette_dbscan) * 100

    num_features = len(valid_features)                                            # Update set_counters
    set_counters[num_features] = set_counters.get(num_features, 0) + 1
    set_number = set_counters[num_features]

    formatted_features = [f"'{feature}'" for feature in valid_features]           # Format features string
    feature_names_string = f"[{', '.join(formatted_features)}]"

    results[tuple(valid_features)] = {                                            # Store results in dictionary
        'Feature Set': f"{num_features}_Set_{set_number}",
        'Number of Features': num_features,
        'KMeans Silhouette Score': silhouette_kmeans,
        'KMeans Davies-Bouldin Index': dbi_kmeans,
        'KMeans Calinski-Harabasz Score': ch_kmeans,
        'KMeans Prediction Accuracy': kmeans_accuracy,
        'DBSCAN Silhouette Score': silhouette_dbscan,
        'DBSCAN Davies-Bouldin Index': dbi_dbscan,
        'DBSCAN Prediction Accuracy': dbscan_accuracy,
        'Feature_Names_String': feature_names_string
    }

    # Create HTML table output for current result
    results_html = f"""
    <div style="margin: 10px auto; width: 80%; border: 2px solid #ddd; padding: 10px; background-color: #f9f9f9;">
      <table style="width: 100%; border-collapse: collapse; text-align: center;">
        <tr>
          <th colspan="3" style="font-size: 14px; padding: 10px; text-align: left;">Result of Feature Set {num_features}_Set_{set_number}</th>
        </tr>
        <tr>
          <th colspan="3" style="font-size: 13px; padding: 5px; text-align: left;">Features : {feature_names_string}</th>
        </tr>
        <tr style="background-color: #e0e0e0; font-weight: bold;">
          <td style="padding: 8px; border: 1px solid #ddd;">Statistic</td>
          <td style="padding: 8px; border: 1px solid #ddd;">KMeans</td>
          <td style="padding: 8px; border: 1px solid #ddd;">DBSCAN</td>
        </tr>
        <tr>
          <td style="padding: 8px; border: 1px solid #ddd;">Silhouette Score</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{color_silhouette(silhouette_kmeans)}</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{"N/A" if silhouette_dbscan == -1 else color_silhouette(silhouette_dbscan)}</td>
        </tr>
        <tr>
          <td style="padding: 8px; border: 1px solid #ddd;">Davies-Bouldin Index</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{color_dbi(dbi_kmeans)}</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{"N/A" if dbi_dbscan == -1 else color_dbi(dbi_dbscan)}</td>
        </tr>
        <tr>
          <td style="padding: 8px; border: 1px solid #ddd;">Calinski-Harabasz Score</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{round(ch_kmeans,2)}</td>
          <td style="padding: 8px; border: 1px solid #ddd;">-</td>
        </tr>
        <tr style="font-weight: bold;">
          <td style="padding: 8px; border: 1px solid #ddd;">Prediction Accuracy</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{color_accuracy(kmeans_accuracy)}</td>
          <td style="padding: 8px; border: 1px solid #ddd;">{color_accuracy(dbscan_accuracy)}</td>
        </tr>
      </table>
    </div>
    """
    display(HTML(results_html))
    time.sleep(0.1)                                                               # Brief pause between per-set outputs

# Persist the per-feature-set metrics and hand the CSV to the browser
results_csv_name = 'Feature_Combo_Current_Results.csv'
results_df = pd.DataFrame(results.values())                                         # One row per clustered feature set
results_df.to_csv(results_csv_name, index=False)
files.download(results_csv_name)

# Confirmation message for the saved file
saved_notice = """
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Results have been saved as <span style="color: green;">Feature_Combo_Current_Results.csv</span>.
    </p>
"""
display(HTML(saved_notice))

# Headline with the number of models trained
models_trained_heading = f"<h2 style='color: navy; font-size:18px;'><b>Total K-Means & DBSCAN Clustering Models Trained: {total_models}</b></h2>"
display(HTML(models_trained_heading))
print("\n\n")

# Final summary banner
summary_banner = f"""
<div style="font-family: Arial, sans-serif; font-size: 18px; padding: 15px; border-radius: 10px;
             background: #282c34; color: #61dafb; text-align: center; width: 60%; margin: 20px auto;
             box-shadow: 2px 2px 10px rgba(0,0,0,0.2);">
    <strong>Total K-Means & DBSCAN Clustering models trained:</strong>
    <span style="color: #ffcc00; font-size: 22px;">{total_models}</span>
</div>
"""
display(HTML(summary_banner))

### **Summary Table of Clustering Models - Approach_3**

In [None]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import asyncio
!pip install dataframe_image -qqq
import dataframe_image as dfi
from IPython.display import display, HTML
from google.colab import files

print("\n\n")
warnings.filterwarnings("ignore", category=DeprecationWarning)                    # Ignore Deprecation Warnings
warnings.filterwarnings("ignore", category=FutureWarning)                         # Ignore future warnings

# Read the per-feature-set clustering metrics
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
data = pd.read_csv(url)

# Define the metrics to evaluate
metrics = [
    'KMeans Silhouette Score',
    'KMeans Calinski-Harabasz Score',
    'DBSCAN Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'DBSCAN Davies-Bouldin Index'
]

# Initialize dictionaries for results and summary counts
top_results = {}                                                                  # metric -> DataFrame with its 100 best rows
feature_set_summary = {}                                                          # feature set -> {'Count': hits, 'Found In': [metrics]}

# Collect the 100 best rows for each metric and tally how often each feature set appears
for metric in metrics:
    if metric not in data.columns:
        continue
    candidates = data
    if metric.startswith('DBSCAN'):
        # The clustering run stores -1 when DBSCAN yields fewer than two labels.
        # That sentinel is "not available", not a score; left in, nsmallest would
        # rank it as the best possible Davies-Bouldin value.
        candidates = data[data[metric] != -1]
    # For Davies-Bouldin Index, lower values are better; otherwise higher is better
    if 'Davies-Bouldin' in metric:
        top_rows = candidates.nsmallest(100, metric)
    else:
        top_rows = candidates.nlargest(100, metric)
    top_results[metric] = top_rows
    for feature_set in top_rows['Feature Set']:
        entry = feature_set_summary.setdefault(feature_set, {'Count': 0, 'Found In': []})
        entry['Count'] += 1
        entry['Found In'].append(metric)

# Create a summary DataFrame for the most repeated feature sets.
# Explicit columns keep the frame well-formed even when nothing was tallied.
summary_df = pd.DataFrame(
    [
        {'Feature Set': name, 'Count': info['Count'], 'Found In': info['Found In']}
        for name, info in feature_set_summary.items()
    ],
    columns=['Feature Set', 'Count', 'Found In'],
)
summary_df.sort_values(by='Count', ascending=False, inplace=True)

# Prepare the final metrics DataFrame with an extra 'Features' column.
# Rows are collected in a list and the frame is built once: concatenating onto an
# empty frame inside the loop copies the whole frame on every iteration.
first_match = data.drop_duplicates(subset='Feature Set').set_index('Feature Set')  # First row per feature set, for direct lookup
available_metrics = [metric for metric in metrics if metric in data.columns]       # Metrics missing from the CSV are left as NaN
final_rows = []
for feature_set_name, count in zip(summary_df['Feature Set'], summary_df['Count']):
    if feature_set_name not in first_match.index:
        continue
    metrics_row = first_match.loc[feature_set_name]
    final_rows.append({
        'Feature Set': feature_set_name,
        'Features': metrics_row['Feature_Names_String'],                          # Get value from Feature_Names_String column
        'Count': count,
        **{metric: metrics_row[metric] for metric in available_metrics}
    })
final_metrics_df = pd.DataFrame(final_rows, columns=['Feature Set', 'Features', 'Count', *metrics])

# Stable sort keeps the incoming order among feature sets with equal counts
final_metrics_df.sort_values(by='Count', ascending=False, kind='stable', inplace=True)

# Define a function to highlight the top 5 unique values for a given metric
def highlight_best_top5(s, metric):
    """Return one CSS string per cell, highlighting the five best unique values.

    Args:
        s: pandas Series holding the values of one metric column.
        metric: Name of the metric. Names containing 'Davies-Bouldin' are
            ranked ascending (lower is better); all others descending.

    Returns:
        A list as long as ``s`` with 'background-color: lightgreen' for cells
        whose value is among the five best unique values and '' elsewhere.

    Missing values are never highlighted. Neither is the -1 "not available"
    sentinel that the clustering run stores for DBSCAN metrics, nor any
    negative Davies-Bouldin value (the index cannot be negative), so a failed
    DBSCAN run is not shown as the best result.
    """
    candidates = s.dropna()
    if 'Davies-Bouldin' in metric:
        candidates = candidates[candidates >= 0]                 # negative values can only be the sentinel
        ranked = candidates.sort_values(ascending=True)
    else:
        if metric.startswith('DBSCAN'):
            candidates = candidates[candidates != -1]            # drop the "no clustering" sentinel
        ranked = candidates.sort_values(ascending=False)
    top5_values = ranked.unique()[:5]
    return ['background-color: lightgreen' if is_top else '' for is_top in s.isin(top5_values)]

# Style the full metrics summary table Approach_3:
# highlight the best values of each metric column, then apply number formats and table CSS
styled_table = final_metrics_df.style
for highlighted_metric in ('KMeans Silhouette Score',
                           'KMeans Calinski-Harabasz Score',
                           'KMeans Davies-Bouldin Index',
                           'DBSCAN Silhouette Score',
                           'DBSCAN Davies-Bouldin Index'):
    styled_table = styled_table.apply(highlight_best_top5, metric=highlighted_metric, subset=[highlighted_metric])

header_cell_css = [('background-color', '#4CAF50'),
                   ('color', 'white'),
                   ('font-weight', 'bold'),
                   ('text-align', 'center')]
body_cell_css = [('padding', '10px'),
                 ('text-align', 'center')]
styled_table = (
    styled_table
    .format({metric: '{:.2f}' for metric in metrics})                             # Two decimals for every metric column
    .set_table_styles([
        {'selector': 'th', 'props': header_cell_css},
        {'selector': 'td', 'props': body_cell_css},
        {'selector': '.row:hover', 'props': [('background-color', '#f1f1f1')]}
    ])
    .set_properties(**{'border': '1px solid black'})
    .set_caption("<h3 style='color: navy; text-align: center;'>📊 Metrics Summary Table Approach_3</h3>")
)

# Write the summary table to Excel and PNG, then download both.
# Both exports receive `styled_table.data`, i.e. the DataFrame behind the Styler.
summary_xlsx_name = '5.8 metrics_summary_table_Approach_3.xlsx'
summary_png_name = '5.8 metrics_summary_table_Approach_3.png'
summary_frame = styled_table.data
summary_frame.to_excel(summary_xlsx_name, index=False)
dfi.export(summary_frame, summary_png_name, table_conversion='matplotlib', max_rows=-1)
for exported_name in (summary_png_name, summary_xlsx_name):
    files.download(exported_name)

# Build a second table with the first five rows of the count-sorted summary
top5_df = final_metrics_df.head(5).copy()
styled_top5 = top5_df.style
for highlighted_metric in ('KMeans Silhouette Score',
                           'KMeans Calinski-Harabasz Score',
                           'KMeans Davies-Bouldin Index',
                           'DBSCAN Silhouette Score',
                           'DBSCAN Davies-Bouldin Index'):
    styled_top5 = styled_top5.apply(highlight_best_top5, metric=highlighted_metric, subset=[highlighted_metric])

top5_header_css = [('background-color', '#4CAF50'),
                   ('color', 'white'),
                   ('font-weight', 'bold'),
                   ('text-align', 'center'),
                   ('border', '1px solid black')]
top5_cell_css = [('padding', '10px'),
                 ('text-align', 'center'),
                 ('border', '1px solid black')]
styled_top5 = (
    styled_top5
    .format({metric: '{:.2f}' for metric in metrics})                             # Two decimals for every metric column
    .set_table_styles([
        {'selector': 'th', 'props': top5_header_css},
        {'selector': 'td', 'props': top5_cell_css},
        {'selector': 'table', 'props': [('border-collapse', 'collapse')]}
    ])
    .set_properties(**{'border': '1px solid black'})
    .set_caption("<h3 style='color: navy; text-align: center;'>Top 5 Models in Approach_3</h3>")
)

# Show both styled tables in the notebook output
display(styled_top5)
display(styled_table)

# Export the top models table as a PNG file and download it
dfi.export(styled_top5.data, 'Top 5 models in Approach_3.png', table_conversion='matplotlib', max_rows=-1)
files.download('Top 5 models in Approach_3.png')

# Confirmation message for the files written earlier in this cell
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Metrics summary table Approach_3 has been saved to <span style="color: green;">'5.8 metrics_summary_table_Approach_3.xlsx'</span>
         and <span style="color: green;">'5.8 metrics_summary_table_Approach_3.png'</span>.
    </p>
"""))


### **5.9 Best Model Training - Approach 3**

In [None]:
import warnings                                                                   # Import necessary libraries
import numpy as np
import pandas as pd
import ast                                                                        # For safely evaluating strings
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from google.colab import files

print("\n\n")
for ignored_category in (DeprecationWarning, FutureWarning):                      # Silence deprecation and future warnings
    warnings.filterwarnings("ignore", category=ignored_category)

url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/FE_Encoded.csv"  # Dataset to cluster
url1 = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"  # Feature combinations and their metrics
original_data, feature_combos = pd.read_csv(url), pd.read_csv(url1)

# Debugging: report the total number of missing cells in each input file
html_output = """
<p style="color: black; font-size: 16px; font-weight: bold;">
    Checking input files...<br>
    Missing values in original_data: <span style="color: green;">{orig_missing}</span><br>
    Missing values in feature_combos: <span style="color: green;">{feat_missing}</span>
</p>
"""
missing_counts = {
    'orig_missing': original_data.isnull().sum().sum(),
    'feat_missing': feature_combos.isnull().sum().sum(),
}
display(HTML(html_output.format(**missing_counts)))

original_data['_id'] = original_data.index                                        # Remember each row's index label so results can be mapped back
sample_data = original_data.copy()                                                # Cluster on the full dataset
# For a faster trial run, swap the copy above for: original_data.sample(frac=0.1, random_state=42)

# Resolve each chosen set name to its list of feature names
set_names = ['4_Set_165','4_Set_369', '4_Set_490', '4_Set_494', '4_Set_495']      # Set names to look up in feature_combos
feature_sets = []                                                                 # One feature list per entry of set_names, in the same order
for set_name in set_names:
    name_matches = feature_combos['Feature Set'] == set_name
    matched_features = feature_combos.loc[name_matches, 'Feature_Names_String']
    if matched_features.empty:
        feature_sets.append([])                                                   # Unknown set name: keep the position with an empty list
        continue
    features_list = ast.literal_eval(matched_features.values[0])                  # Parse the stored "['a', 'b', ...]" string into a list
    feature_sets.append(features_list)

                                                                                  # Debugging: Check feature set validity
debug_feature = "<p style=\"color: black; font-size: 16px; font-weight: bold;\">Checking feature set validity.<br>"
for i, features in enumerate(feature_sets, start=1):
    valid_features = [f for f in features if f in sample_data.columns]
    debug_feature += f"<span style=\"color: darkblue;\">{set_names[i-1]}</span>: <span style=\"color: green;\">{len(valid_features)}</span> valid features out of <span style=\"color: green;\">{len(features)}</span><br>"
    if len(valid_features) == 0:
        debug_feature += f"Warning: Feature set <span style=\"color: darkblue;\">{set_names[i-1]}</span> has no valid features!<br>"
debug_feature += "</p>"
display(HTML(debug_feature))

                                                                                  # Debugging: Standardization Check for each feature set
debug_standard = "<p style=\"color: black; font-size: 16px; font-weight: bold;\">Standardization Checks:<br>"
for i, features in enumerate(feature_sets, start=1):
    valid_features = [f for f in features if f in sample_data.columns]
    if valid_features:
        data_for_clustering = sample_data[valid_features].copy()
        numerical_cols = data_for_clustering.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if len(numerical_cols) == 0:
            debug_standard += f" Warning: Feature set <span style=\"color: darkblue;\">{set_names[i-1]}</span> has NO numerical columns for clustering!<br>"
        else:
            scaler = StandardScaler()
            try:
                scaled_data = scaler.fit_transform(data_for_clustering[numerical_cols])
                debug_standard += f" Scaling successful for <span style=\"color: darkblue;\">{set_names[i-1]}</span> (<span style=\"color: green;\">{len(numerical_cols)}</span> numerical features).<br>"
            except Exception as e:
                debug_standard += f" Error scaling <span style=\"color: darkblue;\">{set_names[i-1]}</span>: {e}<br>"
debug_standard += "</p>"
display(HTML(debug_standard))

                                                                                  # Debugging: Check _id mapping
sample_ids = sample_data['_id'].values                                            # Store _id for mapping
duplicated_ids = sample_data['_id'].duplicated().sum()
html_id = """
<p style="color: black; font-size: 16px; font-weight: bold;">
    Checking _id mapping...<br>
    Total unique IDs: <span style="color: green;">{unique}</span>, Duplicates: <span style="color: green;">{dups}</span>
</p>
"""
display(HTML(html_id.format(unique=len(set(sample_ids)), dups=duplicated_ids)))
if duplicated_ids > 0:
    display(HTML("<p style=\"color: red; font-size: 16px; font-weight: bold;\">Warning: Duplicate _id values found!</p>"))

display(HTML("<p style=\"color: darkblue; font-size: 16px; font-weight: bold;\">Pre-clustering checks completed. Data is ready for clustering!</p>"))

                                                                                  # Prepare clustering results DataFrame
clustering_results = original_data.copy()                                         # Create a copy to store clustering results
for i in range(1, 6):                                                             # Add placeholder columns for clustering results for each set
    for algo in ['KMeans','DBSCAN']:
        clustering_results[f'{algo}{i}_Cluster'] = -1
        clustering_results[f'{algo}{i}_Silhouette_Score'] = np.nan
        clustering_results[f'{algo}{i}_Davies_Bouldin_Index'] = np.nan
        if algo == "KMeans":
            clustering_results[f'{algo}{i}_Calinski_Harabasz_Score'] = np.nan
        clustering_results[f'{algo}{i}_Prediction_Accuracy'] = np.nan

                                                                                  # Clustering for each feature set with debugging outputs
debug_cluster = "<p style=\"color: darkblue; font-size: 18px; font-weight: bold;\">Clustering Debug Info:<br>"
for i, features in enumerate(feature_sets, start=1):                              # Perform clustering on each feature set
    valid_features = [f for f in features if f in sample_data.columns]
    data_for_clustering = sample_data[valid_features].copy()
    sample_ids = sample_data['_id'].values                                        # Store _id for mapping back
    numerical_cols = data_for_clustering.select_dtypes(include=['int64', 'float64']).columns.tolist()
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(data_for_clustering[numerical_cols]), columns=numerical_cols)

    kmeans = KMeans(n_clusters=4, random_state=42)                                # KMeans Clustering
    kmeans_labels = kmeans.fit_predict(data_scaled)
    silhouette_score_kmeans = silhouette_score(data_scaled, kmeans_labels)
    davies_bouldin_score_kmeans = davies_bouldin_score(data_scaled, kmeans_labels)
    calinski_harabasz_score_kmeans = calinski_harabasz_score(data_scaled, kmeans_labels)
    kmeans_accuracy = max(0, silhouette_score_kmeans) * 100

    dbscan = DBSCAN(eps=0.5, min_samples=5)                                       # DBSCAN Clustering
    dbscan_labels = dbscan.fit_predict(data_scaled)
    silhouette_score_dbscan = -1 if len(set(dbscan_labels)) <= 1 else silhouette_score(data_scaled, dbscan_labels)
    davies_bouldin_score_dbscan = -1 if len(set(dbscan_labels)) <= 1 else davies_bouldin_score(data_scaled, dbscan_labels)
    dbscan_accuracy = max(0, silhouette_score_dbscan) * 100

    debug_cluster += f"Feature set <span style=\"color: darkblue;\">{set_names[i-1]}</span> - KMeans labels: <span style=\"color: green;\">{kmeans_labels[:10]}</span> ...<br>"
    debug_cluster += f"Feature set <span style=\"color: darkblue;\">{set_names[i-1]}</span> - DBSCAN labels: <span style=\"color: green;\">{dbscan_labels[:10]}</span> ...<br>"

    for idx, original_idx in enumerate(sample_ids):                               # Map clustering results back to original data based on feature _id
        clustering_results.loc[original_idx, f'KMeans{i}_Cluster'] = kmeans_labels[idx]
        clustering_results.loc[original_idx, f'KMeans{i}_Silhouette_Score'] = silhouette_score_kmeans
        clustering_results.loc[original_idx, f'KMeans{i}_Davies_Bouldin_Index'] = davies_bouldin_score_kmeans
        clustering_results.loc[original_idx, f'KMeans{i}_Calinski_Harabasz_Score'] = calinski_harabasz_score_kmeans
        clustering_results.loc[original_idx, f'KMeans{i}_Prediction_Accuracy'] = kmeans_accuracy
        clustering_results.loc[original_idx, f'DBSCAN{i}_Cluster'] = dbscan_labels[idx]
        clustering_results.loc[original_idx, f'DBSCAN{i}_Silhouette_Score'] = silhouette_score_dbscan
        clustering_results.loc[original_idx, f'DBSCAN{i}_Davies_Bouldin_Index'] = davies_bouldin_score_dbscan
        clustering_results.loc[original_idx, f'DBSCAN{i}_Prediction_Accuracy'] = dbscan_accuracy

debug_cluster += "</p>"
display(HTML(debug_cluster))

                                                                                  # Debugging: Check for NaN values in clustering_results and print a few rows
debug_nan = "<ul style=\"color: darkblue; font-size: 18px; font-weight: bold;\"><li>Checking for NaN values in clustering_results: <span style=\"color: green;\">{nan_dict}</span></li><li>Preview of clustering_results (first 10 rows): <span style=\"color: green;\">{preview}</span></li></ul>"
nan_dict = clustering_results.isnull().sum().to_dict()
preview = clustering_results.head(10).to_html(classes="table table-bordered", index=False)
display(HTML(debug_nan.format(nan_dict=nan_dict, preview=preview)))

clustering_results.to_csv('Best_Clustering_Models.csv', index=False)                  # Save the clustering results to a CSV file
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Clustering results saved as <span style="color: darkblue;">'clustering_results.csv'</span>.
    </p>
"""))

                                                                                  # Define the base columns (first file)
base_columns = ['_id', 'EVENT_UNIQUE_ID', 'OCC_YEAR', 'OCC_MONTH', 'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR', 'DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE', 'HOOD_158', 'NEIGHBOURHOOD_158', 'LONG_WGS84', 'LAT_WGS84', 'OCC_DATETIME', 'reporting_delay_days', 'reporting_delay_hours', 'Location_Engineered', 'Hood_158_Encoded', 'Division_Encoded', 'Location_Engineered_Other', 'Location_Engineered_Public', 'Location_Engineered_Residential', 'OCC_Month_Encoded', 'OCC_DOW_Encoded']
clustering_results_base = clustering_results[base_columns]                        # Create and save the base CSV file
clustering_results_base.to_csv('Clustering_Base_Features.csv', index=False)

                                                                                  # Define the clustering statistics columns (second file)
stats_columns = ['_id',
                 'KMeans1_Cluster', 'KMeans1_Silhouette_Score', 'KMeans1_Davies_Bouldin_Index', 'KMeans1_Calinski_Harabasz_Score', 'KMeans1_Prediction_Accuracy',
                 'DBSCAN1_Cluster', 'DBSCAN1_Silhouette_Score', 'DBSCAN1_Davies_Bouldin_Index', 'DBSCAN1_Prediction_Accuracy',
                 'KMeans2_Cluster', 'KMeans2_Silhouette_Score', 'KMeans2_Davies_Bouldin_Index', 'KMeans2_Calinski_Harabasz_Score', 'KMeans2_Prediction_Accuracy',
                 'DBSCAN2_Cluster', 'DBSCAN2_Silhouette_Score', 'DBSCAN2_Davies_Bouldin_Index', 'DBSCAN2_Prediction_Accuracy',
                 'KMeans3_Cluster', 'KMeans3_Silhouette_Score', 'KMeans3_Davies_Bouldin_Index', 'KMeans3_Calinski_Harabasz_Score', 'KMeans3_Prediction_Accuracy',
                 'DBSCAN3_Cluster', 'DBSCAN3_Silhouette_Score', 'DBSCAN3_Davies_Bouldin_Index', 'DBSCAN3_Prediction_Accuracy',
                 'KMeans4_Cluster', 'KMeans4_Silhouette_Score', 'KMeans4_Davies_Bouldin_Index', 'KMeans4_Calinski_Harabasz_Score', 'KMeans4_Prediction_Accuracy',
                 'DBSCAN4_Cluster', 'DBSCAN4_Silhouette_Score', 'DBSCAN4_Davies_Bouldin_Index', 'DBSCAN4_Prediction_Accuracy',
                 'KMeans5_Cluster', 'KMeans5_Silhouette_Score', 'KMeans5_Davies_Bouldin_Index', 'KMeans5_Calinski_Harabasz_Score', 'KMeans5_Prediction_Accuracy',
                 'DBSCAN5_Cluster', 'DBSCAN5_Silhouette_Score', 'DBSCAN5_Davies_Bouldin_Index', 'DBSCAN5_Prediction_Accuracy']
clustering_results_stats = clustering_results[stats_columns]                      # Create and save the clustering stats CSV file
clustering_results_stats.to_csv('Clustering_Result_Stats.csv', index=False)
files.download('Clustering_Result_Stats.csv')                                     # Download the clustering stats CSV file
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
         Clustering Base Features saved as <span style="color: darkblue;">'Clustering_Base_Features.csv'</span>.<br>
         Clustering Statistics saved as <span style="color: darkblue;">'Clustering_Result_Stats.csv'</span>.
    </p>
"""))
files.download('Best_Clustering_Models.csv')                                      # Download the clustering base CSV file
files.download('Clustering_Base_Features.csv')                                    # Download the clustering results CSV file
files.download('Clustering_Result_Stats.csv')                                     # Download the feature combo results CSV file

### **6.0 Descriptive Statistics - Best Clustering Models - Approach_3**

In [None]:
import pandas as pd
import ast
from IPython.display import display, HTML
from google.colab import files

# Load the best clustering results and the feature-combination lookup table
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Best_Clustering_Results.csv"
url1 = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
clustering_results = pd.read_csv(url)
feature_combos = pd.read_csv(url1)

# Feature-set identifiers, in the order they map to Set 1 .. Set 5
set_names = ['4_Set_165','4_Set_369', '4_Set_490', '4_Set_494', '4_Set_495']

# feature_sets[k] holds the feature names for set_names[k]; later code indexes it
# positionally, so every set name MUST resolve or the positions would shift.
feature_sets = []
for set_name in set_names:
    matched_features = feature_combos.loc[feature_combos['Feature Set'] == set_name, 'Feature_Names_String']
    if matched_features.empty:
        # Fail fast: silently skipping would pair later sets with the wrong features
        raise ValueError(f"Feature set '{set_name}' not found in Feature_Combo_Current_Results.csv")
    features_list = ast.literal_eval(matched_features.iloc[0])  # Stored as the string form of a Python list
    feature_sets.append(features_list)

def _first_row_metric(algorithm, set_number, metric):
    """Return the first-row value of the ``<algorithm><set_number>_<metric>`` column."""
    return clustering_results[f'{algorithm}{set_number}_{metric}'].iloc[0]


# One summary record per feature set (Set 1 .. Set 5)
combined_results = []
for i in range(1, 6):
    # KMeans metrics
    km_silhouette = _first_row_metric('KMeans', i, 'Silhouette_Score')
    km_davies_bouldin = _first_row_metric('KMeans', i, 'Davies_Bouldin_Index')
    km_calinski_harabasz = _first_row_metric('KMeans', i, 'Calinski_Harabasz_Score')
    kmeans_accuracy = _first_row_metric('KMeans', i, 'Prediction_Accuracy')

    # DBSCAN metrics (no Calinski-Harabasz column is read for DBSCAN)
    db_silhouette = _first_row_metric('DBSCAN', i, 'Silhouette_Score')
    db_davies_bouldin = _first_row_metric('DBSCAN', i, 'Davies_Bouldin_Index')
    dbscan_accuracy = _first_row_metric('DBSCAN', i, 'Prediction_Accuracy')

    record = {
        "Set": "Set " + str(i),
        "Features": ', '.join(feature_sets[i - 1]),
        "KMeans Silhouette Score": format(km_silhouette, ".3f"),
        "KMeans Davies-Bouldin Index": format(km_davies_bouldin, ".3f"),
        "KMeans Calinski-Harabasz Score": format(km_calinski_harabasz, ".0f"),
        # Accuracies are rendered bold with two decimals
        "KMeans Prediction Accuracy": "<strong>" + format(kmeans_accuracy, ".2f") + "%</strong>",
        "DBSCAN Silhouette Score": format(db_silhouette, ".3f"),
        "DBSCAN Davies-Bouldin Index": format(db_davies_bouldin, ".3f"),
        "DBSCAN Prediction Accuracy": "<strong>" + format(dbscan_accuracy, ".2f") + "%</strong>",
        # Raw numeric copy, kept only so the table can be ordered by it
        "DBSCAN Accuracy Float": dbscan_accuracy,
    }
    combined_results.append(record)

# Tabulate the records, then order them best-DBSCAN-accuracy first
df_combined = pd.DataFrame(combined_results)
df_sorted = df_combined.sort_values("DBSCAN Accuracy Float", ascending=False)

# Static opening of the summary table: title row, column headers, and the start of <tbody>
table_head = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
  <thead style='background-color: #4CAF50; color: white;'>
    <tr>
      <th colspan="9" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
        <strong>Clustering Summary Table</strong>
      </th>
    </tr>
    <tr>
      <th>Set</th>
      <th>Features</th>
      <th>KMeans Silhouette Score</th>
      <th>Davies-Bouldin Index</th>
      <th>Calinski-Harabasz Score</th>
      <th>KMeans Prediction Accuracy (%)</th>
      <th>DBSCAN Silhouette Score</th>
      <th>Davies-Bouldin Index</th>
      <th>DBSCAN Prediction Accuracy (%)</th>
    </tr>
  </thead>
  <tbody>
"""

# Columns of df_sorted rendered as table cells, in display order
display_columns = [
    'Set',
    'Features',
    'KMeans Silhouette Score',
    'KMeans Davies-Bouldin Index',
    'KMeans Calinski-Harabasz Score',
    'KMeans Prediction Accuracy',
    'DBSCAN Silhouette Score',
    'DBSCAN Davies-Bouldin Index',
    'DBSCAN Prediction Accuracy',
]
cell_style = "border: 1px solid #dddddd; padding: 8px;"

# Render one <tr> per record, in the already-sorted order
row_chunks = []
for _, row in df_sorted.iterrows():
    cells = "".join(
        f"      <td style='{cell_style}'>{row[column]}</td>\n" for column in display_columns
    )
    row_chunks.append(f"\n    <tr style='border: 1px solid #dddddd;'>\n{cells}    </tr>\n    ")

html_table = table_head + "".join(row_chunks) + "</tbody></table>"

# Save the HTML table to a file (explicit UTF-8 so the output does not depend on the platform default)
with open("Best_Clusters_Summary_Results.html", "w", encoding="utf-8") as f:
    f.write(html_table)

# Trigger the download only after the `with` block has closed the file: while the
# handle is still open the content may sit in the write buffer, and the
# downloaded file could be empty or truncated.
files.download("Best_Clusters_Summary_Results.html")

# Display the HTML table in Google Colab
display(HTML(html_table))

# Display formatted message for saved file
display(HTML("""
    <p style="color: darkblue; font-size: 18px; font-weight: bold;">
        Clustering Model summary results saved  as <span style="color: green;">Best_Clusters_Summary_Results.html</span>.
    </p>
"""))


### **Code Sections/6.1 3D Visualizations of Top 5 Clusters - Approach 3.ipynb**

In [None]:
!pip install -q kaleido  # Required by plotly's fig.write_image for the static PNG export below


In [None]:
import pandas as pd
import plotly.express as px
from IPython.display import display, HTML
import ast, html, base64, os, copy

#!pip install -q kaleido  # Uncomment if kaleido is not installed

# Running concatenation of every interactive figure's HTML wrapper
all_html = ""


def display_interactive_table(title, fig):
    """Show a plotly figure inside an 800px-wide bordered table and record its HTML.

    The figure is modified in place (autosize on, height 580, marker outlines
    removed), serialised to an HTML fragment that loads plotly.js from the CDN,
    escaped, and embedded through an iframe ``srcdoc``. The resulting wrapper
    is appended to the module-level ``all_html`` and displayed.

    Args:
        title: Accepted for the caller's convenience; not used in the output.
        fig: Figure object providing ``update_layout``, ``update_traces`` and ``to_html``.
    """
    global all_html

    # Size the plot area slightly below the 610px iframe that hosts it
    fig.update_layout(autosize=True, height=580)
    fig.update_traces(marker_line_width=0)

    # HTML fragment (not a full page) with the mode bar always shown
    plot_markup = fig.to_html(
        full_html=False,
        include_plotlyjs='cdn',
        config={"displayModeBar": True},
    )
    # Escaping is needed because the fragment is placed inside an HTML attribute value
    srcdoc_value = html.escape(plot_markup)
    framed_plot = f"""<iframe srcdoc="{srcdoc_value}" style="position:relative; width:800px; height:610px; border:none;"></iframe>"""

    wrapper_html = f"""
      <table style="border-collapse: collapse; width:800px; margin:auto; border:2px solid black;">
        <tbody>
          <tr>
              <td style="padding:0; margin:0;">{framed_plot}</td>
          </tr>
        </tbody>
      </table>
      <br>
      """

    all_html = all_html + wrapper_html
    display(HTML(wrapper_html))

# Helpers for embedding static images in the notebook output
def file_to_base64(filepath):
    """Return the contents of ``filepath`` as a base64-encoded text string.

    The file is read in binary mode, so any file type is accepted.
    """
    with open(filepath, "rb") as source:
        encoded_bytes = base64.b64encode(source.read())
    return encoded_bytes.decode('utf-8')

def display_static_image(filepath):
    """Display the image at ``filepath`` inline, centred in an 800px-wide table.

    The file is embedded as a base64 ``data:image/png`` URI, so the rendered
    output does not need the file on disk afterwards.
    """
    encoded_image = file_to_base64(filepath)
    image_markup = f"""
    <table style="border-collapse: collapse; width:800px; margin:auto;">
      <tr>
         <td style="border: 1px solid #dddddd; text-align:center;">
            <img src="data:image/png;base64,{encoded_image}" style="width:100%; border:1px solid #dddddd;" />
         </td>
      </tr>
    </table>
    <br>
    """
    display(HTML(image_markup))

# Load datasets: base features, per-record clustering stats, and the feature-combination lookup
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Clustering_Base_Features.csv"
url1 = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Clustering_Result_Stats.csv"
url2 = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering_Approach_3/refs/heads/main/Output_CSV/Feature_Combo_Current_Results.csv"
features_df = pd.read_csv(url)
clustering_stats_df = pd.read_csv(url1)
# Left join keeps every feature row, attaching cluster labels/metrics where the "_id" matches
clustering_results = pd.merge(features_df, clustering_stats_df, on="_id", how="left")
feature_combos = pd.read_csv(url2)

# Extract feature sets; feature_sets[k] corresponds to set_names[k] and is indexed
# positionally by the plotting loop, so every set name MUST resolve.
set_names = ['4_Set_165', '4_Set_369', '4_Set_490', '4_Set_494', '4_Set_495']
feature_sets = []
for set_name in set_names:
    matched_features = feature_combos.loc[feature_combos['Feature Set'] == set_name, 'Feature_Names_String']
    if matched_features.empty:
        # Fail fast: silently skipping would shift later sets onto the wrong feature axes
        raise ValueError(f"Feature set '{set_name}' not found in Feature_Combo_Current_Results.csv")
    features_list = ast.literal_eval(matched_features.iloc[0])  # Stored as the string form of a Python list
    feature_sets.append(features_list)

color_sequence = px.colors.qualitative.Plotly
static_outputs = []  # to store PNG filenames

# Metrics shown in each figure's statistics box: (label, column suffix, format).
# KMeans additionally reports the Calinski-Harabasz score.
_CLUSTER_METRICS = {
    "KMeans": (
        ("Silhouette", "Silhouette_Score", "{:.3f}"),
        ("DB", "Davies_Bouldin_Index", "{:.3f}"),
        ("CH", "Calinski_Harabasz_Score", "{:.0f}"),
        ("Accuracy", "Prediction_Accuracy", "{:.2f}%"),
    ),
    "DBSCAN": (
        ("Silhouette", "Silhouette_Score", "{:.3f}"),
        ("DB", "Davies_Bouldin_Index", "{:.3f}"),
        ("Accuracy", "Prediction_Accuracy", "{:.2f}%"),
    ),
}


def _add_text_box(fig, x, y, text, font_size, align, borderpad):
    """Add a borderless, semi-transparent text annotation positioned in paper coordinates."""
    fig.add_annotation(
        x=x, y=y, xref="paper", yref="paper",
        text=text,
        showarrow=False, align=align,
        font=dict(size=font_size),
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=0,
        borderpad=borderpad
    )


def _plot_cluster_set(data, algorithm, set_number, features_used):
    """Build, display and export the 3D cluster scatter for one algorithm and feature set.

    Args:
        data: DataFrame holding the feature columns plus the
            ``<algorithm><set_number>_*`` cluster-label and metric columns.
        algorithm: ``"KMeans"`` or ``"DBSCAN"`` (a key of ``_CLUSTER_METRICS``).
        set_number: 1-based index of the feature set.
        features_used: Feature names for this set; the first three become the x/y/z axes.

    Returns:
        The PNG filename that was written, or ``None`` when no row qualifies for plotting.
    """
    cluster_col = f'{algorithm}{set_number}_Cluster'
    x_feature, y_feature, z_feature = features_used[:3]

    # Keep rows that have a cluster label other than -1 and values for all three axes
    keep = data[cluster_col].notna() & (data[cluster_col] != -1)
    for feature in (x_feature, y_feature, z_feature):
        keep &= data[feature].notna()
    plot_data = data[keep]
    if plot_data.empty:
        return None

    # Marker size encodes how many rows belong to the point's cluster
    cluster_sizes = plot_data.groupby(cluster_col).size().rename('cluster_size')
    plot_data = plot_data.merge(cluster_sizes, left_on=cluster_col, right_index=True)
    fig = px.scatter_3d(
        plot_data,
        x=x_feature,
        y=y_feature,
        z=z_feature,
        color=plot_data[cluster_col].astype(str),  # string labels -> discrete colours
        size='cluster_size',
        size_max=50,
        color_discrete_sequence=color_sequence
    )
    fig.update_traces(marker_line_width=0)

    # Tight margins; the 3D scene occupies the right 80% so the left strip can
    # hold the legend and the text annotations.
    fig.update_layout(
        margin=dict(t=20, b=1, l=30, r=1),
        scene=dict(
            xaxis_title=x_feature,
            yaxis_title=y_feature,
            zaxis_title=z_feature,
            domain=dict(x=[0.2, 1], y=[0, 1])
        ),
        legend=dict(
            orientation="v",        # Vertical legend
            yanchor="top",          # y refers to the legend's top edge
            y=0.68,                 # Vertical position in paper coordinates
            xanchor="left",         # x refers to the legend's left edge
            x=0,                    # Flush with the left side
            itemwidth=30,           # Width of each legend item
            itemsizing="constant",  # Legend symbols keep a fixed size
            borderwidth=0,          # No visible border
            bordercolor="gray",     # Border colour (only visible if borderwidth > 0)
            tracegroupgap=7         # Gap between legend groups
        )
    )

    # Plot title, drawn as an annotation above the scene
    title = f"{algorithm} Clustering Set {set_number}"
    _add_text_box(fig, x=0.1, y=1.018, text=f"<b>{title}</b>",
                  font_size=22, align="center", borderpad=2)

    # Feature names, one per line, near the top-left
    wrapped_features = "<b>Features:</b> " + ",<br>".join(features_used)
    _add_text_box(fig, x=-0.04, y=0.9, text=wrapped_features,
                  font_size=15, align="left", borderpad=0)

    # Clustering statistics (read from the first row of each metric column), bottom-left
    stat_lines = []
    for label, suffix, fmt in _CLUSTER_METRICS[algorithm]:
        value = data[f'{algorithm}{set_number}_{suffix}'].iloc[0]
        stat_lines.append(f"{label}: " + fmt.format(value))
    stats_text = "<br>".join(stat_lines)
    _add_text_box(fig, x=-0.04, y=0, text=f"<b>{stats_text}</b>",
                  font_size=16, align="left", borderpad=0)

    # Display interactive figure
    display_interactive_table(title, fig)

    # Export a static PNG of the same figure (the figure is not reused afterwards,
    # so no defensive copy is needed)
    filename = f"{algorithm.lower()}_cluster_set_{set_number}.png"
    fig.write_image(filename, width=800, height=600)
    return filename


# Process clustering sets: for each feature set, plot KMeans first and then DBSCAN
for i in range(1, 6):
    features_used = feature_sets[i - 1]
    for algorithm in ("KMeans", "DBSCAN"):
        filename = _plot_cluster_set(clustering_results, algorithm, i, features_used)
        if filename is not None:
            static_outputs.append(filename)

# Once every interactive figure has been shown, render the exported PNGs in the
# order they were written; a file that is missing on disk gets a red error line.
for filepath in static_outputs:
    if not os.path.exists(filepath):
        display(HTML(f"<p style='text-align:center; color:red;'>Error: {filepath} not found.</p>"))
        continue
    display_static_image(filepath)

# Persist the accumulated interactive markup as a single HTML file
with open("interactive_visuals.html", "w") as html_out:
    html_out.write(all_html)

display(HTML("<h3 style='text-align:center;'>Interactive HTML and static PNG images have been saved.</h3>"))