<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.1%20Learning%20Application%20Domain%20Summary%20Table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.1 Learning Application Domain: Summary Table**

In [1]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import os
import contextlib
from google.colab import files
from IPython.display import display, HTML
from kagglehub import KaggleDatasetAdapter

print("\n\n")
# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)                    # Ignore future warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)                    # Ignore deprecation warnings

file_path = "major-crime-indicators.csv"                                          # Name of the file to load from the Kaggle dataset

# Load the dataset file through kagglehub's pandas adapter; the result is used as a DataFrame below.
# NOTE(review): only stdout is redirected to os.devnull here. The captured cell output still
# shows the "Downloading from ..." message and the progress bar, so those are presumably
# written to stderr and are not suppressed by this redirect -- confirm.
with open(os.devnull, 'w') as fnull:                                              # Sink for anything printed to stdout during the load
    with contextlib.redirect_stdout(fnull):
        df = kagglehub.load_dataset(
            KaggleDatasetAdapter.PANDAS,
            "mohammadbadi/crimes-in-toronto",                                     # Dataset handle on Kaggle
            file_path,
        )

def count_leading_trailing_spaces(column):
    """Count the values in *column* that start or end with a space character.

    The column is cast to ``str`` first so the ``.str`` accessor can be used
    regardless of the column's dtype.

    Returns:
        A ``(leading, trailing)`` tuple: the number of values starting with
        ``' '`` and the number of values ending with ``' '``.
    """
    as_text = column.astype(str)
    leading_total = as_text.str.startswith(' ').sum()
    trailing_total = as_text.str.endswith(' ').sum()
    return leading_total, trailing_total

def prepare_summary_table(df):
    """Build a per-column data-quality summary for *df*.

    For every column the summary holds its dtype, total number of values
    (non-null plus null), number of unique values, number and percentage of
    nulls, and how many values start or end with a space.

    Returns:
        A ``(summary_table, null_counts, null_percentages)`` tuple. The
        percentages in the table are rounded to one decimal; the returned
        ``null_percentages`` Series is unrounded.
    """
    null_counts = df.isnull().sum()
    total_values = df.count() + null_counts                                       # Non-null values plus nulls
    null_percentages = null_counts.div(total_values).mul(100)                     # Share of nulls per column, in percent

    # One (leading, trailing) pair per column, transposed into two tuples
    per_column_spaces = [count_leading_trailing_spaces(df[name]) for name in df.columns]
    leading_spaces, trailing_spaces = zip(*per_column_spaces)

    table_columns = {
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": df.nunique(),
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces,
    }
    summary_table = pd.DataFrame(table_columns)

    return summary_table, null_counts, null_percentages

def generate_summary_html(summary_table):
    """Render *summary_table* as a styled HTML table.

    Args:
        summary_table: DataFrame with the columns "Column", "Data Type",
            "Total Values", "Unique Values", "Null Values", "Null %",
            "Leading Spaces" and "Trailing Spaces" (one row per dataset column).

    Returns:
        A string containing a ``<style>`` block followed by a ``<table>``.
        Null, leading-space and trailing-space counts are coloured red when
        greater than zero and green otherwise.
    """
    from html import escape                                                       # Local import keeps this block self-contained

    def _flag_color(count):
        # Red marks a column that needs attention, green marks a clean one
        return "red" if count > 0 else "green"

    fragments = ["""
    <style>
        table {
            border-collapse: collapse;
            width: 100%;
            font-family: Arial, sans-serif;
        }
        table th, table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        table th {
            background-color: #4CAF50;
            color: white;
            font-size: 1.1em;
        }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """]

    for _, row in summary_table.iterrows():
        null_color = _flag_color(row['Null Values'])
        leading_color = _flag_color(row['Leading Spaces'])
        trailing_color = _flag_color(row['Trailing Spaces'])

        # Column names and dtypes come from the dataset, so escape them to
        # keep characters such as '<' or '&' from breaking the markup.
        column_label = escape(str(row['Column']))
        dtype_label = escape(str(row['Data Type']))

        fragments.append(f"""
        <tr>
            <td>{column_label}</td>
            <td>{dtype_label}</td>
            <td>{row['Total Values']}</td>
            <td>{row['Unique Values']}</td>
            <td style='color:{null_color};'>{row['Null Values']}</td>
            <td>{row['Null %']}</td>
            <td style='color:{leading_color};'>{row['Leading Spaces']}</td>
            <td style='color:{trailing_color};'>{row['Trailing Spaces']}</td>
        </tr>
        """)

    fragments.append("</table>")
    return "".join(fragments)                                                     # Single join instead of repeated string +=

def plot_missing_percentage(df, dataset_name):
    """Save a horizontal bar chart of the per-column missing-value percentage.

    Only columns that contain at least one missing value are plotted.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the chart title and in the skip message.

    Returns:
        The path of the saved PNG, or an empty string when no column has
        missing values (a message is printed and nothing is plotted).
    """
    pct_missing = df.isnull().mean() * 100                                        # Percentage of nulls per column
    columns_with_gaps = pct_missing[pct_missing > 0]                              # Keep only columns that have missing values

    if columns_with_gaps.empty:                                                   # Nothing to plot
        print(f"No missing data in {dataset_name}. Skipping missing percentage plot.")
        return ""

    missing_plot_path = '/content/5.1 Original Data Missing PercentPlot.png'      # Where the chart is written
    ordered = columns_with_gaps.sort_values()

    plt.figure(figsize=(10, 6))
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(missing_plot_path)
    plt.close()                                                                   # Close the figure so it is not displayed again

    return missing_plot_path

def get_suggestions(null_counts):
    """Turn per-column null counts into HTML suggestion snippets.

    Args:
        null_counts: Series indexed by column label whose values are the
            number of nulls in that column.

    Returns:
        A list of zero, one or two ``<div>`` strings: a green one listing the
        columns without nulls (if any), followed by a red one listing the
        columns with nulls (if any).
    """
    from html import escape                                                       # Local import keeps this block self-contained

    def _labels(mask):
        # Labels may be non-string (e.g. integer column names), so convert
        # before joining, and escape because they are embedded in HTML.
        return [escape(str(label)) for label in null_counts[mask].index]

    suggestions = []

    no_null_columns = _labels(null_counts == 0)                                   # Columns with no Null values
    if no_null_columns:
        suggestions.append(f"<div style='color:green;'><strong>Columns {', '.join(no_null_columns)}</strong> have no Null values. <em>No action is needed.</em></div>")

    columns_with_null_data = _labels(null_counts > 0)                             # Columns with Null values
    if columns_with_null_data:
        suggestions.append(f"<div style='color:red;'><strong>Columns {', '.join(columns_with_null_data)}</strong> have Null values. <em>Actions should be taken.</em></div>")

    return suggestions

def display_summary_for_file(df, title):
    """Write the summary report for *df* and create its missing-data plot.

    The HTML report contains the summary table and the suggestions only; the
    plot is produced separately and is not embedded in the report.

    Args:
        df: DataFrame to summarise.
        title: Text used in the report's title and heading and passed to the
            plotting function as the dataset name.

    Returns:
        A ``(summary_html_path, missing_plot_path)`` tuple. The second item is
        whatever ``plot_missing_percentage`` returned.
    """
    summary_table, null_counts, _ = prepare_summary_table(df)                     # Percentages are not needed here
    table_markup = generate_summary_html(summary_table)
    suggestion_markup = "".join(get_suggestions(null_counts))                     # Suggestions derived from the null counts

    missing_plot_path = plot_missing_percentage(df, title)                        # Saved plot path (or "" when skipped)

    # Assemble the standalone HTML page: table first, then suggestions
    complete_html = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2>Summary Table for {title}</h2>
        {table_markup}
        <h2>Suggestions</h2>
        {suggestion_markup}
    </body>
    </html>
    """

    summary_html_path = '/content/5.1 Learning Application Domain.html'           # Destination of the HTML report
    with open(summary_html_path, 'w', encoding='utf-8') as report:
        report.write(complete_html)

    return summary_html_path, missing_plot_path

html_file_path, missing_plot_path = display_summary_for_file(df, "Analysis on Original Data") # Save summary report and plot for the dataset

# Read the report back with the same encoding it was written with; the
# context manager closes the file handle instead of leaving it open.
with open(html_file_path, encoding='utf-8') as report_file:
    report_html = report_file.read()

display(HTML(f"<h2>Summary Table and Suggestions for Major Crime Indicators Dataset</h2><br>{report_html}"))  # Display the summary table and suggestions in the notebook

files.download(html_file_path)                                                    # Download the generated Summary Table + Suggestions as HTML
if missing_plot_path:                                                             # An empty path means no plot was created (no missing data)
    files.download(missing_plot_path)                                             # Download the generated Missing Data Percentage plot as PNG
print("\n\n")




Downloading from https://www.kaggle.com/api/v1/datasets/download/mohammadbadi/crimes-in-toronto?dataset_version_number=1&file_name=major-crime-indicators.csv...


100%|██████████| 117M/117M [00:02<00:00, 42.7MB/s]


Column,Data Type,Total Values,Unique Values,Null Values,Null %,Leading Spaces,Trailing Spaces
_id,int64,420200,420200,0,0.0,0,0
EVENT_UNIQUE_ID,object,420200,366257,0,0.0,0,0
REPORT_DATE,object,420200,4018,0,0.0,0,0
OCC_DATE,object,420200,4518,0,0.0,0,0
REPORT_YEAR,int64,420200,11,0,0.0,0,0
REPORT_MONTH,object,420200,12,0,0.0,0,0
REPORT_DAY,int64,420200,31,0,0.0,0,0
REPORT_DOY,int64,420200,366,0,0.0,0,0
REPORT_DOW,object,420200,7,0,0.0,0,420200
REPORT_HOUR,int64,420200,24,0,0.0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>




