<a href="https://colab.research.google.com/github/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/blob/main/CrimeAnalytics_Clustering_5_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.1 Learning Application Domain: Summary Table**

In [None]:
# Keep the notebook output free of FutureWarning messages
warnings.simplefilter('ignore', FutureWarning)

# Load the crime-indicator CSV; low_memory=False asks pandas to parse the file
# in one pass rather than inferring column types chunk by chunk
df = pd.read_csv(
    '/content/major-crime-indicators.csv',
    low_memory=False,
)

# Function to check leading/trailing spaces, ensuring the column is treated as a string
def count_leading_trailing_spaces(column):
    """Count the values in a column that begin or end with a space.

    The column is cast to ``str`` first so the ``.str`` accessor can be used
    regardless of the column's original dtype.

    Args:
        column: pandas Series to inspect.

    Returns:
        A ``(leading, trailing)`` pair: how many values start with a space and
        how many end with one.
    """
    as_text = column.astype(str)
    leading = as_text.str.startswith(' ').sum()
    trailing = as_text.str.endswith(' ').sum()
    return leading, trailing

# Function to build the per-column summary table, including Null counts and the percentage of Null values
def prepare_summary_table(df):
    """Build a per-column data-quality summary of a DataFrame.

    For every column the summary records its dtype, total number of entries,
    number of unique values, number and percentage of Null values, and how
    many values start or end with a space.

    Args:
        df: DataFrame to profile.

    Returns:
        A tuple ``(summary_table, null_counts, null_percentages)``:
        ``summary_table`` is a DataFrame with one row per column of ``df``,
        ``null_counts`` is the per-column Null count, and ``null_percentages``
        is the unrounded per-column Null percentage.
    """
    unique_values = df.nunique()

    # Count Nulls once and reuse the result for both the total and the ratio
    null_counts = df.isnull().sum()
    total_values = df.count() + null_counts  # Non-null entries plus Null entries

    # Calculate the percentage of Null values compared to total values
    null_percentages = (null_counts / total_values) * 100

    # Collect the (leading, trailing) pairs first and split them afterwards;
    # unpacking zip(*pairs) directly raises ValueError when df has no columns
    space_counts = [count_leading_trailing_spaces(df[col]) for col in df.columns]
    leading_spaces = [leading for leading, _ in space_counts]
    trailing_spaces = [trailing for _, trailing in space_counts]

    # Create summary DataFrame
    summary_table = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": unique_values,
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces
    })

    return summary_table, null_counts, null_percentages

# Function to generate the summary HTML table with color coding for Null, Leading, and Trailing Spaces
def generate_summary_html(summary_table):
    """Render the summary table as a styled HTML ``<table>``.

    Null, leading-space and trailing-space counts are colored red when they
    are greater than zero and green otherwise.

    Args:
        summary_table: DataFrame with the columns "Column", "Data Type",
            "Total Values", "Unique Values", "Null Values", "Null %",
            "Leading Spaces" and "Trailing Spaces".

    Returns:
        A string containing a ``<style>`` block followed by the HTML table.
    """
    # Local import keeps this function self-contained
    from html import escape

    def flag_color(count):
        # Any positive count is a data-quality issue and is highlighted in red
        return "red" if count > 0 else "green"

    header = """
    <style>
        table {
            border-collapse: collapse;
            width: 100%;
            font-family: Arial, sans-serif;
        }
        table th, table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        table th {
            background-color: #4CAF50;
            color: white;
            font-size: 1.1em;
        }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """

    # Gather the fragments and join once instead of growing a string with +=
    parts = [header]
    for _, row in summary_table.iterrows():
        null_color = flag_color(row['Null Values'])
        leading_color = flag_color(row['Leading Spaces'])
        trailing_color = flag_color(row['Trailing Spaces'])

        # Column names and dtype text are free-form, so escape them to keep
        # characters such as < or & from breaking the markup
        column_label = escape(str(row['Column']))
        dtype_label = escape(str(row['Data Type']))

        parts.append(f"""
        <tr>
            <td>{column_label}</td>
            <td>{dtype_label}</td>
            <td>{row['Total Values']}</td>
            <td>{row['Unique Values']}</td>
            <td style='color:{null_color};'>{row['Null Values']}</td>
            <td>{row['Null %']}</td>
            <td style='color:{leading_color};'>{row['Leading Spaces']}</td>
            <td style='color:{trailing_color};'>{row['Trailing Spaces']}</td>
        </tr>
        """)
    parts.append("</table>")
    return "".join(parts)

# Function to plot missing percentage visualization
def plot_missing_percentage(df, dataset_name):
    """Save a horizontal bar chart of the missing-value percentage per column.

    Only columns that contain at least one missing value are plotted. If no
    column has missing values, a notice is printed and nothing is drawn.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the chart title and the printed notice.

    Returns:
        The path of the saved PNG, or an empty string when there was nothing
        to plot. The output path is fixed and does not depend on
        ``dataset_name``.
    """
    null_share = df.isnull().mean() * 100
    columns_with_gaps = null_share[null_share > 0]

    # Guard clause: nothing to draw when every column is complete
    if columns_with_gaps.empty:
        print(f"No missing data in {dataset_name}. Skipping missing percentage plot.")
        return ""

    missing_plot_path = '/content/Original Data Missing PercentPlot.png'

    plt.figure(figsize=(10, 6))
    ordered = columns_with_gaps.sort_values()
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(missing_plot_path)
    plt.close()  # Release the figure so it is not rendered again later

    return missing_plot_path

# Suggest actions based on Null values
def get_suggestions(null_counts):
    """Build HTML suggestion snippets from per-column Null counts.

    Args:
        null_counts: Series mapping each column label to its Null count.

    Returns:
        A list of HTML ``<div>`` strings: a green one naming the columns
        without Null values (if any) followed by a red one naming the columns
        with Null values (if any). The list is empty when ``null_counts`` is
        empty.
    """
    # Local import keeps this function self-contained
    from html import escape

    def join_labels(labels):
        # Labels may be non-strings (e.g. integer headers) and may contain
        # HTML-special characters, so stringify and escape before joining
        return ', '.join(escape(str(label)) for label in labels)

    suggestions = []

    # Handle columns with no Null values
    no_null_columns = null_counts[null_counts == 0].index.tolist()
    if no_null_columns:
        suggestions.append(f"<div style='color:green;'><strong>Columns {join_labels(no_null_columns)}</strong> have no Null values. <em>No action is needed.</em></div>")

    # Handle columns with Null values
    columns_with_null_data = null_counts[null_counts > 0].index.tolist()
    if columns_with_null_data:
        suggestions.append(f"<div style='color:red;'><strong>Columns {join_labels(columns_with_null_data)}</strong> have Null values. <em>Actions should be taken.</em></div>")

    return suggestions

# Function to display the summary and suggestions
def display_summary_for_file(df, title):
    """Write the summary report for a DataFrame and save its missing-data plot.

    The report (summary table plus suggestions) is written as an HTML file;
    the missing-percentage chart is saved separately and is not embedded in
    the HTML.

    Args:
        df: DataFrame to summarize.
        title: Text used in the HTML title and heading, and passed to the
            plotting function as the dataset name.

    Returns:
        A tuple ``(summary_html_path, missing_plot_path)``. The second item is
        whatever ``plot_missing_percentage`` returned.
    """
    # The Null percentages are already part of the table, so the third
    # return value is not needed here
    summary_table, null_counts, _ = prepare_summary_table(df)
    table_markup = generate_summary_html(summary_table)
    suggestions_markup = "".join(get_suggestions(null_counts))

    # Save the missing-percentage chart and remember where it went
    missing_plot_path = plot_missing_percentage(df, title)

    # Assemble the page from the table and the suggestions only
    page = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2>Summary Table for {title}</h2>
        {table_markup}
        <h2>Suggestions</h2>
        {suggestions_markup}
    </body>
    </html>
    """

    # Persist the page to a fixed location
    summary_html_path = '/content/5.1 Learning Application Domain.html'
    with open(summary_html_path, 'w', encoding='utf-8') as report:
        report.write(page)

    return summary_html_path, missing_plot_path

# Save and display summary for dataset
html_file_path, missing_plot_path = display_summary_for_file(df, "Analysis on Original Data")

# Read the report back as UTF-8 (the encoding it was written with) and close
# the file handle once the content has been loaded
with open(html_file_path, encoding='utf-8') as report_file:
    report_html = report_file.read()

# Display the summary table and suggestions in the notebook output
display(HTML(f"<h2>Summary Table and Suggestions for Major Crime Indicators Dataset</h2><br>{report_html}"))

# Download the generated files
files.download(html_file_path)  # Summary Table + Suggestions as HTML
# The plot path is an empty string when no column has missing values, in
# which case no PNG was produced and there is nothing to download
if missing_plot_path:
    files.download(missing_plot_path)  # Missing Data Percentage plot as PNG