<a href="https://colab.research.google.com/github/mohammadbadi/CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.4.3%20Data%20Reduction%20and%20Projection(FGH).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.4 Data Reduction and Projection - f) Feature Engineering, g) Feature Encoding and h) Descriptive Statistics**


In [4]:
import warnings                                                                   # Import necessary libraries
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from google.colab import files
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

print("\n\n")                                                                     # Blank lines to separate this cell's output from anything above it
for _ignored_category in (DeprecationWarning, FutureWarning):                     # Ignore deprecation warnings first, then future warnings
    warnings.filterwarnings("ignore", category=_ignored_category)

def display_formatted_message(message, filename=None, additional_info=None):      # Helper function for creating formatted HTML output messages
    """Show a bold, dark-blue status paragraph in the notebook output.

    Args:
        message: Text placed at the start of the paragraph.
        filename: Optional value appended in green and wrapped in single
            quotes; skipped when falsy.
        additional_info: Optional value appended in purple; skipped when
            falsy.

    Values are interpolated into the markup as-is (no HTML escaping).
    """
    pieces = [
        '\n    <p style="color: darkblue; font-size: 18px; font-weight: bold;">\n',
        f'        {message}\n    ',
    ]
    if filename:
        pieces.append(f""" <span style="color: green;">'{filename}'</span>""")
    if additional_info:
        pieces.append(f' <span style="color: purple;">{additional_info}</span>')
    pieces.append('\n    </p>\n    ')
    display(HTML(''.join(pieces)))

url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Final_Data.csv"    # Read the dataset from CSV file
df = pd.read_csv(url)

residential_types = [                                                             # Define categories for Feature Engineering of Location
    'Apartment (Rooming House, Condo)',
    'Single Home, House (Attach Garage, Cottage, Mobile)',
    'Group Homes (Non-Profit, Halfway House, Social Agency)',
    'Community Group Home', 'Retirement Home', 'Nursing Home',
    'Private Property Structure (Pool, Shed, Detached Garage)'
]
public_types = [
    'Streets, Roads, Highways (Bicycle Path, Private Road)',
    'Open Areas (Lakes, Parks, Rivers)',
    "Other Non Commercial / Corporate Places (Non-Profit, Gov'T, Firehall)",
    'Parking Lots (Apt., Commercial Or Non-Commercial)'
]

# Build one lookup table so each row needs a single dictionary lookup instead
# of a Python-level lambda scanning both lists. Any LOCATION_TYPE not listed
# above (including missing values) maps to NaN and is then filled as 'Other'.
location_category_by_type = {location: 'Residential' for location in residential_types}
location_category_by_type.update({location: 'Public' for location in public_types})
df['Location_Engineered'] = df['LOCATION_TYPE'].map(location_category_by_type).fillna('Other')

location_counts = df['Location_Engineered'].value_counts()                        # Count occurrences of each location category
residential_count = int(location_counts.get('Residential', 0))                    # Reuse the counts above rather than re-filtering the frame per category
public_count = int(location_counts.get('Public', 0))
other_count = int(location_counts.get('Other', 0))
total_rows = df.shape[0]
# Sanity check: every row is assigned one of the three labels above, so this
# is expected to be 0; a non-zero value would mean an unexpected label.
not_in_categories = total_rows - (residential_count + public_count + other_count)

                                                                                  # Display category distribution with formatted HTML - using the same style as the formatted messages
category_distribution = f"""
<p style="color: darkblue; font-size: 18px; font-weight: bold;">
    Total Residential rows: <span style="color: green;">{residential_count}</span><br>
    Total Public rows: <span style="color: green;">{public_count}</span><br>
    Total Other rows: <span style="color: green;">{other_count}</span><br>
    Total rows not in any category: <span style="color: green;">{not_in_categories}</span>
</p>
"""
display(HTML(category_distribution))

df.to_csv("FEngineered_Data.csv", index=False)                                    # Save engineered dataset
display_formatted_message("Locations categorized and data saved to", "FEngineered_Data.csv")

df_encoded = df.copy()                                                            # Feature Encoding - copy of previous dataframe

# Frequency encoding: replace each neighbourhood / division label with its
# relative frequency (value_counts(normalize=True)) in the dataset.
hood_counts = df_encoded['HOOD_158'].value_counts(normalize=True)
df_encoded['Hood_158_Encoded'] = df_encoded['HOOD_158'].map(hood_counts)
division_counts = df_encoded['DIVISION'].value_counts(normalize=True)
df_encoded['Division_Encoded'] = df_encoded['DIVISION'].map(division_counts)

encoder = OneHotEncoder(sparse_output=False)                                      # One-hot encode the engineered location categories
location_encoded = encoder.fit_transform(df_encoded[['Location_Engineered']])
location_encoded_df = pd.DataFrame(
    location_encoded,
    columns=encoder.get_feature_names_out(['Location_Engineered']),
    # concat(axis=1) aligns on index labels; reuse the source index so the
    # one-hot rows stay attached to the right records even if df_encoded
    # does not carry a default 0..n-1 index.
    index=df_encoded.index,
)
df_encoded = pd.concat([df_encoded, location_encoded_df], axis=1)

month_mapping = {                                                                 # Map month names to numerical values
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df_encoded['OCC_Month_Encoded'] = df_encoded['OCC_MONTH'].map(month_mapping)      # Month names absent from the mapping become NaN

# NOTE: LabelEncoder numbers the classes in sorted order of the labels, so the
# day codes follow alphabetical order of the day names, not calendar order.
dow_encoder = LabelEncoder()                                                      # Label encode the OCC_DOW (Day of Week column)
df_encoded['OCC_DOW_Encoded'] = dow_encoder.fit_transform(df_encoded['OCC_DOW'])

df_encoded.to_csv("FE_Encoded.csv", index=False)                                  # Save encoded dataset
display_formatted_message("After encoding, data saved to", "FE_Encoded.csv")

display_formatted_message("Sample of encoded columns:", additional_info="(first 5 rows)") # Display sample of encoded columns with formatted HTML
sample_columns = [                                                                # Original columns side by side with their encoded counterparts
    'DIVISION', 'Division_Encoded',
    'HOOD_158', 'Hood_158_Encoded',
    'Location_Engineered', 'Location_Engineered_Public',
    'Location_Engineered_Residential', 'Location_Engineered_Other',
    'OCC_MONTH', 'OCC_Month_Encoded',
    'OCC_DOW', 'OCC_DOW_Encoded',
]
display(df_encoded[sample_columns].head())

df_encoded_stats = df_encoded.copy()                                              # Create a copy for further statistical analysis
encoded_unique_counts = {                                                         # Number of distinct values per encoded column
    encoded_column: df_encoded_stats[encoded_column].nunique()
    for encoded_column in ('Hood_158_Encoded', 'Division_Encoded', 'OCC_Month_Encoded', 'OCC_DOW_Encoded')
}
                                                                                  # Display unique values for encoded columns with formatted HTML
unique_values_info = f"""
<p style="color: darkblue; font-size: 18px; font-weight: bold;">
    Unique Values in Encoded Columns:<br>
    Total unique values in 'Hood_158_Encoded': <span style="color: green;">{encoded_unique_counts['Hood_158_Encoded']}</span><br>
    Total unique values in 'Division_Encoded': <span style="color: green;">{encoded_unique_counts['Division_Encoded']}</span><br>
    Total unique values in 'OCC_Month_Encoded': <span style="color: green;">{encoded_unique_counts['OCC_Month_Encoded']}</span><br>
    Total unique values in 'OCC_DOW_Encoded': <span style="color: green;">{encoded_unique_counts['OCC_DOW_Encoded']}</span>
</p>
"""
display(HTML(unique_values_info))

df_summary = df_encoded_stats.copy()                                              # Data Summary and Missing Data Analysis
def count_leading_trailing_spaces(column):                                        # Function to count leading/trailing spaces in strings
    """Count the values in a Series that start or end with a space.

    Every value is converted with ``astype(str)`` first, so non-string
    entries are checked by their string form.

    Returns:
        A ``(leading, trailing)`` pair: how many values start with ' ' and
        how many end with ' '.
    """
    as_text = column.astype(str)
    starts_with_space = as_text.str.startswith(' ')
    ends_with_space = as_text.str.endswith(' ')
    return starts_with_space.sum(), ends_with_space.sum()

def prepare_summary_table(df):                                                    # Function to generate a summary table for dataset
    """Build a per-column quality summary of ``df``.

    Args:
        df: DataFrame to summarise.

    Returns:
        A tuple ``(summary_table, null_counts, null_percentages)`` where
        ``summary_table`` has one row per column of ``df`` (name, dtype,
        total / unique / null counts, null percentage rounded to one
        decimal, and counts of values with leading or trailing spaces),
        ``null_counts`` is the per-column number of missing values and
        ``null_percentages`` is the unrounded missing percentage.
    """
    # isnull() and isna() are the same operation in pandas, so the missing
    # counts are computed once and reused for both reported columns.
    null_counts = df.isnull().sum()
    total_values = df.count() + null_counts                                       # Non-missing + missing = rows per column
    null_percentages = (null_counts / total_values) * 100

    # Collect the pairs first and split them afterwards: unpacking
    # zip(*pairs) directly raises ValueError when df has no columns.
    space_counts = [count_leading_trailing_spaces(df[col]) for col in df.columns]
    leading_spaces = [leading for leading, _ in space_counts]
    trailing_spaces = [trailing for _, trailing in space_counts]

    summary_table = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": df.nunique(),
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),
        "NaN Values": null_counts,
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces
    })
    return summary_table, null_counts, null_percentages

def generate_summary_html(summary_table):
    """Render the summary table as a styled HTML ``<table>`` string.

    Args:
        summary_table: DataFrame with the columns 'Column', 'Data Type',
            'Total Values', 'Unique Values', 'Null Values', 'Null %',
            'NaN Values', 'Leading Spaces' and 'Trailing Spaces'.

    Returns:
        A string holding a ``<style>`` block followed by the table. The
        null, leading-space and trailing-space cells are coloured red when
        their count is above zero and green otherwise.
    """
    from html import escape  # Function-scope import keeps this block self-contained

    table_header = """
    <style>
        table { border-collapse: collapse; width: 100%; font-family: Arial, sans-serif; }
        table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        table th { background-color: #4CAF50; color: white; font-size: 1.1em; }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>NaN Values</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """
    row_fragments = []                                                            # Joined once at the end instead of growing a string with +=
    for _, row in summary_table.iterrows():
        null_color = "red" if row['Null Values'] > 0 else "green"
        leading_color = "red" if row['Leading Spaces'] > 0 else "green"
        trailing_color = "red" if row['Trailing Spaces'] > 0 else "green"
        # Column names and dtype labels are free text, so escape &, < and >
        # to keep them from being parsed as markup.
        column_label = escape(str(row['Column']), quote=False)
        dtype_label = escape(str(row['Data Type']), quote=False)
        row_fragments.append(f"""
        <tr>
            <td>{column_label}</td>
            <td>{dtype_label}</td>
            <td>{row['Total Values']}</td>
            <td>{row['Unique Values']}</td>
            <td style='color:{null_color};'>{row['Null Values']}</td>
            <td>{row['Null %']}</td>
            <td>{row['NaN Values']}</td>
            <td style='color:{leading_color};'>{row['Leading Spaces']}</td>
            <td style='color:{trailing_color};'>{row['Trailing Spaces']}</td>
        </tr>
        """)
    return table_header + "".join(row_fragments) + "</table>"

def plot_missing_percentage(df, dataset_name):                                    # Function to plot missing data percentage
    """Save a horizontal bar chart of per-column missing-data percentages.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the chart title and in the "no missing
            data" message.

    Returns:
        The path of the saved PNG, or an empty string when no column has
        missing values (in which case nothing is plotted).
    """
    pct_missing = df.isnull().mean() * 100
    columns_with_gaps = pct_missing[pct_missing > 0]                              # Only columns that actually have missing values

    if columns_with_gaps.empty:
        display_formatted_message(f"No missing data in {dataset_name}.", additional_info="Skipping missing percentage plot.")
        return ""

    plt.figure(figsize=(10, 6))
    ordered = columns_with_gaps.sort_values()
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()

    output_path = '/content/Encoded_Data_Missing_PercentPlot.png'                 # Fixed location; does not vary with dataset_name
    plt.savefig(output_path)
    plt.close()
    display_formatted_message("Missing data percentage plot saved to", output_path)
    return output_path

def display_summary_for_file(df, title):                                            # Function to display summary for dataset
    """Write an HTML summary report for ``df`` and plot its missing data.

    Args:
        df: DataFrame to summarise.
        title: Text used in the report's title and heading and passed on as
            the dataset name for the missing-data plot.

    Returns:
        A tuple ``(summary_html_path, missing_plot_path)``: the path of the
        written HTML report and whatever ``plot_missing_percentage``
        returned for the plot.
    """
    table, _null_counts, _null_percentages = prepare_summary_table(df)            # Only the table itself is needed here
    table_markup = generate_summary_html(table)
    plot_path = plot_missing_percentage(df, title)

    report_markup = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2>Summary Table for {title}</h2>
        {table_markup}
    </body>
    </html>
    """

    report_path = '/content/5.4.3 Summary_Encoded_Data.html'                      # Fixed location; does not vary with title
    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write(report_markup)

    display_formatted_message(f"HTML summary report for {title} saved to", report_path)
    return report_path, plot_path

html_file_path, missing_plot_path = display_summary_for_file(df_summary, "Analysis on Encoded Data")

display(HTML("<h2 style='color: #2E5090;'>Summary Table for Encoded Data</h2>"))  # Display and download the HTML summary
# The report is written with encoding='utf-8', so read it back the same way
# (not the platform default) and close the handle as soon as it is read.
with open(html_file_path, encoding='utf-8') as summary_file:
    summary_markup = summary_file.read()
display(HTML(summary_markup))
files.download(html_file_path)
display_formatted_message("HTML summary file has been downloaded.", html_file_path)






Unnamed: 0,DIVISION,Division_Encoded,HOOD_158,Hood_158_Encoded,Location_Engineered,Location_Engineered_Public,Location_Engineered_Residential,Location_Engineered_Other,OCC_MONTH,OCC_Month_Encoded,OCC_DOW,OCC_DOW_Encoded
0,D33,0.05776,43,0.004309,Residential,0.0,1.0,0.0,December,12,Tuesday,5
1,D43,0.059435,123,0.004898,Public,1.0,0.0,0.0,January,1,Wednesday,6
2,D42,0.08444,129,0.00963,Residential,0.0,1.0,0.0,January,1,Thursday,4
3,D23,0.129274,2,0.013754,Public,1.0,0.0,0.0,January,1,Wednesday,6
4,D23,0.129274,9,0.007107,Public,1.0,0.0,0.0,January,1,Wednesday,6


Column,Data Type,Total Values,Unique Values,Null Values,Null %,NaN Values,Leading Spaces,Trailing Spaces
_id,int64,54311,54311,0,0.0,0,0,0
EVENT_UNIQUE_ID,object,54311,54198,0,0.0,0,0,0
OCC_YEAR,float64,54311,12,0,0.0,0,0,0
OCC_MONTH,object,54311,12,0,0.0,0,0,0
OCC_DAY,float64,54311,31,0,0.0,0,0,0
OCC_DOY,float64,54311,366,0,0.0,0,0,0
OCC_DOW,object,54311,7,0,0.0,0,0,0
OCC_HOUR,int64,54311,24,0,0.0,0,0,0
DIVISION,object,54311,17,0,0.0,0,0,0
LOCATION_TYPE,object,54311,42,0,0.0,0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>