<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.4.3%20Frequency%20Data%20Reduction%20and%20Projection(FGH).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.4 Data Reduction and Projection - f) Feature Engineering, g) Feature Encoding and h) Descriptive Statistics**


In [2]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, HTML
from google.colab import files

# Print blank lines ahead of the rest of the cell output.
print("\n\n")

# Silence deprecation-style warnings for the remainder of the session.
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Load the prepared dataset straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Final_Data.csv"
df = pd.read_csv(url)

# Frequency encoding: every category is replaced by its relative frequency
# (share of rows) in the dataset. LOCATION_TYPE is encoded as-is, without
# first grouping it into broader categories.
location_counts = df['LOCATION_TYPE'].value_counts(normalize=True)
hood_counts = df['HOOD_158'].value_counts(normalize=True)
division_counts = df['DIVISION'].value_counts(normalize=True)

for _source_col, _target_col, _frequencies in (
    ('LOCATION_TYPE', 'Location_Freq_Encoded', location_counts),
    ('HOOD_158', 'Hood_158_Encoded', hood_counts),
    ('DIVISION', 'Division_Encoded', division_counts),
):
    df[_target_col] = df[_source_col].map(_frequencies)

# Month names -> calendar month numbers (January = 1 ... December = 12).
# Values of OCC_MONTH that are not one of these names map to NaN.
month_mapping = dict(zip(
    (
        'January', 'February', 'March', 'April',
        'May', 'June', 'July', 'August',
        'September', 'October', 'November', 'December',
    ),
    range(1, 13),
))
df['OCC_Month_Encoded'] = df['OCC_MONTH'].map(month_mapping)

# Label-encode the day of week.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so the
# codes follow alphabetical day names rather than calendar order -- confirm
# this is what downstream steps expect.
dow_encoder = LabelEncoder()
df['OCC_DOW_Encoded'] = dow_encoder.fit_transform(df['OCC_DOW'])

# Persist the dataset together with the new encoded columns.
df.to_csv("FE_Encoded.csv", index=False)

# HTML notice listing which encoding was applied to which source column.
_encoding_notice_html = (
    "<p style='color: black; font-size: 16px; font-weight: bold;'>"
    "<b>Feature Encoding Applied:</b><br>"
    "- Frequency Encoding was applied to <span style='color: darkblue; font-weight: bold;'>LOCATION_TYPE</span> "
    "resulting in a single column (<span style='color: darkblue; font-weight: bold;'>Location_Freq_Encoded</span>).<br>"
    "- Frequency Encoding was also used for <span style='color: darkblue; font-weight: bold;'>HOOD_158</span> and "
    "<span style='color: darkblue; font-weight: bold;'>DIVISION</span>.<br>"
    "- Manual Mapping was used for <span style='color: darkblue; font-weight: bold;'>OCC_MONTH</span> to convert month names into numbers.<br>"
    "- Label Encoding was applied to <span style='color: darkblue; font-weight: bold;'>OCC_DOW</span>.<br>"
    "The encoded data was saved to <span style='color: darkgreen; font-weight: bold;'>FE_Encoded.csv</span>."
    "</p>"
)
display(HTML(_encoding_notice_html))

# Heading for the preview table shown below.
_sample_heading_html = (
    "<p style='color: black; font-size: 16px; font-weight: bold;'>"
    "Sample of encoded columns: <span style='font-weight: bold;'>(first 5 rows)</span>"
    "</p>"
)
display(HTML(_sample_heading_html))

# Each source column is listed next to its encoded counterpart so the two can
# be compared side by side in the preview.
_sample_columns = [
    'LOCATION_TYPE', 'Location_Freq_Encoded',
    'HOOD_158', 'Hood_158_Encoded',
    'DIVISION', 'Division_Encoded',
    'OCC_MONTH', 'OCC_Month_Encoded',
    'OCC_DOW', 'OCC_DOW_Encoded',
]
display(df[_sample_columns].head(5))

# Independent copy of the encoded data for the descriptive statistics below.
df_encoded_stats = df.copy()

def count_leading_trailing_spaces(column):
    """Count the values in *column* that start or end with a space.

    Every value is converted to its string form first, so non-string columns
    can be passed as well.

    Args:
        column: pandas Series to inspect.

    Returns:
        A ``(leading, trailing)`` tuple: the number of values whose string
        form starts with ``' '`` and the number whose string form ends with
        ``' '``.
    """
    as_text = column.astype(str).str
    leading = as_text.startswith(' ').sum()
    trailing = as_text.endswith(' ').sum()
    return leading, trailing

def prepare_summary_table(df):
    """Build a per-column data-quality summary of *df*.

    Args:
        df: pandas DataFrame to summarise. A frame without any columns is
            accepted and yields an empty summary.

    Returns:
        A ``(summary_table, null_counts, null_percentages)`` tuple where

        * ``summary_table`` is a DataFrame with one row per column of *df*
          holding its name, dtype, total/unique/null counts, null percentage
          (rounded to one decimal), NaN count and the number of values with
          leading/trailing spaces;
        * ``null_counts`` is the per-column count of missing values;
        * ``null_percentages`` is the unrounded per-column percentage of
          missing values.
    """
    # Compute the missing-value counts once; isna() is an alias of isnull(),
    # so the "Null Values" and "NaN Values" columns share the same numbers.
    null_counts = df.isnull().sum()
    # Non-null count plus null count gives the row count for every column.
    total_values = df.count() + null_counts
    null_percentages = (null_counts / total_values) * 100

    # Gather the (leading, trailing) pairs first and split them explicitly:
    # unpacking zip(*pairs) raises ValueError when df has no columns.
    space_counts = [count_leading_trailing_spaces(df[col]) for col in df.columns]
    leading_spaces = tuple(leading for leading, _ in space_counts)
    trailing_spaces = tuple(trailing for _, trailing in space_counts)

    summary_table = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Total Values": total_values,
        "Unique Values": df.nunique(),
        "Null Values": null_counts,
        "Null %": null_percentages.round(1),
        "NaN Values": null_counts,
        "Leading Spaces": leading_spaces,
        "Trailing Spaces": trailing_spaces,
    })
    return summary_table, null_counts, null_percentages

def generate_summary_html(summary_table):
    """Render the column summary as a styled HTML table.

    Args:
        summary_table: DataFrame with the columns ``Column``, ``Data Type``,
            ``Total Values``, ``Unique Values``, ``Null Values``, ``Null %``,
            ``NaN Values``, ``Leading Spaces`` and ``Trailing Spaces``.

    Returns:
        A string containing a ``<style>`` block followed by a ``<table>``
        with one header row and one row per entry of *summary_table*. Null
        and leading/trailing-space counts are coloured red when greater than
        zero and green otherwise.
    """
    # Function-scope import so the file's top-level import block is untouched.
    from html import escape

    table_header = """
    <style>
        table { border-collapse: collapse; width: 100%; font-family: Arial, sans-serif; font-size: 16px; }
        table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        table th { background-color: #4CAF50; color: white; font-size: 16px; }
        table tr:nth-child(even) {background-color: #f2f2f2;}
        table tr:hover {background-color: #ddd;}
    </style>
    <table>
        <tr>
            <th>Column</th>
            <th>Data Type</th>
            <th>Total Values</th>
            <th>Unique Values</th>
            <th>Null Values</th>
            <th>Null %</th>
            <th>NaN Values</th>
            <th>Leading Spaces</th>
            <th>Trailing Spaces</th>
        </tr>
    """

    # Collect the rows and join once instead of growing a string with +=.
    row_fragments = []
    for _, row in summary_table.iterrows():
        null_color = "red" if row['Null Values'] > 0 else "green"
        leading_color = "red" if row['Leading Spaces'] > 0 else "green"
        trailing_color = "red" if row['Trailing Spaces'] > 0 else "green"
        # Column names and dtype names are free text; escape them so that
        # characters such as '<' or '&' cannot break the table markup.
        column_name = escape(str(row['Column']), quote=False)
        data_type = escape(str(row['Data Type']), quote=False)
        row_fragments.append(f"""
        <tr>
            <td>{column_name}</td>
            <td>{data_type}</td>
            <td>{row['Total Values']}</td>
            <td>{row['Unique Values']}</td>
            <td style='color:{null_color};'>{row['Null Values']}</td>
            <td>{row['Null %']}</td>
            <td>{row['NaN Values']}</td>
            <td style='color:{leading_color};'>{row['Leading Spaces']}</td>
            <td style='color:{trailing_color};'>{row['Trailing Spaces']}</td>
        </tr>
        """)
    return table_header + "".join(row_fragments) + "</table>"

def plot_missing_percentage(df, dataset_name):
    """Plot the percentage of missing values per column as a horizontal bar chart.

    Only columns with at least one missing value are plotted. When no column
    has missing data, a notice is displayed and nothing is plotted.

    Args:
        df: pandas DataFrame to inspect.
        dataset_name: label used in the chart title and in the notice shown
            when there is nothing to plot.

    Returns:
        The path of the saved PNG, or an empty string when no plot was made.
        The output path is fixed and does not depend on *dataset_name*.
    """
    share_missing = df.isnull().mean().mul(100)
    share_missing = share_missing[share_missing > 0]

    # Guard clause: nothing to plot when every column is complete.
    if share_missing.empty:
        notice = f"<p style='color: black; font-size: 16px; font-weight: bold;'>No missing data in {dataset_name}. Skipping missing percentage plot.</p>"
        display(HTML(notice))
        return ""

    plt.figure(figsize=(10, 6))
    ordered = share_missing.sort_values()
    ordered.plot(kind='barh', color='skyblue', edgecolor='grey')
    plt.title(f'Missing Data Percentage - {dataset_name}', fontsize=16)
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Columns', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()

    missing_plot_path = '/content/Encoded_Data_Missing_PercentPlot.png'
    plt.savefig(missing_plot_path)
    plt.close()

    saved_notice = f"<p style='color: black; font-size: 16px; font-weight: bold;'>Missing data percentage plot saved to <span style='color: darkgreen; font-weight: bold;'>{missing_plot_path}</span>.</p>"
    display(HTML(saved_notice))
    return missing_plot_path

def display_summary_for_file(df, title):
    """Write an HTML data-quality report for *df* and announce where it was saved.

    The report consists of the per-column summary table. The missing-data
    plot is produced as a side step; its path is returned but not embedded
    in the report.

    Args:
        df: pandas DataFrame to summarise.
        title: text used in the report's <title>, in its heading and in the
            notices displayed to the user.

    Returns:
        A ``(summary_html_path, missing_plot_path)`` tuple. The report is
        always written to the same file name, independent of *title*;
        ``missing_plot_path`` is whatever ``plot_missing_percentage`` returned.
    """
    # Only the table itself is needed here; the other two results are unused.
    summary_table, _null_counts, _null_percentages = prepare_summary_table(df)
    summary_html = generate_summary_html(summary_table)
    missing_plot_path = plot_missing_percentage(df, title)

    complete_html = f"""
    <html>
    <head><title>Dataset Summary - {title}</title></head>
    <body>
        <h2 style="color: black; font-size: 16px; font-weight: bold;">Summary Table for {title}</h2>
        {summary_html}
    </body>
    </html>
    """

    summary_html_path = '5.4.3 Summary_Encoded_Data.html'
    with open(summary_html_path, mode='w', encoding='utf-8') as report_file:
        report_file.write(complete_html)

    saved_notice = f"<p style='color: black; font-size: 16px; font-weight: bold;'>HTML summary report for {title} saved to <span style='color: darkgreen; font-weight: bold;'>{summary_html_path}</span>.</p>"
    display(HTML(saved_notice))
    return summary_html_path, missing_plot_path

# Build the HTML summary report (and the missing-data plot, if any) for the
# encoded dataset.
html_file_path, missing_plot_path = display_summary_for_file(df_encoded_stats, "Analysis on Encoded Data")

# Show the report inline. The file is written as UTF-8, so it is read back
# with the same encoding instead of the platform default, and the context
# manager closes the handle that a bare open().read() would leave open.
with open(html_file_path, encoding='utf-8') as report_file:
    display(HTML(report_file.read()))

# Download the summary HTML and encoded dataset
files.download(html_file_path)
files.download("FE_Encoded.csv")

# Confirm to the user which files were handed to the browser for download.
display(HTML("""
<p style="color: black; font-size: 16px; font-weight: bold;">
    Files: <span style="color: darkblue; font-weight: bold;">HTML summary file</span> and <span style="color: darkblue; font-weight: bold;">FE_Encoded.csv</span> have been <span style="color: darkgreen; font-weight: bold;">downloaded</span>.
</p>
"""))







Unnamed: 0,LOCATION_TYPE,Location_Freq_Encoded,HOOD_158,Hood_158_Encoded,DIVISION,Division_Encoded,OCC_MONTH,OCC_Month_Encoded,OCC_DOW,OCC_DOW_Encoded
0,"Apartment (Rooming House, Condo)",0.026827,43,0.004309,D33,0.05776,December,12,Tuesday,5
1,"Streets, Roads, Highways (Bicycle Path, Privat...",0.199739,123,0.004898,D43,0.059435,January,1,Wednesday,6
2,"Single Home, House (Attach Garage, Cottage, Mo...",0.347664,129,0.00963,D42,0.08444,January,1,Thursday,4
3,"Parking Lots (Apt., Commercial Or Non-Commercial)",0.33078,2,0.013754,D23,0.129274,January,1,Wednesday,6
4,"Parking Lots (Apt., Commercial Or Non-Commercial)",0.33078,9,0.007107,D23,0.129274,January,1,Wednesday,6


Column,Data Type,Total Values,Unique Values,Null Values,Null %,NaN Values,Leading Spaces,Trailing Spaces
_id,int64,54311,54311,0,0.0,0,0,0
EVENT_UNIQUE_ID,object,54311,54198,0,0.0,0,0,0
OCC_YEAR,float64,54311,12,0,0.0,0,0,0
OCC_MONTH,object,54311,12,0,0.0,0,0,0
OCC_DAY,float64,54311,31,0,0.0,0,0,0
OCC_DOY,float64,54311,366,0,0.0,0,0,0
OCC_DOW,object,54311,7,0,0.0,0,0,0
OCC_HOUR,int64,54311,24,0,0.0,0,0,0
DIVISION,object,54311,17,0,0.0,0,0,0
LOCATION_TYPE,object,54311,42,0,0.0,0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>