<a href="https://colab.research.google.com/github/mohammadbadi/CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.4.4%20Feature%20Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.4.4 Feature Encoding - Approach_3**

In [1]:
import html
import warnings

import pandas as pd
from google.colab import files
from IPython.display import display, HTML
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Hide deprecation/future warnings so they do not clutter the rendered cell output
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# -------------------- Feature Encoding --------------------
# Read the file created in 5.4.3
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/FEngineered_Data.csv"
try:
    df_encoded = pd.read_csv(url)
except Exception as e:
    # Escape the message before embedding it in HTML: some errors (for example
    # urllib's "<urlopen error ...>") contain angle brackets that a browser
    # would otherwise parse as an unknown tag and render as nothing.
    display(HTML(f"<p style='color: red; font-size: 16px; font-weight: bold;'>Error loading engineered data: {html.escape(str(e))}</p>"))
    # Re-raise instead of calling exit(): in a notebook exit() acts on the
    # kernel itself, whereas an exception stops execution here and keeps the
    # traceback available for debugging.
    raise
else:
    # Only reached when the read succeeded, so a display problem is never
    # misreported as a loading failure.
    display(HTML("<p style='color: green; font-size: 16px; font-weight: bold;'>Previously engineered data loaded successfully.</p>"))

# Count initial columns before encoding (reported later in the summary table)
initial_column_count = len(df_encoded.columns)

# Catalogue of the encoding steps rendered in the summary table.
# Each tuple is (original feature, action taken, rationale); the dict keys
# below are the exact labels the table-building code looks up.
_step_fields = ("Original Feature", "Action Taken", "Rationale")
_step_records = (
    (
        "Location_Engineered",
        "One-Hot Encoding applied to create binary features",
        "Separates categories for better clustering",
    ),
    (
        "HOOD_158",
        "Frequency encoded to 'Hood_158_Encoded'",
        "Represents neighborhood distribution as normalized frequencies",
    ),
    (
        "DIVISION",
        "Frequency encoded to 'Division_Encoded'",
        "Represents division distribution as normalized frequencies",
    ),
    (
        "OCC_MONTH",
        "Manual mapping to 'OCC_Month_Encoded'",
        "Converts month names to numerical values",
    ),
    (
        "OCC_DOW",
        "Label Encoding applied to create 'OCC_DOW_Encoded'",
        "Transforms day names to numeric representations",
    ),
)
steps_summary = [dict(zip(_step_fields, record)) for record in _step_records]

# 1-2. Frequency Encoding for HOOD_158 and DIVISION
# Each lookup holds, per distinct value, its normalized share of the column.
hood_counts = df_encoded['HOOD_158'].value_counts(normalize=True)
division_counts = df_encoded['DIVISION'].value_counts(normalize=True)

# encoded column -> (source column, lookup); dict order fixes the column order
frequency_plan = {
    'Hood_158_Encoded': ('HOOD_158', hood_counts),
    'Division_Encoded': ('DIVISION', division_counts),
}
for encoded_name, (source_name, shares) in frequency_plan.items():
    df_encoded[encoded_name] = df_encoded[source_name].map(shares)

# 3. One-Hot Encoding for Location_Engineered
location_feature = ['Location_Engineered']
encoder = OneHotEncoder(sparse_output=False)
location_encoded = encoder.fit_transform(df_encoded[location_feature])

# Wrap the indicator matrix in a frame that reuses df_encoded's index so the
# column-wise concat lines the rows up one-to-one.
indicator_names = encoder.get_feature_names_out(location_feature)
location_encoded_df = pd.DataFrame(
    data=location_encoded,
    index=df_encoded.index,
    columns=indicator_names,
)
df_encoded = pd.concat([df_encoded, location_encoded_df], axis=1)

# 4. Manual Mapping for OCC_MONTH
# Full month name -> calendar month number (1-12).
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df_encoded['OCC_Month_Encoded'] = df_encoded['OCC_MONTH'].map(month_mapping)

# Series.map silently yields NaN for any value that is not a key of the dict,
# so a misspelled or differently-cased month would pass through unnoticed and
# end up as a missing value in the saved CSV. Fail loudly instead. Rows whose
# OCC_MONTH is already missing are left alone.
unmapped_months = df_encoded.loc[
    df_encoded['OCC_MONTH'].notna() & df_encoded['OCC_Month_Encoded'].isna(),
    'OCC_MONTH',
].unique()
if len(unmapped_months) > 0:
    raise ValueError(
        "OCC_MONTH contains values with no entry in month_mapping: "
        f"{sorted(map(str, unmapped_months))}"
    )

# 5. Label Encoding for OCC_DOW
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes
# presumably follow alphabetical day names rather than weekday order - confirm
# that this is what the clustering step expects.
dow_encoder = LabelEncoder()
dow_codes = dow_encoder.fit_transform(df_encoded['OCC_DOW'])
df_encoded['OCC_DOW_Encoded'] = dow_codes

# Count final columns after encoding (reported in the summary table)
final_column_count = df_encoded.shape[1]

# -------------------- Build HTML Table for Feature Encoding Phase --------------------
def _row_shade(position):
    """Return the zebra-stripe background colour for the table row at `position`."""
    if position % 2 == 0:
        return "#f2f2f2"
    return "white"


# Table opening: title banner, column headings, and the start of the body
html_table = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="3" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">5.4.4 Feature Encoding Phase - Approach_3</th>
        </tr>
        <tr>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Original Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Action Taken</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Rationale</th>
        </tr>
    </thead>
    <tbody>
"""

# One row per encoding step, shaded alternately
step_rows = [
    f"""
        <tr style='border: 1px solid #dddddd; background-color: {_row_shade(position)};'>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step_info["Original Feature"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step_info["Action Taken"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step_info["Rationale"]}</td>
        </tr>
    """
    for position, step_info in enumerate(steps_summary)
]
html_table += "".join(step_rows)

# Column-count row: continues the stripe pattern right after the last step row
summary_shade = _row_shade(len(steps_summary))
html_table += f"""
    <tr style='border: 1px solid #dddddd; background-color: {summary_shade};'>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Columns affected in <br> <strong>5.4.4. Feature Encoding - Approach_3</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'><strong>:</strong> Initial columns: <br> <strong>{initial_column_count}</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Final Columns: <br> <strong>{final_column_count}</strong></td>
    </tr>
"""

# Single source of truth for the output file name, so the footer note, the
# file written to disk and the browser download can never disagree.
OUTPUT_CSV_NAME = "FE_Encoded.csv"

# Save the encoded dataset to CSV before announcing it, so the "saved" note
# below is only ever shown once the file really exists.
df_encoded.to_csv(OUTPUT_CSV_NAME, index=False)

# Add footer note inside the table
note_text = (
    f"Feature Encoding completed and saved as <span style='color: green;'>{OUTPUT_CSV_NAME}</span> "
    "for further analysis."
)
html_table += f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="3" style='border: 1px solid #dddddd; padding: 8px; background-color: #f8f8f8;'><strong>{note_text}</strong></td>
        </tr>
    </tbody>
</table>
"""
display(HTML(html_table))

# Hand the saved file to the browser via Colab's download helper
files.download(OUTPUT_CSV_NAME)


5.4.4 Feature Encoding Phase - Approach_3,5.4.4 Feature Encoding Phase - Approach_3,5.4.4 Feature Encoding Phase - Approach_3
Original Feature,Action Taken,Rationale
Location_Engineered,One-Hot Encoding applied to create binary features,Separates categories for better clustering
HOOD_158,Frequency encoded to 'Hood_158_Encoded',Represents neighborhood distribution as normalized frequencies
DIVISION,Frequency encoded to 'Division_Encoded',Represents division distribution as normalized frequencies
OCC_MONTH,Manual mapping to 'OCC_Month_Encoded',Converts month names to numerical values
OCC_DOW,Label Encoding applied to create 'OCC_DOW_Encoded',Transforms day names to numeric representations
Columns affected in 5.4.4. Feature Encoding - Approach_3,: Initial columns: 19,Final Columns: 26
Feature Encoding completed and saved as Encoded_Features.csv for further analysis.,Feature Encoding completed and saved as Encoded_Features.csv for further analysis.,Feature Encoding completed and saved as Encoded_Features.csv for further analysis.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>