<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.4.4%20Feature%20Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from google.colab import files
from IPython.display import display, HTML

# Load the engineered data produced by the previous step.
# Only the read itself sits inside the `try`, so a failure while rendering the
# success banner is not misreported as a data-loading error.
try:
    Data_Encoding_df = pd.read_csv("FEngineered_New.csv")
except Exception as e:
    # Top-level boundary: whatever went wrong (missing file, parse error, ...)
    # is shown to the user and the run is aborted.
    display(HTML(f"<p style='color: red; font-size: 16px; font-weight: bold;'>Error loading engineered data: {e}</p>"))
    # `exit()` is the interactive-shell helper: in a plain script it exits with
    # status 0 (success) despite the failure, and in a notebook kernel it is
    # not guaranteed to stop the current cell (NOTE(review): it appears to only
    # request a kernel shutdown there - confirm). Raising SystemExit halts
    # execution immediately and reports a non-zero status.
    raise SystemExit(1) from e
else:
    display(HTML("<p style='color: green; font-size: 16px; font-weight: bold;'>Previously engineered data loaded successfully.</p>"))

# Running log of the encoding steps; each entry becomes one row of the report table
steps_summary = []

# Snapshot of the column layout before any encoded feature is added
initial_column_count = Data_Encoding_df.shape[1]
initial_columns = ", ".join(Data_Encoding_df.columns)

def _add_frequency_feature(source_column, encoded_column):
    """Frequency-encode one categorical column of ``Data_Encoding_df``.

    Writes ``encoded_column`` with, for every row, the relative frequency of
    that row's category in ``source_column``, and returns the
    category -> relative-frequency Series that was used for the mapping.
    """
    category_share = Data_Encoding_df[source_column].value_counts(normalize=True)
    Data_Encoding_df[encoded_column] = Data_Encoding_df[source_column].map(category_share)
    return category_share


def _record_step(original_feature, action_taken, rationale):
    """Append one row to the ``steps_summary`` log shown in the report table."""
    steps_summary.append({
        "Original Feature": original_feature,
        "Action Taken": action_taken,
        "Rationale": rationale,
    })


# 1. DIVISION -> Division_Freq
division_freq = _add_frequency_feature('DIVISION', 'Division_Freq')
_record_step(
    "DIVISION",
    "Frequency encoded as 'Division_Freq'",
    "Converts categorical division names to numeric values based on occurrence frequency",
)

# 2. LOCATION_TYPE -> LOCATION_Freq
location_freq = _add_frequency_feature('LOCATION_TYPE', 'LOCATION_Freq')
_record_step(
    "LOCATION_TYPE",
    "Frequency encoded as 'LOCATION_Freq'",
    "Transforms location categories into normalized frequency values",
)

# 3. PREMISES_TYPE -> PREMISES_Freq
premises_freq = _add_frequency_feature('PREMISES_TYPE', 'PREMISES_Freq')
_record_step(
    "PREMISES_TYPE",
    "Frequency encoded as 'PREMISES_Freq'",
    "Converts premises types to numeric values based on their relative frequency",
)

# 4. Combined location/premises feature: row-wise product of the two
#    frequency columns created in steps 2 and 3
Data_Encoding_df['Loca_Premi_Freq'] = Data_Encoding_df['LOCATION_Freq'].mul(
    Data_Encoding_df['PREMISES_Freq']
)
_record_step(
    "LOCATION_Freq, PREMISES_Freq",
    "Created hierarchical feature 'Loca_Premi_Freq'",
    "Captures the joint probability distribution of location and premises types",
)

# 5. HOOD_158 -> HOOD_Freq
hood_freq = _add_frequency_feature('HOOD_158', 'HOOD_Freq')
_record_step(
    "HOOD_158",
    "Frequency encoded as 'HOOD_Freq'",
    "Encodes neighborhood categories as normalized frequency values",
)

# 6. Combined division/neighbourhood feature: row-wise product of the
#    frequency columns created in steps 1 and 5
Data_Encoding_df['DIV_HOOD_Hier'] = Data_Encoding_df['Division_Freq'].mul(
    Data_Encoding_df['HOOD_Freq']
)
_record_step(
    "Division_Freq, HOOD_Freq",
    "Created hierarchical feature 'DIV_HOOD_Hier'",
    "Captures spatial hierarchy relationship between divisions and neighborhoods",
)

# 7. Collapse LONG_WGS84 / LAT_WGS84 into a single principal component
coord_columns = ['LONG_WGS84', 'LAT_WGS84']
# Rows missing either coordinate are excluded from the fit
valid_coords = Data_Encoding_df[coord_columns].dropna()

# Standardise both coordinates, then project them onto one component
scaled_coords = StandardScaler().fit_transform(valid_coords)
pca = PCA(n_components=1)
pca_result = pca.fit_transform(scaled_coords)

# Re-attach the component by index label; rows that were dropped above have
# no matching label and therefore end up with NaN in the new column
pca_feature = pd.Series(
    pca_result.ravel(),
    index=valid_coords.index,
    name='LONG_LAT_PCA',
)
Data_Encoding_df = Data_Encoding_df.join(pca_feature, how='left')

pca_step = {
    "Original Feature": "LONG_WGS84, LAT_WGS84",
    "Action Taken": "Applied PCA to create 'LONG_LAT_PCA'",
    "Rationale": "Dimensionality reduction of geographic coordinates into a single numeric feature",
}
steps_summary.append(pca_step)

# Column count once every encoded feature has been attached
final_column_count = Data_Encoding_df.shape[1]

# Features produced by this phase, kept both as a list and as one
# comma-separated string
affected_columns = [
    "Division_Freq",
    "LOCATION_Freq",
    "PREMISES_Freq",
    "Loca_Premi_Freq",
    "HOOD_Freq",
    "DIV_HOOD_Hier",
    "LONG_LAT_PCA",
]
columns_affected = ", ".join(affected_columns)

# Single source of truth for the output file name, so the on-screen note, the
# CSV written to disk and the browser download always refer to the same file.
# (The note previously announced "Encoded_Features.csv" while the data was
# actually written to "FE_Encoded_New.csv".)
output_filename = "FE_Encoded_New.csv"

# Save encoded data to CSV *before* rendering the report: the table's footer
# states that the data has been saved, which must only be shown once the write
# has succeeded.
Data_Encoding_df.to_csv(output_filename, index=False)

# Build HTML Table for Feature Encoding Phase with alternate row shading.
# Fragments are collected in a list and joined once at the end.
html_parts = ["""
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="3" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">5.4.4 Feature Encoding Phase</th>
        </tr>
        <tr>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Original Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Action Taken</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Rationale</th>
        </tr>
    </thead>
    <tbody>
"""]

# One row per recorded encoding step; even-numbered rows (0, 2, ...) are
# shaded light gray, odd-numbered rows stay white
for i, step in enumerate(steps_summary):
    bg_color = "#f2f2f2" if i % 2 == 0 else "white"
    html_parts.append(f"""
        <tr style='border: 1px solid #dddddd; background-color: {bg_color};'>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Original Feature"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Action Taken"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{step["Rationale"]}</td>
        </tr>
    """)

# Column-count summary row; its shade continues the alternation started by the
# step rows above
bg_color = "#f2f2f2" if len(steps_summary) % 2 == 0 else "white"
html_parts.append(f"""
    <tr style='border: 1px solid #dddddd; background-color: {bg_color};'>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Columns affected in <br> <strong>5.4.4. Feature Encoding</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'><strong>:</strong> Initial columns: <br> <strong>{initial_column_count}</strong></td>
        <td style='border: 1px solid #dddddd; padding: 8px;'>Final Columns: <br> <strong>{final_column_count}</strong></td>
    </tr>
""")

# Footer note inside the table, naming the file that was actually written
note_text = (
    f"Feature Encoding completed and saved as <span style='color: green;'>{output_filename}</span> "
    "for further analysis."
)
html_parts.append(f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="3" style='border: 1px solid #dddddd; padding: 8px; background-color: #f8f8f8;'><strong>{note_text}</strong></td>
        </tr>
    </tbody>
</table>
""")

html_table = "".join(html_parts)
display(HTML(html_table))

# Offer the saved CSV as a browser download (Colab helper)
files.download(output_filename)

5.4.4 Feature Encoding Phase,5.4.4 Feature Encoding Phase,5.4.4 Feature Encoding Phase
Original Feature,Action Taken,Rationale
DIVISION,Frequency encoded as 'Division_Freq',Converts categorical division names to numeric values based on occurrence frequency
LOCATION_TYPE,Frequency encoded as 'LOCATION_Freq',Transforms location categories into normalized frequency values
PREMISES_TYPE,Frequency encoded as 'PREMISES_Freq',Converts premises types to numeric values based on their relative frequency
"LOCATION_Freq, PREMISES_Freq",Created hierarchical feature 'Loca_Premi_Freq',Captures the joint probability distribution of location and premises types
HOOD_158,Frequency encoded as 'HOOD_Freq',Encodes neighborhood categories as normalized frequency values
"Division_Freq, HOOD_Freq",Created hierarchical feature 'DIV_HOOD_Hier',Captures spatial hierarchy relationship between divisions and neighborhoods
"LONG_WGS84, LAT_WGS84",Applied PCA to create 'LONG_LAT_PCA',Dimensionality reduction of geographic coordinates into a single numeric feature
Columns affected in 5.4.4. Feature Encoding,: Initial columns: 23,Final Columns: 30
Feature Encoding completed and saved as Encoded_Features.csv for further analysis.,Feature Encoding completed and saved as Encoded_Features.csv for further analysis.,Feature Encoding completed and saved as Encoded_Features.csv for further analysis.
