<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.4.3%20Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.4.3 Feature Engineering - Approach_2**

In [None]:
# !pip install prince -qqq                                                        # Uncomment to install prince if it is not already installed

In [1]:
import warnings                                                                   # Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from google.colab import files
from IPython.display import display, HTML

# Location of the prepared crime dataset (raw CSV hosted on GitHub).
url = "https://raw.githubusercontent.com/mohammadbadi/CrimeAnalytics_Clustering/refs/heads/main/Output_CSV/Final_Data.csv"  # URL of the Dataset

# Download the dataset exactly once. On failure, show a red banner and then
# re-raise so the cell stops here with a full traceback; calling exit() inside
# a notebook asks the kernel to shut down instead of just halting the cell.
try:
    df = pd.read_csv(url)
except Exception as e:
    display(HTML(f"<p style='color: red; font-size: 16px; font-weight: bold;'>Error loading data: {e}</p>"))
    raise

# Working frame for feature engineering. It is an independent copy, so the raw
# frame `df` stays untouched and no second (unguarded) download is needed.
Data_Preparing_df = df.copy()
display(HTML("<p style='color: green; font-size: 16px; font-weight: bold;'>Data loaded successfully.</p>"))

                                                                                  # Capture initial column count
initial_cols_count = len(Data_Preparing_df.columns)

# Rows of the HTML summary table: one dict per engineering step, each with the
# keys "Original Feature", "Action Taken" and "Rationale".
steps_summary = []

# Step 1: convert month names to their calendar number (January=1 ... December=12).
# Series.map yields NaN for any value that is not a key of the mapping.
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}
Data_Preparing_df['OCC_MONTH_Num'] = Data_Preparing_df['OCC_MONTH'].map(month_mapping)
steps_summary.append({
    "Original Feature": "OCC_MONTH",
    "Action Taken": "Converted to 'OCC_MONTH_Num' (numeric)",
    "Rationale": "Clustering algorithms require numeric data for distance computations"
})

# Convert OCC_HOUR to integer. The cast is applied to Data_Preparing_df (the
# frame that is exported below), not to the separate raw frame `df`, so the
# integer dtype actually reaches the saved CSV.
Data_Preparing_df['OCC_HOUR'] = Data_Preparing_df['OCC_HOUR'].astype(int)

# Step 2: convert day names to numbers (Monday=1 ... Sunday=7).
dow_mapping = {
    'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
    'Friday': 5, 'Saturday': 6, 'Sunday': 7,
}
Data_Preparing_df['OCC_DOW_Num'] = Data_Preparing_df['OCC_DOW'].map(dow_mapping)
steps_summary.append({
    "Original Feature": "OCC_DOW",
    "Action Taken": "Converted to 'OCC_DOW_Num' (numeric)",
    "Rationale": "Numeric encoding required for clustering algorithms"
})

# Steps 3-5: binary day-of-week indicators. isin() is the vectorised form of
# the membership test; unmapped (NaN) days match no group and therefore get 0.

# Step 3: weekend indicator (Saturday=6, Sunday=7).
Data_Preparing_df['DOW_Weekend'] = Data_Preparing_df['OCC_DOW_Num'].isin([6, 7]).astype(int)
steps_summary.append({
    "Original Feature": "OCC_DOW_Num",
    "Action Taken": "Created 'DOW_Weekend' (1 for Sat & Sun, else 0)",
    "Rationale": "Captures weekend-specific patterns"
})

# Step 4: beginning-of-week indicator (Monday=1, Tuesday=2, Wednesday=3).
Data_Preparing_df['DOW_Begin'] = Data_Preparing_df['OCC_DOW_Num'].isin([1, 2, 3]).astype(int)
steps_summary.append({
    "Original Feature": "OCC_DOW_Num",
    "Action Taken": "Created 'DOW_Begin' (1 for Mon-Tue-Wed, else 0)",
    "Rationale": "Helps detect trends at the beginning of the week"
})

# Step 5: mid-week indicator (Thursday=4, Friday=5).
Data_Preparing_df['DOW_Mid'] = Data_Preparing_df['OCC_DOW_Num'].isin([4, 5]).astype(int)
steps_summary.append({
    "Original Feature": "OCC_DOW_Num",
    "Action Taken": "Created 'DOW_Mid' (1 for Thu & Fri, else 0)",
    "Rationale": "Distinguishes mid-week patterns"
})

# Column count after feature engineering (the five new columns are included).
final_cols_count = len(Data_Preparing_df.columns)

# Closing summary row: initial vs. final column counts. The values embed HTML
# markup on purpose; they are rendered as-is in the summary table.
steps_summary.append({
    "Original Feature": "Columns affected in <br><strong>5.4.3 Feature Engineering - Approach_2</strong>",
    "Action Taken": "Initial Columns: <strong><br>" + str(initial_cols_count) + "</strong>",
    "Rationale": "Final Columns: <strong><br>" + str(final_cols_count) + "</strong>",
})

                                                                                  # Build HTML Table for Feature Engineering Phase with alternate row shading
# Static opening of the summary table: title banner plus the three column headings.
table_header = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="3" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">5.4.3 Feature Engineering Phase - Approach_2</th>
        </tr>
        <tr>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Original Feature</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Action Taken</th>
            <th style='border: 1px solid #dddddd; padding: 8px;'>Rationale</th>
        </tr>
    </thead>
    <tbody>
"""

# One <tr> per recorded step. Rows alternate between light gray (even index)
# and white (odd index); the step values are inserted verbatim, HTML included.
row_colors = ("#f2f2f2", "white")
table_rows = [
    f"""
        <tr style='border: 1px solid #dddddd; background-color: {row_colors[idx % 2]};'>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{entry["Original Feature"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{entry["Action Taken"]}</td>
            <td style='border: 1px solid #dddddd; padding: 8px;'>{entry["Rationale"]}</td>
        </tr>
    """
    for idx, entry in enumerate(steps_summary)
]

# Footer note shown in a full-width cell at the bottom of the table.
note_text = (
    "Feature Engineering completed and saved as <span style='color: green;'>FEngineered_New.csv</span> "
    "for further analysis."
)
table_footer = f"""
        <tr style='border: 1px solid #dddddd;'>
            <td colspan="3" style='border: 1px solid #dddddd; padding: 8px; background-color: #f8f8f8;'><strong>{note_text}</strong></td>
        </tr>
    </tbody>
</table>
"""

# Assemble the pieces in order and render the finished table in the notebook.
html_table = table_header + "".join(table_rows) + table_footer
display(HTML(html_table))

# Write the engineered frame to disk (without the index column) and hand the
# same file to the Colab download helper.
output_csv = "FEngineered_New.csv"
Data_Preparing_df.to_csv(output_csv, index=False)
files.download(output_csv)

5.4.3 Feature Engineering Phase - Approach_2,5.4.3 Feature Engineering Phase - Approach_2,5.4.3 Feature Engineering Phase - Approach_2
Original Feature,Action Taken,Rationale
OCC_MONTH,Converted to 'OCC_MONTH_Num' (numeric),Clustering algorithms require numeric data for distance computations
OCC_DOW,Converted to 'OCC_DOW_Num' (numeric),Numeric encoding required for clustering algorithms
OCC_DOW_Num,"Created 'DOW_Weekend' (1 for Sat & Sun, else 0)",Captures weekend-specific patterns
OCC_DOW_Num,"Created 'DOW_Begin' (1 for Mon-Tue-Wed, else 0)",Helps detect trends at the beginning of the week
OCC_DOW_Num,"Created 'DOW_Mid' (1 for Thu & Fri, else 0)",Distinguishes mid-week patterns
Columns affected in 5.4.3 Feature Engineering - Approach_2,Initial Columns: 18,Final Columns: 23
Feature Engineering completed and saved as FEngineered_New.csv for further analysis.,Feature Engineering completed and saved as FEngineered_New.csv for further analysis.,Feature Engineering completed and saved as FEngineered_New.csv for further analysis.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>