<a href="https://colab.research.google.com/github/Maddi007-Py/Maddi007-Py-CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.2%20Creating%20Target%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.2 Creating Target Dataset**

In [2]:

import warnings
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
import matplotlib.pyplot as plt
import os
import contextlib
from google.colab import files
from IPython.display import display, HTML

print("\n\n")

# Keep FutureWarning / DeprecationWarning messages out of the cell output
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Name (with extension) of the CSV to read from the Kaggle dataset
file_path = "major-crime-indicators.csv"

# Load the dataset into a DataFrame. Anything the loader prints to stdout
# (e.g. download progress) is sent to the null device while the call runs.
with open(os.devnull, 'w') as devnull_sink, contextlib.redirect_stdout(devnull_sink):
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "maddi007py/major-crime-indicators-toronto-march-2025",  # Dataset handle
        file_path,
    )

# Row count of the unfiltered data, reported later as the "Initial Load" figure
initial_count = len(df)

# ---------------------------
# (UCR_CODE, UCR_EXT) pairs selecting the offences that make up the target dataset
AUTO_THEFT_UCR = (2135, 210)       # Theft of a Motor Vehicle (Auto Theft)
VEHICLE_JACKING_UCR = (1610, 140)  # Robbery - Vehicle Jacking


def _filter_by_ucr(frame, ucr_code, ucr_ext):
    """Return a copy of the rows of ``frame`` matching one UCR code/extension pair.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame providing the ``UCR_CODE`` and ``UCR_EXT`` columns.
    ucr_code, ucr_ext
        Values compared for equality against ``UCR_CODE`` and ``UCR_EXT``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the matching rows; index labels are kept as-is.

    Raises
    ------
    KeyError
        If either column is missing from ``frame``.
    """
    matches = (frame['UCR_CODE'] == ucr_code) & (frame['UCR_EXT'] == ucr_ext)
    return frame[matches].copy()


# ---------------------------
# Filter 1: UCR Code 2135 with UCR Extension 210
filter1_df = _filter_by_ucr(df, *AUTO_THEFT_UCR)
count1 = filter1_df.shape[0]

# ---------------------------
# Filter 2: UCR Code 1610 with UCR Extension 140
filter2_df = _filter_by_ucr(df, *VEHICLE_JACKING_UCR)
count2 = filter2_df.shape[0]

# Final dataset: Filter 1 rows followed by Filter 2 rows.
# pd.concat already returns a new frame, so no additional .copy() is needed.
final_df = pd.concat([filter1_df, filter2_df])
final_count = final_df.shape[0]

# Save the final dataset as Target_Dataset.csv (index labels are not written)
final_df.to_csv('Target_Dataset.csv', index=False)

# Rows removed overall: everything in the initial load that is not in the final dataset
rows_filtered_out = initial_count - final_count

# Rows of the summary table, top to bottom: one dict per filter, then a totals row.
# The keys double as the column headings of the rendered table.
steps_summary = [
    {
        "Step Taken": "Filter 1: UCR Code 2135 with UCR Extension 210",
        "Before Action": initial_count,
        "Affected by Action": count1,
        "After Action": count1,
        "Unit": "Rows",
    },
    {
        "Step Taken": "Filter 2: UCR Code 1610 with UCR Extension 140",
        # Starts from the 'After Action' value of Filter 1
        "Before Action": count1,
        "Affected by Action": count2,
        "After Action": final_count,
        "Unit": "Rows",
    },
    {
        # Totals row: cell values carry inline HTML (<br>, <strong>) for the rendered table
        "Step Taken": "Rows Affected in <strong>UCR Filtering</strong>",
        "Before Action": "Initial Load:<br><strong>" + str(initial_count) + "</strong>",
        "Affected by Action": "Rows Filtered:<br><strong>" + str(rows_filtered_out) + "</strong>",
        "After Action": "Final Count:<br><strong>" + str(final_count) + "</strong>",
        "Unit": "Rows",
    },
]

# Path the standalone HTML copy of the summary table is written to
html_output_filename = '/content/5.2 Target Dataset.html'

# Fixed opening of the table: title banner, column headings and the start of <tbody>
table_head = """
<table style='border-collapse: collapse; width: 100%; font-size: 18px;'>
    <thead style='background-color: #4CAF50; color: white;'>
        <tr>
            <th colspan="5" style="text-align: center; font-size: 24px; background-color: #2f4f4f; color: white;">
                5.2 Creating Target Dataset
            </th>
        </tr>
        <tr>
            <th>Step Taken</th>
            <th>Before Action</th>
            <th>Affected by Action</th>
            <th>After Action</th>
            <th>Unit</th>
        </tr>
    </thead>
    <tbody>
"""

# Keys of each step dict in left-to-right column order
column_order = ("Step Taken", "Before Action", "Affected by Action", "After Action", "Unit")

# Markup shared by every body row
row_open = "\n    <tr style='border: 1px solid #dddddd;'>\n"
row_close = "    </tr>\n"

# Collect the pieces of the document and join them once at the end
html_parts = [table_head]
for step in steps_summary:
    cells = "".join(
        f"        <td style='border: 1px solid #dddddd; padding: 8px;'>{step[column]}</td>\n"
        for column in column_order
    )
    # Each data row is followed by four spaces of indentation before the next piece
    html_parts.append(row_open + cells + row_close + "    ")

# Explanatory note shown in a final row spanning all five columns
note_text = "".join([
    "<strong>Note: The dataset contains ",
    "<span style='color: darkred; '>ALL CRIMES</span>, but our research focuses on ",
    "<span style='color: green; '>MOTOR VEHICLE THEFTS</span>. ",
    "Therefore, we applied two filters: <br>",
    "• Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and <br>",
    "• Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking.<br>",
    "The target dataset has been saved as <span style='color: blue;'>'Target_Dataset.csv'</span> for further analysis. </strong>",
])
html_parts.append(
    row_open
    + f"""        <td colspan="5" style='border: 1px solid #dddddd; padding: 8px;'>{note_text}</td>\n"""
    + row_close
)
html_parts.append("</tbody></table>")

html_table = "".join(html_parts)


print("\n\n")
# Render the table inline in the notebook
display(HTML(html_table))

# Write the same markup to disk, then hand the file to the Colab download helper
with open(html_output_filename, 'w', encoding='utf-8') as html_file:
    html_file.write(html_table)
files.download(html_output_filename)
print("\n\n")









5.2 Creating Target Dataset,5.2 Creating Target Dataset,5.2 Creating Target Dataset,5.2 Creating Target Dataset,5.2 Creating Target Dataset
Step Taken,Before Action,Affected by Action,After Action,Unit
Filter 1: UCR Code 2135 with UCR Extension 210,420200,68063,68063,Rows
Filter 2: UCR Code 1610 with UCR Extension 140,68063,1513,69576,Rows
Rows Affected in UCR Filtering,Initial Load: 420200,Rows Filtered: 350624,Final Count: 69576,Rows
"Note: The dataset contains ALL CRIMES, but our research focuses on MOTOR VEHICLE THEFTS. Therefore, we applied two filters: • Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and • Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking. The target dataset has been saved as 'Target_Dataset.csv' for further analysis.","Note: The dataset contains ALL CRIMES, but our research focuses on MOTOR VEHICLE THEFTS. Therefore, we applied two filters: • Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and • Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking. The target dataset has been saved as 'Target_Dataset.csv' for further analysis.","Note: The dataset contains ALL CRIMES, but our research focuses on MOTOR VEHICLE THEFTS. Therefore, we applied two filters: • Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and • Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking. The target dataset has been saved as 'Target_Dataset.csv' for further analysis.","Note: The dataset contains ALL CRIMES, but our research focuses on MOTOR VEHICLE THEFTS. Therefore, we applied two filters: • Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and • Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking. The target dataset has been saved as 'Target_Dataset.csv' for further analysis.","Note: The dataset contains ALL CRIMES, but our research focuses on MOTOR VEHICLE THEFTS. Therefore, we applied two filters: • Filter 1: UCR Code 2135 with UCR Extension 210 for Theft of a Motor Vehicle (Auto Theft), and • Filter 2: UCR Code 1610 with UCR Extension 140 for Robbery - Vehicle Jacking. The target dataset has been saved as 'Target_Dataset.csv' for further analysis."


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>




