### IMPORTING NECESSARY DATASET AND LIBRARIES

In [1]:
import pandas as pd

In [2]:
# Read the raw crimes CSV from the local Dataset/ folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='Dataset/Crimes_dataset.csv')

### CLEANING

In [3]:
# Report the (rows, columns) of the raw frame as a baseline before any cleaning
print(f"Original dataset shape: {df.shape}\n")

Original dataset shape: (8404727, 22)



In [4]:
# --- Step 1: Feature Selection ---
# Focus on the key columns as specified in the document: ID, Date, Primary Type,
# Description, Location Description, and Community Area.
key_columns = ['ID', 'Date', 'Primary Type', 'Description', 'Location Description', 'Community Area']
# Take an explicit copy so that later column assignments operate on an independent
# frame rather than on a selection derived from the raw data.
df = df[key_columns].copy()

In [5]:
# --- Step 2: Timeframe Filtering ---
# Keep only records dated 2015 through 2025 (both years inclusive) so the model
# stays relevant to current trends.
START_YEAR = 2015
END_YEAR = 2025

# Parse the timestamps once and derive the year a single time instead of
# recomputing `.dt.year` for each bound.
# NOTE(review): if every raw 'Date' string shares one layout, passing an explicit
# `format=` to pd.to_datetime would likely speed this up - confirm the layout first.
parsed_dates = pd.to_datetime(df['Date'])
in_window = parsed_dates.dt.year.between(START_YEAR, END_YEAR)

# Filter first, then attach the parsed dates with `assign`, which returns a new
# frame: nothing is written into a selection of another DataFrame, so later cells
# can modify `df` without chained-assignment warnings.
df = df.loc[in_window].assign(Date=parsed_dates[in_window])

In [6]:
# Replace the spaces in the multi-word column names with underscores.
# A single non-inplace rename keeps the whole mapping in one place and rebinds
# `df` to the renamed frame instead of mutating the filtered frame three times.
df = df.rename(columns={
    'Primary Type': 'Primary_Type',
    'Location Description': 'Location_Description',
    'Community Area': 'Community_Area',
})

In [7]:
# Report the (rows, columns) after column selection and year filtering;
# rows with missing values have not been dropped yet at this point
print(f"Modified dataset shape: {df.shape}\n")

Modified dataset shape: (2684884, 6)



In [8]:
# Tally the missing entries in every column to see what the cleaning step must handle
null_counts = df.isna().sum()

# Header line followed by the per-column counts
print("Null values per column:", null_counts, sep="\n")

Null values per column:
ID                          0
Date                        0
Primary_Type                0
Description                 0
Location_Description    13043
Community_Area            177
dtype: int64


In [9]:
# Discard every row in which at least one column is missing
df_cleaned = df.dropna(axis=0, how='any')

# Sanity check: each per-column null count should now be zero
print("\nNull values in the cleaned DataFrame:", df_cleaned.isna().sum(), sep="\n")


Null values in the cleaned DataFrame:
ID                      0
Date                    0
Primary_Type            0
Description             0
Location_Description    0
Community_Area          0
dtype: int64


In [10]:
# Preview the cleaned frame (the one that gets exported), not the pre-dropna `df`
df_cleaned.head()

Unnamed: 0,ID,Date,Primary_Type,Description,Location_Description,Community_Area
0,13311263,2022-07-29 03:39:00,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,RESIDENCE,30.0
1,13053066,2023-01-03 16:44:00,NARCOTICS,MANUFACTURE / DELIVER - CRACK,SIDEWALK,26.0
2,12131221,2020-08-10 09:45:00,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,STREET,24.0
3,11227634,2017-08-26 10:00:00,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,HOTEL/MOTEL,32.0
4,13203321,2023-09-06 17:00:00,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT / GARAGE (NON RESIDENTIAL),32.0


In [11]:
# Destination for the cleaned data, next to the raw CSV in Dataset/
output_path = 'Dataset/cleaned_crimes_dataset.csv'

# Write the cleaned frame to disk; index=False leaves the DataFrame's row index
# out of the file so it does not come back as an extra column when re-read
df_cleaned.to_csv(path_or_buf=output_path, index=False)

print(f"DataFrame successfully exported to {output_path}")

DataFrame successfully exported to Dataset/cleaned_crimes_dataset.csv
