Importing Required Libraries

In [None]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Downloading and Importing Preprocessing Function

In [None]:
!wget -O prepare_data.py "https://raw.githubusercontent.com/orifelszer/CrimeData/refs/heads/main/Prepare_unsupervised_data_functions.py"
from Prepare_unsupervised_data_functions import Preprocessing

--2025-01-07 16:20:06--  https://raw.githubusercontent.com/orifelszer/CrimeData/main/prepare_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4721 (4.6K) [text/plain]
Saving to: ‘prepare_data.py’


2025-01-07 16:20:07 (39.9 MB/s) - ‘prepare_data.py’ saved [4721/4721]



Download and Extract Crime Datasets (2019–2024)

In [None]:
# === Downloading and Extracting Crime Data Files ===
import os
import zipfile

!git clone https://github.com/orifelszer/CrimeData.git

zip_folder = 'CrimeData'
# Extracting only ZIP files for the years 2019 to 2024
zip_files = [f for f in os.listdir(zip_folder) if f.startswith('crimes') and any(str(year) in f for year in range(2019, 2025))]

# Unzipping files and saving the new folder paths
extracted_folders = []

for zip_file in zip_files:
    zip_path = os.path.join(zip_folder, zip_file)
    extract_path = os.path.join(zip_folder, zip_file.replace('.zip', ''))
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    extracted_folders.append(extract_path)
    print(f"Extracted: {zip_file} -> {extract_path}")


Cloning into 'CrimeData'...
remote: Enumerating objects: 346, done.[K
remote: Counting objects: 100% (158/158), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 346 (delta 89), reused 18 (delta 18), pack-reused 188 (from 2)[K
Receiving objects: 100% (346/346), 211.19 MiB | 14.42 MiB/s, done.
Resolving deltas: 100% (172/172), done.
Extracted: crimes2023.zip -> CrimeData/crimes2023
Extracted: crimes2020.zip -> CrimeData/crimes2020
Extracted: crimes2021.zip -> CrimeData/crimes2021
Extracted: crimes2024.zip -> CrimeData/crimes2024
Extracted: crimes2019.zip -> CrimeData/crimes2019
Extracted: crimes2022.zip -> CrimeData/crimes2022


Load Crime Data into Pandas DataFrames

In [None]:
# Collect every CSV found directly inside the extracted folders.
# glob returns paths in arbitrary order, so sort for a reproducible row order.
csv_files = []
for folder in extracted_folders:
    csv_files += sorted(glob.glob(os.path.join(folder, "*.csv")))

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" (same exception type as before).
if not csv_files:
    raise ValueError(
        f"No CSV files found in extracted folders: {extracted_folders}"
    )

# Map "Crimes_<last four characters of the file stem>" -> DataFrame.
# NOTE(review): the key assumes file names end in the 4-digit year
# (e.g. '...2023.csv') — confirm against the archive contents.
dataframes = {}
for file in csv_files:
    base_key = f"Crimes_{file[-8:-4]}"
    key, duplicate = base_key, 1
    # Two files with the same suffix would otherwise overwrite each other and
    # silently drop a whole file from the combined data; disambiguate instead.
    while key in dataframes:
        duplicate += 1
        key = f"{base_key}_{duplicate}"
    dataframes[key] = pd.read_csv(file)

# Stack all files row-wise with a fresh 0..n-1 index.
combined_data = pd.concat(dataframes.values(), axis=0, ignore_index=True)

Applying Preprocessing Function

In [None]:
# Run the project's Preprocessing function (imported from
# Prepare_unsupervised_data_functions) on the combined multi-year frame.
# NOTE(review): its implementation is not visible in this notebook; the save
# step calls .to_csv on the result, so it presumably returns a DataFrame —
# confirm, and confirm whether it mutates combined_data in place.
Clean_data = Preprocessing(combined_data)

Saving the Cleaned Dataset for Unsupervised Learning

In [None]:
# Write the cleaned data to 'Clean_data_unsupervised.csv' (relative path, so it
# lands in the current working directory); index=False omits the row index.
Clean_data.to_csv('Clean_data_unsupervised.csv', index=False)