Importing Required Libraries

In [1]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Downloading and Importing Preprocessing Function

In [2]:
# Download the project's preprocessing helper module into the working directory.
# `-O` fixes the local file name so it matches the module name imported below.
# NOTE(review): the URL points at the `main` branch, so the downloaded code can
# change between runs; pin a commit hash in the URL if reproducibility matters.
!wget -O Prepare_unsupervised_data_functions.py "https://raw.githubusercontent.com/orifelszer/CrimeData/refs/heads/main/Prepare_unsupervised_data_functions.py"
# Import the preprocessing entry point from the module that was just downloaded.
from Prepare_unsupervised_data_functions import Preprocessing

--2025-01-11 13:50:47--  https://raw.githubusercontent.com/orifelszer/CrimeData/refs/heads/main/Prepare_unsupervised_data_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7816 (7.6K) [text/plain]
Saving to: ‘Prepare_unsupervised_data_functions.py’


2025-01-11 13:50:47 (48.4 MB/s) - ‘Prepare_unsupervised_data_functions.py’ saved [7816/7816]



Download and Extract Crime Datasets (2019–2024)

In [3]:
# === Downloading and Extracting Crime Data Files ===
import os
import zipfile

!git clone https://github.com/orifelszer/CrimeData.git

zip_folder = 'CrimeData'
# Extracting only ZIP files for the years 2019 to 2024
zip_files = [f for f in os.listdir(zip_folder) if f.startswith('crimes') and any(str(year) in f for year in range(2019, 2025))]

# Unzipping files and saving the new folder paths
extracted_folders = []

for zip_file in zip_files:
    zip_path = os.path.join(zip_folder, zip_file)
    extract_path = os.path.join(zip_folder, zip_file.replace('.zip', ''))
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    extracted_folders.append(extract_path)
    print(f"Extracted: {zip_file} -> {extract_path}")


Cloning into 'CrimeData'...
remote: Enumerating objects: 728, done.[K
remote: Counting objects: 100% (255/255), done.[K
remote: Compressing objects: 100% (155/155), done.[K
remote: Total 728 (delta 193), reused 100 (delta 100), pack-reused 473 (from 3)[K
Receiving objects: 100% (728/728), 243.97 MiB | 27.92 MiB/s, done.
Resolving deltas: 100% (372/372), done.
Extracted: crimes2020.zip -> CrimeData/crimes2020
Extracted: crimes2023.zip -> CrimeData/crimes2023
Extracted: crimes2024.zip -> CrimeData/crimes2024
Extracted: crimes2019.zip -> CrimeData/crimes2019
Extracted: crimes2021.zip -> CrimeData/crimes2021
Extracted: crimes2022.zip -> CrimeData/crimes2022


Load Crime Data into Pandas DataFrames

In [4]:
# Collect every CSV found directly inside the extracted folders. Sorting gives
# a stable file order, so the row order of `combined_data` does not depend on
# the order in which glob/the filesystem happens to list files.
csv_files = sorted(
    csv_path
    for folder in extracted_folders
    for csv_path in glob.glob(os.path.join(folder, "*.csv"))
)

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in the extracted folders: {extracted_folders}"
    )

# One DataFrame per file, keyed by "Crimes_" plus the four characters that
# precede the ".csv" extension (presumably the year — confirm against the
# actual file names).
dataframes = {}
for csv_path in csv_files:
    frame_key = f"Crimes_{csv_path[-8:-4]}"
    # Two files mapping to the same key would silently overwrite each other
    # and drop data from the combined frame, so surface that explicitly.
    if frame_key in dataframes:
        raise ValueError(
            f"Duplicate dataset key {frame_key!r} derived from {csv_path!r}"
        )
    dataframes[frame_key] = pd.read_csv(csv_path)

# Stack all per-file frames row-wise with a fresh 0..n-1 index.
combined_data = pd.concat(dataframes.values(), axis=0, ignore_index=True)

Applying Preprocessing Function

In [5]:
# Run the project's preprocessing routine (imported from
# Prepare_unsupervised_data_functions) on the combined frame.
# NOTE(review): the warning emitted by this cell mentions
# `fill_statistic_area_random`, which suggests random imputation inside
# Preprocessing — confirm, and seed the relevant RNG beforehand if the cleaned
# output needs to be reproducible.
Clean_data = Preprocessing(combined_data)

  lambda x: x.fillna(fill_statistic_area_random(x)).infer_objects(copy=False))


Saving the Cleaned Dataset for Unsupervised Learning

In [7]:
# === Write the cleaned dataset as two CSV files because a single file is too large ===
# Row position separating the two halves; with an odd row count the second
# half receives the extra row.
split_index = len(Clean_data) // 2
Clean_data_part1, Clean_data_part2 = (
    Clean_data.iloc[:split_index],
    Clean_data.iloc[split_index:],
)

# Persist each half to its own CSV, omitting the index column
for part_frame, csv_name in (
    (Clean_data_part1, 'Clean_data_unsupervised_part1.csv'),
    (Clean_data_part2, 'Clean_data_unsupervised_part2.csv'),
):
    part_frame.to_csv(csv_name, index=False)