<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/eden-branch/Creating_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Download the preprocessing module (prepare_data.py) from GitHub into the working directory.
# NOTE(review): the URL points at the `main` branch, so the downloaded code can change between runs.
!wget -O prepare_data.py "https://raw.githubusercontent.com/orifelszer/CrimeData/main/prepare_data.py"

# Import Preprocessing from the file that was just downloaded
from prepare_data import Preprocessing

--2025-01-06 21:07:35--  https://raw.githubusercontent.com/orifelszer/CrimeData/main/prepare_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4307 (4.2K) [text/plain]
Saving to: ‘prepare_data.py’


2025-01-06 21:07:35 (26.7 MB/s) - ‘prepare_data.py’ saved [4307/4307]



Download and Extract the Crime Datasets (2019–2024)

In [3]:
import os
import zipfile

# הורדת הנתונים מה-GitHub
!git clone https://github.com/orifelszer/CrimeData.git

zip_folder = 'CrimeData'
# ✅ קריאה רק לקבצים בשנים 2019 עד 2024
zip_files = [f for f in os.listdir(zip_folder) if f.startswith('crimes') and any(str(year) in f for year in range(2019, 2025))]

# ✅ פריסת הקבצים הנבחרים בלבד
for zip_file in zip_files:
    zip_path = os.path.join(zip_folder, zip_file)
    extract_path = os.path.join(zip_folder, zip_file.replace('.zip', ''))
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extracted: {zip_file} -> {extract_path}")


Cloning into 'CrimeData'...
remote: Enumerating objects: 262, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 262 (delta 50), reused 18 (delta 18), pack-reused 188 (from 2)[K
Receiving objects: 100% (262/262), 176.81 MiB | 25.15 MiB/s, done.
Resolving deltas: 100% (133/133), done.
Extracted: crimes2023.zip -> CrimeData/crimes2023
Extracted: crimes2020.zip -> CrimeData/crimes2020
Extracted: crimes2021.zip -> CrimeData/crimes2021
Extracted: crimes2024.zip -> CrimeData/crimes2024
Extracted: crimes2019.zip -> CrimeData/crimes2019
Extracted: crimes2022.zip -> CrimeData/crimes2022


Load Crime Data into Pandas DataFrames

In [4]:
# Collect every CSV under CrimeData (searched recursively). The list is sorted
# because glob returns paths in a filesystem-dependent order; without a fixed
# order the row order of the concatenated table -- and therefore the outcome of
# the seeded train/test split -- could differ from one machine to another.
csv_files = sorted(glob.glob("CrimeData/**/*.csv", recursive=True))

# One DataFrame per file, keyed by the four characters before '.csv'
# (presumably the year, e.g. 'Crimes_2023' -- confirm against the file names).
# NOTE(review): two files sharing those four characters would overwrite each other here.
dataframes = {f"Crimes_{file[-8:-4]}": pd.read_csv(file) for file in csv_files}

In [5]:
# Stack the per-file frames row-wise into a single table with a fresh 0..n-1 index
per_file_frames = list(dataframes.values())
combined_data = pd.concat(per_file_frames, axis=0, ignore_index=True)

In [6]:
# Drop the rows whose StatisticGroupKod value is -1
has_valid_group_code = combined_data['StatisticGroupKod'] != -1
combined_data = combined_data.loc[has_valid_group_code]

In [None]:
# NOTE(review): disabled code -- neither variable is defined or used by the visible cells.
# y_city = combined_data['Yeshuv']  # crime location
# y_crime_type = combined_data['StatisticGroup']  # crime type

In [12]:
# Predictors: every column except StatisticGroup (the target) and StatisticGroupKod
features = combined_data.drop(columns=['StatisticGroup', 'StatisticGroupKod'])
target = combined_data['StatisticGroup']

# 80/20 train/test split with a fixed seed, stratified on the target column
split_options = dict(test_size=0.2, random_state=42, stratify=target)
X_train, X_test, y_train, y_test = train_test_split(features, target, **split_options)

In [15]:
# Run the imported Preprocessing function on the training features with fit_scaler=True.
# The call returns four values: the cleaned frame, train_mappings, scaler and label_encoders.
# NOTE(review): label_encoders is not used by any of the visible cells.
X_train_cleaned, train_mappings, scaler, label_encoders = Preprocessing(X_train, fit_scaler=True)
# Preprocess the test features with the mappings and scaler returned by the training call
# (presumably so that nothing is re-fitted on test data -- confirm in prepare_data.py).
X_test_cleaned, _, _, _ = Preprocessing(X_test, train_mappings=train_mappings, scaler=scaler)

In [16]:
# Column selection is based on the training set only, so the test set has no
# influence on which columns survive. A column is kept when its values sum to
# at least MIN_COLUMN_SUM in the training data.
# NOTE(review): presumably the columns are 0/1 indicators, making the sum an
# occurrence count -- confirm against prepare_data.py.
MIN_COLUMN_SUM = 10
columns_to_keep = [
    col for col in X_train_cleaned.columns
    if X_train_cleaned[col].sum() >= MIN_COLUMN_SUM
]

# Keep only the selected columns in the training set
X_train_cleaned = X_train_cleaned[columns_to_keep]

# Align the test set to the training columns in a single step: columns missing
# from the test set are created and filled with 0, extra columns are dropped,
# and the column order matches the training set. One reindex replaces the
# column-by-column insert loop, which mutated the test frame one column at a time.
X_test_cleaned = X_test_cleaned.reindex(columns=columns_to_keep, fill_value=0)

In [18]:
# Re-select the targets by the index labels of the cleaned feature frames so that
# features and targets contain the same rows in the same order.
# NOTE(review): assumes Preprocessing keeps the original index labels -- confirm.
train_index = X_train_cleaned.index
test_index = X_test_cleaned.index
y_train = y_train.loc[train_index]
y_test = y_test.loc[test_index]

In [19]:
# Reducing memory usage
def optimize_data_types(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    Conversions applied per column:
      * ``object``  -> ``category``
      * ``float64`` -> ``float32``
      * ``int64``   -> ``int32``, but only when every value fits in the int32
        range; otherwise the column is left as ``int64``, because a plain
        ``astype('int32')`` would silently wrap out-of-range values.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like (object) column
        if col_type == 'object':
            df[col] = df[col].astype('category')

        # Continuous numeric column
        elif col_type == 'float64':
            df[col] = df[col].astype('float32')

        # Integer column: downcast only when it is lossless
        elif col_type == 'int64':
            column = df[col]
            # An empty column has no min/max to compare, and nothing to lose
            if column.empty or (column.min() >= int32_min and column.max() <= int32_max):
                df[col] = column.astype('int32')
    return df

# Downcast both feature frames (optimize_data_types also modifies each frame in place)
X_train_cleaned, X_test_cleaned = map(optimize_data_types, (X_train_cleaned, X_test_cleaned))

In [21]:
# Fit one label encoder on the training targets only
label_encoder = LabelEncoder().fit(y_train)

# Encode both splits with that same fitted encoder (the test set is never used for fitting)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [23]:
# Write the cleaned features to CSV files in the working directory (row index omitted)
X_train_cleaned.to_csv('X_train_supervised.csv', index=False)
X_test_cleaned.to_csv('X_test_supervised.csv', index=False)

# Write each encoded target array as a one-column CSV with the header 'target'
for encoded_target, csv_name in (
    (y_train_encoded, 'y_train_supervised.csv'),
    (y_test_encoded, 'y_test_supervised.csv'),
):
    pd.DataFrame({'target': encoded_target}).to_csv(csv_name, index=False)