<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/eden-branch/Creating_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Download the preprocessing helper module from GitHub.
# `-O Prepare_Data.py` writes the response to that file name in the current
# working directory, overwriting any existing copy.
# NOTE(review): the script is fetched from the `main` branch while the Colab
# badge of this notebook points at `eden-branch` — confirm this is the intended version.
!wget -O Prepare_Data.py "https://raw.githubusercontent.com/orifelszer/CrimeData/refs/heads/main/Prepare_Data.py"

# Import the preprocessing function from the file downloaded above
from Prepare_Data import Preprocessing_Multitask_Updated

--2025-01-06 13:49:45--  https://raw.githubusercontent.com/orifelszer/CrimeData/refs/heads/main/Prepare_Data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3067 (3.0K) [text/plain]
Saving to: ‘Prepare_Data.py’


2025-01-06 13:49:45 (36.2 MB/s) - ‘Prepare_Data.py’ saved [3067/3067]



Download and Extract the Crime Datasets (2019–2024)

In [None]:
import os
import zipfile

# הורדת הנתונים מה-GitHub
!git clone https://github.com/orifelszer/CrimeData.git

zip_folder = 'CrimeData'
# ✅ קריאה רק לקבצים בשנים 2019 עד 2024
zip_files = [f for f in os.listdir(zip_folder) if f.startswith('crimes') and any(str(year) in f for year in range(2019, 2025))]

# ✅ פריסת הקבצים הנבחרים בלבד
for zip_file in zip_files:
    zip_path = os.path.join(zip_folder, zip_file)
    extract_path = os.path.join(zip_folder, zip_file.replace('.zip', ''))
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extracted: {zip_file} -> {extract_path}")


Cloning into 'CrimeData'...
remote: Enumerating objects: 171, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 171 (delta 62), reused 13 (delta 13), pack-reused 68 (from 1)[K
Receiving objects: 100% (171/171), 127.12 MiB | 23.71 MiB/s, done.
Resolving deltas: 100% (87/87), done.
Extracted: crimes2024.zip -> CrimeData/crimes2024
Extracted: crimes2019.zip -> CrimeData/crimes2019
Extracted: crimes2020.zip -> CrimeData/crimes2020
Extracted: crimes2023.zip -> CrimeData/crimes2023
Extracted: crimes2021.zip -> CrimeData/crimes2021
Extracted: crimes2022.zip -> CrimeData/crimes2022


Load Crime Data into Pandas DataFrames

In [None]:
# Collect every CSV below the CrimeData folder. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them: the row order of the
# combined frame (and with it the seeded train/test split) then stays the
# same across machines and re-runs.
csv_files = sorted(glob.glob("CrimeData/**/*.csv", recursive=True))

# Key each frame by the four characters that precede ".csv" in its path
# (presumably the year, e.g. "Crimes_2019" — verify against the file names;
# two files sharing those four characters would overwrite each other here).
dataframes = {f"Crimes_{file[-8:-4]}": pd.read_csv(file) for file in csv_files}

In [None]:
# Stack the per-file frames row-wise into a single table with a fresh 0..n-1 index.
combined_data = pd.concat(list(dataframes.values()), axis=0, ignore_index=True)

In [None]:
# Remove rows whose StatisticGroupKod column holds the value -1
valid_group_mask = combined_data['StatisticGroupKod'].ne(-1)
combined_data = combined_data.loc[valid_group_mask]

In [None]:
# Prediction targets: crime location ('Yeshuv') and crime type ('StatisticGroup')
y_city, y_crime_type = (
    combined_data[target_column] for target_column in ('Yeshuv', 'StatisticGroup')
)

In [None]:
# Remove the prediction columns (both targets and their 'Kod' counterparts)
# so that only feature columns remain.
target_columns = ['StatisticGroup', 'StatisticGroupKod', 'Yeshuv', 'YeshuvKod']
features = combined_data.drop(columns=target_columns)

# Split features and both targets together: 20% test, fixed seed,
# stratified on the crime-type target.
split_parts = train_test_split(
    features,
    y_city,
    y_crime_type,
    test_size=0.2,
    random_state=42,
    stratify=y_crime_type,
)
X_train, X_test, y_city_train, y_city_test, y_crime_train, y_crime_test = split_parts

In [None]:
# Run the multitask preprocessing on the training features with fit_scaler=True;
# the call returns the cleaned frame, the mappings and the scaler.
train_outputs = Preprocessing_Multitask_Updated(X_train, fit_scaler=True)
X_train_cleaned, train_mappings, scaler = train_outputs

# Preprocess the test features, passing in the mappings and scaler obtained from
# the training call; only the first returned element (the cleaned frame) is kept.
test_outputs = Preprocessing_Multitask_Updated(X_test, train_mappings=train_mappings, scaler=scaler)
X_test_cleaned = test_outputs[0]

In [None]:
# Minimum column total, measured on the training set only, required to keep a column.
MIN_COLUMN_SUM = 10

# Decide which columns to keep based on the training set alone.
# NOTE(review): a sum threshold looks aimed at sparse indicator columns; if the
# preprocessing also standardises numeric columns their sums could fall below
# the threshold — confirm against Prepare_Data.py.
columns_to_keep = [
    col for col in X_train_cleaned.columns
    if X_train_cleaned[col].sum() >= MIN_COLUMN_SUM
]

# Keep only the relevant training columns.
X_train_cleaned = X_train_cleaned[columns_to_keep]

# Conform the test set to what was decided on the training set: a single
# reindex adds any column missing from the test frame (filled with 0), drops
# the extra ones and puts the columns in the same order as the training frame.
X_test_cleaned = X_test_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)

In [None]:
# Re-select the target rows by the index labels of the cleaned feature frames,
# so each target series follows the rows (and row order) of its feature frame.
train_index, test_index = X_train_cleaned.index, X_test_cleaned.index
y_city_train, y_crime_train = y_city_train.loc[train_index], y_crime_train.loc[train_index]
y_city_test, y_crime_test = y_city_test.loc[test_index], y_crime_test.loc[test_index]

In [None]:
# Reducing memory usage
def optimize_data_types(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * ``object`` columns become ``category``.
    * ``float64`` columns become ``float32``.
    * ``int64`` columns become ``int32``, but only when every value fits in the
      int32 range; otherwise the column is left as ``int64`` because a blind
      cast would silently wrap out-of-range values.

    Columns of any other dtype are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned, so the caller's object
        is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    for col in df.columns:
        col_type = df[col].dtype

        # Textual categorical variable
        if col_type == 'object':
            df[col] = df[col].astype('category')

        # Continuous numeric variable
        elif col_type == 'float64':
            df[col] = df[col].astype('float32')

        # Integer variable: downcast only when no value would overflow int32
        elif col_type == 'int64':
            values = df[col]
            fits_int32 = values.empty or (
                values.min() >= int32_min and values.max() <= int32_max
            )
            if fits_int32:
                df[col] = values.astype('int32')
    return df

# Apply the dtype optimisation to the training frame, then to the test frame
X_train_cleaned, X_test_cleaned = map(
    optimize_data_types, (X_train_cleaned, X_test_cleaned)
)

In [None]:
# One-hot encode the target columns; train and test are encoded independently,
# with the column prefixes "city" and "crime".
y_city_train_encoded, y_city_test_encoded = (
    pd.get_dummies(city_target, prefix="city")
    for city_target in (y_city_train, y_city_test)
)

y_crime_train_encoded, y_crime_test_encoded = (
    pd.get_dummies(crime_target, prefix="crime")
    for crime_target in (y_crime_train, y_crime_test)
)

In [None]:
# Number of one-hot columns in the wider of the train/test target frames.
# The counts are read from the encoded DataFrames: the raw targets are
# one-dimensional Series, so they have no shape[1] and indexing it raises IndexError.
max_city_columns = max(y_city_train_encoded.shape[1], y_city_test_encoded.shape[1])
max_crime_columns = max(y_crime_train_encoded.shape[1], y_crime_test_encoded.shape[1])

In [None]:
# Align the one-hot encoded targets so train and test have matching dimensions.
# The alignment uses the union of the actual column labels: a category seen in
# only one of the two sets gets a column filled with 0 in the other. (The raw
# targets are Series and cannot be reindexed by columns, and reindexing to
# range(n) would discard every existing label.)
city_columns = y_city_train_encoded.columns.union(y_city_test_encoded.columns)
crime_columns = y_crime_train_encoded.columns.union(y_crime_test_encoded.columns)

y_city_train_encoded = y_city_train_encoded.reindex(columns=city_columns, fill_value=0)
y_city_test_encoded = y_city_test_encoded.reindex(columns=city_columns, fill_value=0)
y_crime_train_encoded = y_crime_train_encoded.reindex(columns=crime_columns, fill_value=0)
y_crime_test_encoded = y_crime_test_encoded.reindex(columns=crime_columns, fill_value=0)

# Check the data after the fix: identical labels in identical order.
assert list(y_city_train_encoded.columns) == list(y_city_test_encoded.columns)
assert list(y_crime_train_encoded.columns) == list(y_crime_test_encoded.columns)
print(f"After fixing: {y_city_train_encoded.shape[1]} == {y_city_test_encoded.shape[1]}")

In [None]:
# Save the cleaned feature frames and the one-hot encoded targets as CSV files
# (written without the index column).
csv_outputs = {
    'X_train_supervised.csv': X_train_cleaned,
    'X_test_supervised.csv': X_test_cleaned,
    'y_city_train_supervised.csv': y_city_train_encoded,
    'y_city_test_supervised.csv': y_city_test_encoded,
    'y_crime_train_supervised.csv': y_crime_train_encoded,
    'y_crime_test_supervised.csv': y_crime_test_encoded,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)