<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/eden-branch/Creating_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Download the helper module from GitHub into the working directory as Data_Handling.py
# NOTE(review): the URL tracks the `main` branch, so the downloaded code is not pinned to a revision.
!wget -O Data_Handling.py "https://raw.githubusercontent.com/orifelszer/CrimeData/main/Data_Handling.py"

# Import the preprocessing function from the downloaded file
from Data_Handling import Preprocessing

--2025-01-06 06:40:15--  https://raw.githubusercontent.com/orifelszer/CrimeData/main/Data_Handling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5256 (5.1K) [text/plain]
Saving to: ‘Data_Handling.py’


2025-01-06 06:40:15 (35.3 MB/s) - ‘Data_Handling.py’ saved [5256/5256]



Clone the CrimeData Repository and Extract the Yearly Crime Archives (2019–2024)

In [4]:
!git clone https://github.com/orifelszer/CrimeData.git

import os
import zipfile

zip_folder = 'CrimeData'

zip_files = [f for f in os.listdir(zip_folder) if f.endswith('.zip')]

for zip_file in zip_files:
    zip_path = os.path.join(zip_folder, zip_file)
    extract_path = os.path.join(zip_folder, zip_file.replace('.zip', ''))

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extracted: {zip_file} -> {extract_path}")

Cloning into 'CrimeData'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 120 (delta 35), reused 13 (delta 13), pack-reused 68 (from 1)[K
Receiving objects: 100% (120/120), 97.30 MiB | 42.09 MiB/s, done.
Resolving deltas: 100% (60/60), done.
Extracted: crimes2024.zip -> CrimeData/crimes2024
Extracted: crimes2019.zip -> CrimeData/crimes2019
Extracted: crimes2020.zip -> CrimeData/crimes2020
Extracted: crimes2023.zip -> CrimeData/crimes2023
Extracted: crimes2021.zip -> CrimeData/crimes2021
Extracted: crimes2022.zip -> CrimeData/crimes2022


Load Crime Data into Pandas DataFrames

In [5]:
# File names this notebook itself writes later on; if they ever end up inside
# the CrimeData folder they must not be loaded back in as raw crime data.
generated_outputs = {'X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv'}

# glob returns matches in arbitrary order. Sorting fixes the concatenation
# order, which the seeded train/test split depends on for reproducibility.
csv_files = sorted(
    path for path in glob.glob("CrimeData/**/*.csv", recursive=True)
    if os.path.basename(path) not in generated_outputs
)

# Key each frame by the four characters before ".csv"
# (presumably the year, e.g. "2019" — TODO confirm against the archive contents).
dataframes = {f"Crimes_{file[-8:-4]}": pd.read_csv(file) for file in csv_files}

In [6]:
# Stack all per-file frames row-wise into one table; ignore_index=True replaces
# the per-file row labels with a fresh 0..n-1 index.
combined_data = pd.concat(dataframes, axis=0, ignore_index=True)

In [7]:
# Drop the rows whose StatisticGroupKod value is -1
combined_data = combined_data.loc[combined_data['StatisticGroupKod'].ne(-1)]

In [8]:
# Keep both representations of the target: the text label and its numeric code
statistic_group = combined_data['StatisticGroup']
statistic_group_kod = combined_data['StatisticGroupKod']

# Remove the target columns from the feature matrix
features = combined_data.drop(['StatisticGroup', 'StatisticGroupKod'], axis=1)

# Split into training and test sets.
# The numeric code is the prediction target. The original cell passed the text
# column here and stored the codes in y_*_text, i.e. the two were swapped.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    statistic_group_kod,  # only StatisticGroupKod is used as the prediction target
    test_size=0.2,
    random_state=42,
    stratify=statistic_group_kod  # preserve the class distribution in both splits
    )

# Keep the matching StatisticGroup text labels (for possible future use)
y_train_text = statistic_group.loc[X_train.index]
y_test_text = statistic_group.loc[X_test.index]

In [9]:
# Clean the training set; the call also returns the mappings and the scaler
# (presumably fitted on the training data because fit_scaler=True — confirm in Data_Handling.py)
X_train_cleaned, train_mappings, scaler = Preprocessing(X_train, fit_scaler=True)

# Clean the test set, passing in the mappings and scaler obtained from the training set
X_test_cleaned, _, _ = Preprocessing(X_test, train_mappings=train_mappings, scaler=scaler)

In [10]:
# Decide which columns to keep, based on the training set only.
# NOTE(review): if Preprocessing standardizes numeric columns, their sum can
# fall below the threshold — confirm this filter is meant for count/indicator columns.
MIN_COLUMN_SUM = 10  # columns whose training-set total is below this are dropped
columns_to_keep = [
    col for col in X_train_cleaned.columns
    if X_train_cleaned[col].sum() >= MIN_COLUMN_SUM
]

# Keep only the relevant columns in the training set
X_train_cleaned = X_train_cleaned[columns_to_keep]

# Align the test set to the training columns in a single step: columns missing
# from the test set are added filled with 0, extra columns are dropped, and the
# column order matches the training set.
X_test_cleaned = X_test_cleaned.reindex(columns=X_train_cleaned.columns, fill_value=0)

In [11]:
# Working datasets: from here on X_train / X_test refer to the cleaned frames
X_train, X_test = X_train_cleaned, X_test_cleaned

In [12]:
# Align the labels with the row index of the cleaned feature frames
y_train, y_test = (
    y_train.loc[X_train_cleaned.index],
    y_test.loc[X_test_cleaned.index],
)

In [13]:
# Reducing memory usage
def optimize_data_types(df):
    """Downcast column dtypes in place to reduce memory usage.

    Conversions applied per column:
      * ``object``  -> ``category``
      * ``float64`` -> ``float32``
      * ``int64``   -> ``int32``, only when every value fits in the int32
        range. Wider columns stay ``int64`` because a plain cast would
        silently wrap around and corrupt the values.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like categorical column
        if col_type == 'object':
            df[col] = df[col].astype('category')

        # Continuous numeric column
        elif col_type == 'float64':
            df[col] = df[col].astype('float32')

        # Integer column: downcast only when it is lossless
        elif col_type == 'int64':
            column = df[col]
            # An empty column has no values that could overflow.
            if column.empty or (column.min() >= int32_min and column.max() <= int32_max):
                df[col] = column.astype('int32')
    return df

# Apply the dtype optimization to both feature sets
X_train, X_test = (optimize_data_types(frame) for frame in (X_train, X_test))

In [14]:
# Write the train/test features and labels to separate CSV files (row index omitted)
for frame, filename in (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    frame.to_csv(filename, index=False)


In [15]:
!git add X_train.csv X_test.csv y_train.csv y_test.csv
!git commit -m "Added cleaned datasets for training and testing"
!git push origin main

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [19]:
# העלאת הקבצים למאגר שממנו נטענה המחברת
!git pull origin eden-branch # למשוך שינויים לפני השמירה
!git add X_train_cleaned.csv X_test_cleaned.csv y_train.csv y_test.csv
!git commit -m "Upload cleaned datasets from Google Colab"
!git push origin main

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
