<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/eden-branch/LGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import zipfile
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# ✅ שלב 1: שכפול המאגר מה-GitHub
!git clone https://github.com/orifelszer/CrimeData.git

# ✅ שלב 2: פונקציה לפתיחת קובצי ZIP מהתיקייה המקומית
def load_zipped_csv_local(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        csv_name = zip_ref.namelist()[0]
        with zip_ref.open(csv_name) as file:
            return pd.read_csv(file)

# ✅ שלב 3: קריאת הנתונים לאחר השכפול (נתיב מקומי)
X_train = load_zipped_csv_local('CrimeData/X_train_supervised.zip')
X_test = load_zipped_csv_local('CrimeData/X_test_supervised.zip')
y_train = load_zipped_csv_local('CrimeData/y_train_supervised.zip')
y_test = load_zipped_csv_local('CrimeData/y_test_supervised.zip')

# ✅ בדיקה שהנתונים נטענו כראוי
print(f"X_train Shape: {X_train.shape}")
print(f"X_test Shape: {X_test.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"y_train Shape: {y_train.shape}")

Cloning into 'CrimeData'...
remote: Enumerating objects: 328, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Total 328 (delta 81), reused 18 (delta 18), pack-reused 188 (from 2)[K
Receiving objects: 100% (328/328), 209.78 MiB | 21.08 MiB/s, done.
Resolving deltas: 100% (164/164), done.
X_train Shape: (1257198, 10)
X_test Shape: (376971, 10)
y_train Shape: (1257198, 1)
y_train Shape: (1257198, 1)


In [29]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from lightgbm import LGBMClassifier
import numpy as np

In [30]:
# חישוב משקלות מחלקות
class_weights = compute_class_weight('balanced', classes=np.unique(y_train.values.ravel()), y=y_train.values.ravel())
class_weight_dict = {i: weight for i, weight in zip(np.unique(y_train.values.ravel()), class_weights)}

In [49]:
lgbm_model = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=500,
    learning_rate=0.03,
    max_depth=8,
    num_leaves=50,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight=class_weight_dict  # שילוב משקלות המחלקות
)

In [50]:
# אימון המודל
lgbm_model.fit(X_train.values, y_train.values.ravel())



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 858
[LightGBM] [Info] Number of data points in the train set: 1257198, number of used features: 10
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.639057
[LightGBM] [Info] Start training from score -2.63905



In [46]:
# תחזיות
lgbm_predictions = lgbm_model.predict(X_test)

In [47]:
# פונקציה להערכת המודל
def evaluate_model(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1 Score": f1_score(y_true, y_pred, average="weighted")
    }

In [48]:
# הערכת הביצועים
lgbm_metrics = evaluate_model(y_test.values.ravel(), lgbm_predictions)
print("Updated LGBM Metrics:", lgbm_metrics)

Updated LGBM Metrics: {'Accuracy': 0.11623175257513177, 'Precision': 0.30193486146065557, 'Recall': 0.11623175257513177, 'F1 Score': 0.13669936106608072}
