<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/main/Baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import zipfile
import requests
from io import BytesIO
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)



In [None]:
# Base URL for your GitHub repository
base_url = "https://github.com/orifelszer/CrimeData/raw/main/"

# File names
files = {
    "X_train": "X_train_supervised.zip",
    "X_test": "X_test_supervised.zip",
    "y_train": "y_train_supervised.zip",
    "y_test": "y_test_supervised.zip"
}

# Function to read zip files from the repository
def load_data_from_repo(file_name):
    url = base_url + file_name
    response = requests.get(url)
    if response.status_code == 200:
        with zipfile.ZipFile(BytesIO(response.content)) as z:
            extracted_file_name = z.namelist()[0]
            with z.open(extracted_file_name) as f:
                return pd.read_csv(f)
    else:
        raise Exception(f"Failed to download {file_name}")

# Load datasets
X_train = load_data_from_repo(files["X_train"])
X_test = load_data_from_repo(files["X_test"])
y_train = load_data_from_repo(files["y_train"]).values.ravel()
y_test = load_data_from_repo(files["y_test"]).values.ravel()

In [3]:
# Handle imbalanced data using SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [4]:
# Most frequent classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_balanced, y_train_balanced)

# Predict on the test set
y_pred_dummy = dummy_clf.predict(X_test)

# Evaluate the most frequent classifier
dummy_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_dummy),
    "Precision": precision_score(y_test, y_pred_dummy, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred_dummy, average='weighted', zero_division=0),
    "F1 Score": f1_score(y_test, y_pred_dummy, average='weighted', zero_division=0),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_dummy)
}

print("Most Frequent Classifier Metrics:")
for metric, value in dummy_metrics.items():
    print(f"{metric}: {value}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_dummy, zero_division=0))

Most Frequent Classifier Metrics:
Accuracy: 0.0010600458501759294
Precision: 1.123697204475209e-06
Recall: 0.0010600458501759294
F1 Score: 2.2450145905501202e-06
Confusion Matrix: [[   332      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [  5466      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [  2795      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [ 19182      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [109624      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [  5973      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [    81      0      0      0      0      0      0      0      0      0
       0      0      0      0]
 [ 14880      0      0      0      0      0      0      0      0      0
       0      0      0     

In [5]:
# Simple Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_balanced, y_train_balanced)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the Decision Tree model
dt_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_dt),
    "Precision": precision_score(y_test, y_pred_dt, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred_dt, average='weighted', zero_division=0),
    "F1 Score": f1_score(y_test, y_pred_dt, average='weighted', zero_division=0),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_dt)
}

print("Decision Tree Classifier Metrics:")
for metric, value in dt_metrics.items():
    print(f"{metric}: {value}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_dt, zero_division=0))

Decision Tree Classifier Metrics:
Accuracy: 0.04342356494696578
Precision: 0.2795425592375389
Recall: 0.04342356494696578
F1 Score: 0.04839160386162438
Confusion Matrix: [[   11    33    23    19     7    44    11    27    13     2     4    47
     82     9]
 [  211  1014   305   152   430  1825    22   244   212    40    20   544
    338   109]
 [   83   359   331   186   126   351    55   237   155   101    28   311
    452    20]
 [  470  1583  1200  1584   486  3188   588  2245   704   327   114  2599
   3956   138]
 [ 2281  9765  8804  7045  4414 15720  4034  9979  4748  2429  1029 14071
  24748   557]
 [   85  1144   295   235   543  2294    37   237   168    55    13   348
    416   103]
 [    1    12     4     5     3    10     1     3     3     4     1     9
     25     0]
 [  352  1484  1108   963   537  2416   549  1393   613   378   122  1948
   2877   140]
 [   16    37    39    20    13   101     7    41    21    18     6    51
     78     6]
 [ 1195  4915  4027  3145  15