<a href="https://colab.research.google.com/github/orifelszer/CrimeData/blob/oriana-branch/DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Install imbalanced-learn before anything tries to import imblearn
!pip install imbalanced-learn

# Import necessary libraries
import zipfile
from io import BytesIO

import pandas as pd
import requests
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier





In [6]:
# Root of the raw-file endpoint on GitHub; archive names are appended to it
base_url = "https://github.com/orifelszer/CrimeData/raw/main/"

# Map each dataset split to its zip archive, named "<split>_supervised.zip"
files = {
    split: f"{split}_supervised.zip"
    for split in ("X_train", "X_test", "y_train", "y_test")
}



In [7]:
def load_data_from_repo(file_name, timeout=60):
    """Download a zip archive from the repository and load its CSV as a DataFrame.

    Parameters
    ----------
    file_name : str
        Archive name; it is appended to the module-level ``base_url``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Seconds to wait for the server before
        giving up, or a ``(connect, read)`` tuple. Defaults to 60.

    Returns
    -------
    pandas.DataFrame
        The first regular file in the archive, parsed with ``pd.read_csv``.

    Raises
    ------
    RuntimeError
        If the server answers with anything other than HTTP 200.
    ValueError
        If the archive contains no file to read.
    """
    url = base_url + file_name
    # requests never times out on its own; a stalled connection would
    # otherwise block the notebook forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {file_name} (HTTP {response.status_code} from {url})"
        )

    with zipfile.ZipFile(BytesIO(response.content)) as z:
        # Each archive is expected to hold a single data file. Directory entries
        # and macOS "__MACOSX/" metadata can precede it in the listing, so they
        # are skipped instead of blindly taking namelist()[0].
        members = [
            info.filename
            for info in z.infolist()
            if not info.is_dir() and not info.filename.startswith("__MACOSX/")
        ]
        if not members:
            raise ValueError(f"{file_name} does not contain a data file")
        with z.open(members[0]) as f:
            return pd.read_csv(f)



In [8]:
# Fetch the feature tables as DataFrames (train first, then test)
X_train, X_test = (
    load_data_from_repo(files[split]) for split in ("X_train", "X_test")
)

# Fetch the label tables and flatten each one to a 1-D array
y_train, y_test = (
    load_data_from_repo(files[split]).values.ravel() for split in ("y_train", "y_test")
)




In [12]:
# Decision tree with class weights set inversely proportional to class frequency
base_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Oversample the training set with SMOTE (seeded for repeatability)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X=X_train, y=y_train)



In [17]:
# Define hyperparameter grid for tuning. Keys carry the "tree__" prefix because
# the classifier is the step named "tree" in the pipeline built below.
param_grid = {
    'tree__max_depth': [10, 25],
    'tree__min_samples_split': [5, 10],
    'tree__min_samples_leaf': [2, 4]}

# Resample inside the cross-validation loop rather than before it. Running the
# search on data that was oversampled up front lets synthetic points that were
# interpolated from a validation fold's rows end up in the training folds,
# which inflates the CV score. An imblearn Pipeline applies SMOTE only when
# fitting (i.e. to the training folds) and skips it at predict time, so each
# validation fold is scored on real rows with the original class distribution.
# GridSearchCV clones both steps, so `smote` and `base_model` are not mutated.
resampled_tree = Pipeline(steps=[
    ('smote', smote),
    ('tree', base_model),
])

# Perform Grid Search with 5-fold Cross Validation.
# The search takes the original X_train / y_train, not the pre-resampled arrays.
# Every CV fit now runs its own SMOTE pass; lower n_jobs if workers run out of memory.
grid_search = GridSearchCV(estimator=resampled_tree, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Best hyperparameters found by the search and the estimator refit with them
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Precision is undefined for a class that is never predicted. zero_division=0
# reports it as 0.0 (the same value the default yields) without emitting an
# UndefinedMetricWarning for each affected metric.
classification_report_output = classification_report(y_test, y_pred, zero_division=0)

# Print results
print("Best Parameters:", best_params)
print(f"Accuracy: {accuracy * 100:.2f}%\n")
print("Classification Report:\n", classification_report_output)

In [5]:
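# # Disabled baseline, kept for reference: an untuned Decision Tree fit on the
# # original (not resampled) training data. The saved output under this cell
# # appears to come from this run -- TODO confirm before relying on the numbers.
# # Initialize and train the Decision Tree Classifier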
# decision_tree = DecisionTreeClassifier(random_state=42)
# decision_tree.fit(X_train, y_train)

# # Predict on the test set
# y_pred = decision_tree.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# classification_report_output = classification_report(y_test, y_pred)
# confusion_matrix_output = confusion_matrix(y_test, y_pred)

# # Print the evaluation results
# print(f"Accuracy: {accuracy * 100:.2f}%\n")
# print("Classification Report:\n", classification_report_output)

Accuracy: 32.46%

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       366
           1       0.11      0.10      0.11      7408
           2       0.01      0.00      0.00      3288
           3       0.08      0.02      0.03     23317
           4       0.37      0.52      0.43    126443
           5       0.06      0.16      0.09      6912
           6       0.00      0.00      0.00       126
           7       0.04      0.01      0.01     17566
           8       0.00      0.00      0.00       625
           9       0.16      0.06      0.09     59252
          10       0.36      0.39      0.37    128157
          11       0.00      0.00      0.00      1377
          12       0.00      0.00      0.00      2074
          13       0.00      0.00      0.00        60

    accuracy                           0.32    376971
   macro avg       0.08      0.09      0.08    376971
weighted avg       0.28      0.32     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
