# Crime Classification
This notebook classifies crime data into distinct categories, illuminating the nature and characteristics of various offences. This classification serves as a cornerstone for deeper analysis, setting the stage for predictive modelling and trend analysis.


###  Import all necessary libraries

In [None]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import RobustScaler
import matplotlib.ticker as ticker



### Load in the cleaned crime dataset

In [None]:
# Read the cleaned crime dataset into a pandas DataFrame,
# using the first column of the file as the row index
df = pd.read_csv(filepath_or_buffer='allcleanedcrimedata', index_col=0)

In [None]:
# Confirm that the right data set is loaded.
# A bare `df.shape` is only echoed when it is the last expression of a cell;
# here `df.info()` follows it, so the shape has to be printed explicitly.
print(df.shape)
df.info()

### Drop columns not needed for the classification

In [None]:
# Remove, in place, every column that is not used as a model input
df.drop(
    columns=[
        'date_occ', 'crm_cd_desc', 'weapon_desc', 'time_occ', 'crm_cd',
        'AREA', 'rpt_dist_no', 'Mocodes', 'weapon_used_cd', 'Status',
        'LOCATION', 'cross_street', 'LAT', 'LON', 'Year', 'Month', 'Day',
        'crime_category',
    ],
    inplace=True,
)

In [None]:
# Check the remaining columns after the drop.
# `df.shape` is not the last expression of the cell, so it must be printed
# explicitly to be shown alongside the `df.info()` summary.
print(df.shape)
df.info()

### Take a random sample

In [None]:
# Take a reproducible random sample of up to 5000 rows from the data.
# `DataFrame.sample` raises a ValueError when asked for more rows than exist
# (sampling without replacement), so cap the size at the number of rows.
crime_data = df.sample(n=min(5000, len(df)), random_state=42)

### One-Hot Encoding with Pandas: Transforming Categorical Data

In this section, we are preparing our dataset for machine learning algorithms. Many machine learning models require numerical input, so categorical variables (like crime types, location categories, etc.) need to be transformed into a format that these models can understand.

To achieve this, we use Pandas' `get_dummies` function, which implements a technique known as one-hot encoding. This technique converts categorical variable(s) into a form that could be provided to ML algorithms to do a better job in prediction. 

Here's what `get_dummies` does:
- For each unique value in a categorical column, it creates a new column (a dummy variable) and assigns a binary value of 1 or 0.
- Each observation in the dataset is then represented with a 1 in the column for its corresponding value and 0 in all other new columns.

The code `features_encoded = pd.get_dummies(features)` applies this transformation to all the categorical feature columns of our `crime_data` sample; the target column `crime_type` is dropped first so that it is not encoded. As a result, we'll have a DataFrame where all categorical features are represented in a way that our machine learning models can efficiently process.

In [None]:
# Separate the predictors from the target, then one-hot encode the predictors
features = crime_data.drop(columns='crime_type')
features_encoded = pd.get_dummies(features)

# Hold out 20% of the sample for testing; the target is the 'crime_type' column
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    crime_data['crime_type'],
    test_size=0.2,
    random_state=50,
)

### Applying Robust Scaling to Features

### Why Use Robust Scaling?
In data preprocessing, scaling of features is a critical step, especially when working with algorithms sensitive to the scale of input data, like Logistic Regression, SVM, and K-Nearest Neighbors. Robust Scaling is particularly effective when the dataset contains outliers. Unlike standard scaling methods that are influenced by outliers, Robust Scaling uses statistics that are robust to outliers (the median and interquartile range) to scale the data.

### Implementation of Robust Scaling
The process involves the following steps:
-    **Instantiate the Scaler**: Create a `RobustScaler` object. This scaler removes the median and scales the data according to the Interquartile Range (IQR).
-    **Fit and Transform the Training Data**: We fit the scaler to the `X_train` data and then transform `X_train`. Fitting the scaler involves computing the median and IQR, which are then used to scale the data.
-    **Transform the Test Data**: We transform the `X_test` data using the same scaler. It's important to note that we do not fit the scaler to the test data but only transform it, so the test set is scaled with the median and IQR computed from the training data.


In [None]:
# Robust scaling: learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both splits
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


-   Initialize several machine learning classifiers from the scikit-learn library. Each of these classifiers will be used to model our crime data and determine the most effective approach for classification. The classifiers we're initializing are Random Forest, Support Vector Machine (SVM), Logistic Regression and K-Nearest Neighbors (KNN).

In [None]:
# Initialize classifiers
# The random forest is seeded so repeated runs give the same model, in line
# with the seeded sampling and train/test split elsewhere in this notebook.
rf = RandomForestClassifier(random_state=42)
# NOTE: this rebinds the name `svm`, shadowing the `sklearn.svm` module
# imported at the top of the notebook; from here on `svm` is the SVC instance.
svm = SVC()
# A higher iteration cap than the library default gives the solver more room to converge
lr = LogisticRegression(max_iter=500)
knn = KNeighborsClassifier()

In [None]:
# Each entry is (display name, estimator, hyperparameter grid to search)
classifiers = [
    ('Random Forest', rf, dict(n_estimators=[100, 200, 250])),
    ('SVM', svm, dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])),
    ('Logistic Regression', lr, dict(C=[0.1, 1, 10])),
    ('KNN', knn, dict(n_neighbors=[3, 5, 7])),
]

In [None]:
# Maps each classifier's display name to the best estimator found for it
best_estimators = {}

# Tune every candidate model in turn
for name, classifier, param_grid in classifiers:
    # Search the parameter grid with 5-fold cross-validation on the scaled training data
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        cv=5,
    ).fit(X_train_scaled, y_train)

    # Report the winning parameter combination for this classifier
    print(f'Best hyperparameters for {name}: {grid_search.best_params_}')

    # Keep the best estimator so it can be evaluated later
    best_estimators[name] = grid_search.best_estimator_

In [None]:
# Predict the test-set labels with the tuned model of each classifier,
# in the order: Random Forest, SVM, Logistic Regression, KNN
y_pred_rf, y_pred_svm, y_pred_lr, y_pred_knn = (
    best_estimators[label].predict(X_test_scaled)
    for label in ('Random Forest', 'SVM', 'Logistic Regression', 'KNN')
)


-   Create confusion matrix plot and classification report

In [None]:

# Function to plot confusion matrix
def plot_confusion_matrix(cm, title, labels=None):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix of counts to draw, e.g. the output of
        ``sklearn.metrics.confusion_matrix``.
    title : str
        Title placed above the heatmap.
    labels : sequence of str, optional
        Class names used as tick labels on both axes. When omitted,
        seaborn's default ("auto") tick labelling is used, which is the
        behaviour this function had before the parameter existed.
    """
    plt.figure(figsize=(8, 6))
    # 'auto' is seaborn's own default, so omitting `labels` changes nothing
    tick_labels = 'auto' if labels is None else list(labels)
    sns.heatmap(
        cm,
        annot=True,
        fmt='g',  # general number format for the annotated cell values
        cmap='YlGn',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(title)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

# Evaluate every tuned model on the held-out test set:
# show its confusion matrix, then print its classification report
for name, model in best_estimators.items():
    # Predictions of the tuned model on the scaled test features
    y_pred = model.predict(X_test_scaled)

    # Confusion matrix of actual vs. predicted labels, drawn as a heatmap
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    plot_confusion_matrix(cm, title=f'Confusion Matrix for {name}')

    # Header line followed by the per-class metrics
    print(
        f'--- Classification Report for {name} ---',
        classification_report(y_true=y_test, y_pred=y_pred),
        sep='\n',
    )
