In [1]:
import pandas as pd
import os
import glob
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

In [2]:
# Read every CSV found under "crime data/<folder>/".  glob returns paths in
# arbitrary order, so sort them to make the combined frame reproducible.
csv_paths = sorted(glob.glob("crime data/*/*.csv"))
if not csv_paths:
    raise FileNotFoundError('No CSV files matched "crime data/*/*.csv"')

# Combine the per-file frames.  ignore_index gives every row a unique label
# instead of repeating the row labels of each individual file.
crime_raw = pd.concat(
    (pd.read_csv(path) for path in csv_paths), axis=0, ignore_index=True
)

# Outcomes that count as "prosecuted" when building the binary target.
PROSECUTED_OUTCOMES = [
    "Offender given community sentence",
    "Offender sent to prison",
    "Offender given suspended prison sentence",
    "Offender ordered to pay compensation",
    "Offender fined",
    "Offender given conditional discharge",
    "Offender deprived of property",
    "Offender given absolute discharge",
]

# Columns that are not used as model features.
UNUSED_COLUMNS = ["Crime ID", "Context", "Reported by", "Falls within", "Month", "Last outcome category"]

crime_model = (
    crime_raw
    # Keep only the rows that have a recorded outcome.
    .loc[lambda frame: frame['Last outcome category'].notna()]
    # True when the last outcome is one of the prosecuted outcomes, else False.
    .assign(prosecuted=lambda frame: frame['Last outcome category'].isin(PROSECUTED_OUTCOMES))
    .drop(columns=UNUSED_COLUMNS)
    # Drop every remaining row that still contains a missing value.
    .dropna()
)

# Target values.
y = crime_model["prosecuted"]

# Feature frame: everything except the target.
df_all = crime_model.drop(columns=["prosecuted"])

In [3]:
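# Disabled alternative: fill missing coordinates with a sentinel value instead of dropping those rows (rows containing NaN are currently dropped when the data is loaded).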
# df_all[["Longitude", "Latitude"]] = df_all[["Longitude", "Latitude"]].fillna(-9999)

In [4]:
def create_column_transformer(df):
    """Build and fit a ColumnTransformer that encodes every column of ``df``.

    Columns whose dtype is float64 or int64 are standardised with
    ``StandardScaler``; every other column is one-hot encoded with
    ``OneHotEncoder(handle_unknown='ignore')``.  Each transformer is bound to
    its column by integer position and named after the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame used both to choose the encoder for each column and to
        fit the transformer.

    Returns
    -------
    ColumnTransformer
        The transformer, already fitted on ``df``.
    """
    transformers = []
    for position, name in enumerate(df.columns.values):
        # Inspect the frame that was passed in, not the notebook-level
        # ``df_all`` global, so the function is correct for any DataFrame.
        dtype = df[name].dtype
        if dtype == np.float64 or dtype == np.int64:
            encoder = StandardScaler()
        else:
            encoder = OneHotEncoder(handle_unknown='ignore')
        transformers.append((name, encoder, [position]))

    ct = ColumnTransformer(transformers, remainder='passthrough')
    ct.fit(df)
    return ct

In [5]:
# Create the column transformer object.
# NOTE(review): create_column_transformer() calls fit() before returning, and
# this runs before the train/test split, so the object returned here has been
# fitted on every row of df_all (future test rows included).
ct = create_column_transformer(df_all)

In [6]:
# One fixed seed for every stochastic step so the split, the oversampling and
# the forest give the same result on each run.
RANDOM_STATE = 42

# Split the data into train and test sets.  The positive class is rare, so
# stratify on the target to keep its proportion equal in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    df_all, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

# Pipeline used in the grid search: oversample, then encode the columns, then
# fit the random forest.
pipe = Pipeline([
    ('sampling', RandomOverSampler(random_state=RANDOM_STATE)),
    ('transform', ct),
    ('rfc', RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
])

In [7]:
# Hyper-parameter grid for the search; the "rfc__" prefix routes each entry to
# the pipeline step named 'rfc'.
param_grid = dict(
    rfc__max_depth=list(range(6, 14)),             # 6, 7, ..., 13
    rfc__n_estimators=list(range(100, 700, 100)),  # 100, 200, ..., 600
)

In [8]:
# Create the grid search object with 5-fold cross-validation.  cv is passed
# explicitly so the number of folds does not depend on the library default;
# presumably the folds are stratified for this classifier (see the
# scikit-learn documentation of the `cv` parameter to confirm).
# NOTE(review): 'f1_weighted' weights each class by its support, so with a
# heavily imbalanced target the score is dominated by the majority class.
# Confirm this is the metric the model should be selected on.
clf = GridSearchCV(pipe, param_grid, cv=5, verbose=5, scoring='f1_weighted')

In [9]:
# Run the grid search on the training set.  This is the expensive step: the
# captured log reports 240 fits taking about 36.9 minutes.
# Assigning the result to `_` keeps the fitted estimator's repr out of the
# cell output.
_ = clf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] rfc__max_depth=6, rfc__n_estimators=100 .........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rfc__max_depth=6, rfc__n_estimators=100, score=0.887, total=   3.9s
[CV] rfc__max_depth=6, rfc__n_estimators=100 .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s


[CV]  rfc__max_depth=6, rfc__n_estimators=100, score=0.893, total=   2.1s
[CV] rfc__max_depth=6, rfc__n_estimators=100 .........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV]  rfc__max_depth=6, rfc__n_estimators=100, score=0.891, total=   2.3s
[CV] rfc__max_depth=6, rfc__n_estimators=100 .........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.2s remaining:    0.0s


[CV]  rfc__max_depth=6, rfc__n_estimators=100, score=0.895, total=   2.1s
[CV] rfc__max_depth=6, rfc__n_estimators=100 .........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.3s remaining:    0.0s


[CV]  rfc__max_depth=6, rfc__n_estimators=100, score=0.889, total=   2.1s
[CV] rfc__max_depth=6, rfc__n_estimators=200 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=200, score=0.888, total=   3.6s
[CV] rfc__max_depth=6, rfc__n_estimators=200 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=200, score=0.891, total=   3.5s
[CV] rfc__max_depth=6, rfc__n_estimators=200 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=200, score=0.891, total=   3.6s
[CV] rfc__max_depth=6, rfc__n_estimators=200 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=200, score=0.894, total=   3.6s
[CV] rfc__max_depth=6, rfc__n_estimators=200 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=200, score=0.886, total=   3.5s
[CV] rfc__max_depth=6, rfc__n_estimators=300 .........................
[CV]  rfc__max_depth=6, rfc__n_estimators=300, score=0.894, total=   4.9s
[CV] rfc__max_depth=6, rfc__n_estimators=300 ...........

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 36.9min finished


In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [11]:
# Get the predictions for the test set.
prediction = clf.predict(X_test)

In [12]:
# Report the per-class precision, recall and F1 score of the test-set predictions.
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

       False       0.99      0.85      0.92     71838
        True       0.04      0.59      0.08       834

    accuracy                           0.84     72672
   macro avg       0.52      0.72      0.50     72672
weighted avg       0.98      0.84      0.91     72672



In [13]:
# Create a confusion matrix for the model to see the number of correct and
# incorrect answers.  Rows are the true classes and columns the predicted
# classes (False first, then True); each row of the output sums to that
# class's support in the classification report.
confusion_matrix(y_test, prediction)

array([[60908, 10930],
       [  339,   495]], dtype=int64)