# Arrests in the City of Chicago (2014 - 2023)

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",100)

from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("chicago_arrests.csv") #https://www.kaggle.com/datasets/mattop/arrests-in-the-city-of-chicago-2014-2023

### EDA

In [3]:
df

Unnamed: 0,CB_NO,CASE NUMBER,ARREST DATE,RACE,CHARGE 1 STATUTE,CHARGE 1 DESCRIPTION,CHARGE 1 TYPE,CHARGE 1 CLASS,CHARGE 2 STATUTE,CHARGE 2 DESCRIPTION,CHARGE 2 TYPE,CHARGE 2 CLASS,CHARGE 3 STATUTE,CHARGE 3 DESCRIPTION,CHARGE 3 TYPE,CHARGE 3 CLASS,CHARGE 4 STATUTE,CHARGE 4 DESCRIPTION,CHARGE 4 TYPE,CHARGE 4 CLASS,CHARGES STATUTE,CHARGES DESCRIPTION,CHARGES TYPE,CHARGES CLASS
0,18922700,,06/26/2014 10:51:00 PM,BLACK,725 ILCS 5.0/110-3,ISSUANCE OF WARRANT,,Z,,,,,,,,,,,,,725 ILCS 5.0/110-3 | | |,ISSUANCE OF WARRANT | | |,| | |,Z | | |
1,18922086,,06/26/2014 12:07:00 AM,BLACK,625 ILCS 5.0/6-303-A,DRIVING ON SUSPENDED LICENSE,M,A,9-76-160(A),DISPLAY ST REG-FRONT/REAR,,L,,,,,,,,,625 ILCS 5.0/6-303-A | 9-76-160(A) | |,DRIVING ON SUSPENDED LICENSE | DISPLAY ST REG-...,M | | |,A | L | |
2,18908465,HX290868,06/05/2014 11:58:00 AM,BLACK,720 ILCS 570.0/402-C,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)(D),F,4,720 ILCS 600.0/3.5-A,PCS - DRUG PARAPHERNALIA - POSSESS,M,A,,,,,,,,,720 ILCS 570.0/402-C | 720 ILCS 600.0/3.5-A | |,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)...,F | M | |,4 | A | |
3,18914257,HX302763,06/14/2014 04:50:00 AM,BLACK,720 ILCS 5.0/12-3-A-1,BATTERY - CAUSE BODILY HARM,M,A,520 ILCS 5.0/1.22,RESIST/OBSTRUCT OFFICER,M,A,720 ILCS 5.0/12-3-A-1,BATTERY - CAUSE BODILY HARM,M,A,,,,,720 ILCS 5.0/12-3-A-1 | 520 ILCS 5.0/1.22 | 72...,BATTERY - CAUSE BODILY HARM | RESIST/OBSTRUCT ...,M | M | M |,A | A | A |
4,18911035,,06/09/2014 08:45:00 AM,BLACK,8-4-030,DRINKING ALCOHOL ON THE PUBLIC WAY,,L,,,,,,,,,,,,,8-4-030 | | |,DRINKING ALCOHOL ON THE PUBLIC WAY | | |,| | |,L | | |
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585413,30209325,JG117312,01/15/2023 04:02:00 PM,BLACK,720 ILCS 5.0/24-1.6-A-1,AGG UUW/VEHICLE/LOADED/NO FCCA,F,4,,,,,,,,,,,,,720 ILCS 5.0/24-1.6-A-1 | | |,AGG UUW/VEHICLE/LOADED/NO FCCA | | |,F | | |,4 | | |
585414,30209368,JG117594,01/15/2023 07:41:00 PM,BLACK,720 ILCS 570.0/402-C,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)(D),F,4,625 ILCS 5.0/11-501-A-4,IVC - DRIVING UNDER INFLUENCE OF DRUG,M,A,625 ILCS 5.0/11-502.15-B,UNLWFL POSS CANNABIS/DRIVER,M,A,625 ILCS 5.0/3-702-B,USE CANC/REVOKE REGISTRATION,M,A,720 ILCS 570.0/402-C | 625 ILCS 5.0/11-501-A-4...,PCS - POSSESSION - POSS AMT CON SUB EXCEPT (A)...,F | M | M | M,4 | A | A | A
585415,30209388,JG117599,01/15/2023 09:27:00 PM,WHITE HISPANIC,720 ILCS 550.0/4-D,POSS CAN/>100-500 GRAM/1ST,F,4,625 ILCS 5.0/3-701-1,NO VALID REGISTRATION,,P,,,,,,,,,720 ILCS 550.0/4-D | 625 ILCS 5.0/3-701-1 | |,POSS CAN/>100-500 GRAM/1ST | NO VALID REGISTRA...,F | | |,4 | P | |
585416,30209311,,01/15/2023 03:16:00 PM,WHITE HISPANIC,625 ILCS 5.0/6-303-A,DRIVING ON SUSPENDED LICENSE,M,A,9-76-160(F),REGISTRATION PLATES,,L,,,,,,,,,625 ILCS 5.0/6-303-A | 9-76-160(F) | |,DRIVING ON SUSPENDED LICENSE | REGISTRATION PL...,M | | |,A | L | |


In [4]:
df.isnull().sum() #We examine the empty lines.

CB_NO                        0
CASE NUMBER             227460
ARREST DATE                  0
RACE                         0
CHARGE 1 STATUTE             0
CHARGE 1 DESCRIPTION         0
CHARGE 1 TYPE           126509
CHARGE 1 CLASS            4716
CHARGE 2 STATUTE        337455
CHARGE 2 DESCRIPTION    337455
CHARGE 2 TYPE           444510
CHARGE 2 CLASS          339657
CHARGE 3 STATUTE        449367
CHARGE 3 DESCRIPTION    449367
CHARGE 3 TYPE           538332
CHARGE 3 CLASS          450006
CHARGE 4 STATUTE        510008
CHARGE 4 DESCRIPTION    510008
CHARGE 4 TYPE           566328
CHARGE 4 CLASS          510211
CHARGES STATUTE              0
CHARGES DESCRIPTION          0
CHARGES TYPE                 0
CHARGES CLASS                0
dtype: int64

### Feature Engineering

In [5]:
df["CHARGES STATUTE"].value_counts()

725 ILCS 5.0/110-3 |  |  |                                                                          69175
720 ILCS 570.0/402-C |  |  |                                                                        22075
720 ILCS 5.0/16-25-A-1 |  |  |                                                                      20503
720 ILCS 5.0/12-3.2-A-1 |  |  |                                                                     17854
720 ILCS 5.0/12-3.2-A-2 |  |  |                                                                     15162
                                                                                                    ...  
720 ILCS 5.0/16-1-A-1 | 720 ILCS 5.0/21-1-A-1 | 720 ILCS 5.0/32-10-A |                                  1
625 ILCS 5.0/11-501-A-2 | 625 ILCS 5.0/3-707 | 9-16-020(C) | 625 ILCS 5.0/6-101                         1
720 ILCS 5.0/24-1.1-A | 720 ILCS 550.0/4-B | 9-40-220 | 625 ILCS 5.0/6-101                              1
720 ILCS 5.0/16-25-A-4 | 1-4-145-A |  |       

In [6]:
df=df.drop(["CHARGES STATUTE","CHARGES DESCRIPTION","CHARGES TYPE","CHARGES CLASS","CASE NUMBER"],axis=1)
#We delete some columns.

In [7]:
df["CHARGE 2 STATUTE"].fillna("None",inplace=True)
df["CHARGE 2 DESCRIPTION"].fillna("None",inplace=True)
df["CHARGE 2 TYPE"].fillna("None",inplace=True)
df["CHARGE 2 CLASS"].fillna("None",inplace=True)
df["CHARGE 3 STATUTE"].fillna("None",inplace=True)
df["CHARGE 3 DESCRIPTION"].fillna("None",inplace=True)
df["CHARGE 3 TYPE"].fillna("None",inplace=True)
df["CHARGE 3 CLASS"].fillna("None",inplace=True)
df["CHARGE 4 STATUTE"].fillna("None",inplace=True)
df["CHARGE 4 DESCRIPTION"].fillna("None",inplace=True)
df["CHARGE 4 TYPE"].fillna("None",inplace=True)
df["CHARGE 4 CLASS"].fillna("None",inplace=True)
#We change empty data to None.

In [8]:
df=df.dropna() #We delete rows containing empty data.

In [9]:
df["ARREST DATE"]=pd.to_datetime(df["ARREST DATE"])
df["day"]=(df["ARREST DATE"]).dt.day
df["month"]=(df["ARREST DATE"]).dt.month
df["year"]=(df["ARREST DATE"]).dt.year
df["time"]=(df["ARREST DATE"]).dt.time
del df["ARREST DATE"]
#We divide ARREST DATE into 4 as day, month, year and hour.

In [10]:
df["RACE"].value_counts()

BLACK                           327000
WHITE HISPANIC                   84054
WHITE                            40413
BLACK HISPANIC                    3193
ASIAN / PACIFIC ISLANDER          3048
UNKNOWN / REFUSED                  757
AMER INDIAN / ALASKAN NATIVE       280
Name: RACE, dtype: int64

In [11]:
df=df[df['RACE']!="UNKNOWN / REFUSED"] #We delete UNKNOWN / REFUSED data in RACE.
df["RACE"]=df["RACE"].map({"BLACK":0,"BLACK HISPANIC":0,"WHITE HISPANIC":1,"WHITE":1,"ASIAN / PACIFIC ISLANDER":2,"AMER INDIAN / ALASKAN NATIVE":2})
#We change the data in RACE to 0,1 and 2.
df['RACE']=df['RACE'].astype(int) #We change the type of data in Race to integer.

In [12]:
df=df[df['year']>2021] #We delete the data in year that is less than 2022.

In [13]:
abs(df.corr()['RACE']).sort_values(ascending=False)

RACE     1.000000
month    0.010804
CB_NO    0.009366
year     0.001662
day      0.000666
Name: RACE, dtype: float64

### Classification

In [19]:
x,y=df.drop(["CB_NO","RACE","year","day"],axis=1),df[["RACE"]]
x=pd.get_dummies(x,drop_first=True)
smote = SMOTE(sampling_strategy='minority')
x_sm, y_sm = smote.fit_resample(x, y)
x_sm=scaler.fit_transform(x_sm)
x.shape

(30421, 6092)

In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

In [21]:
model=Sequential()
model.add(Dense(16,activation="relu"))
model.add(Dense(32,activation="relu"))
model.add(Dense(64,activation="relu"))
model.add(Dense(8,activation="relu"))
model.add(Dense(1,activation="sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics="accuracy")

In [22]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=10,epochs=50,verbose=1)
model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 16)                97488     
                                                                 
 dense_6 (Dense)             (None, 32)                544       
                                                         

 dense_8 (Dense)             (None, 8)                 520       
                                                                 
 dense_9 (Dense)             (None, 1)                 9         
                                                                 
Total params: 100,673
Trainable params: 100,673
Non-trainable params: 0
_________________________________________________________________


In [23]:
scores=model.evaluate(x_test,y_test)

