In [1]:
# Set up for running the code
import pandas
import math
import numpy
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from yellowbrick.classifier import ClassificationReport,ConfusionMatrix
from sklearn.naive_bayes import GaussianNB


# Load the crime records from the CSV export.
# TIME OCC is read as text rather than as a number
# (presumably so values such as "0030" keep their leading zeros -- confirm against the file).
dataframe = pandas.read_csv(
    "Crime_Data_from_2020_to_Present.csv",
    dtype={'TIME OCC': str},
)


In [2]:
# DATE OCC is split on whitespace and only the first token (the date part) is kept
dateTokens = dataframe['DATE OCC'].str.split()
dataframe['DATE OCC'] = dateTokens.str[0]

In [3]:
# Turn the crime code into a binary label: 1 = egregious (violent) crime, 0 = anything else.
#
# NOTE: this cell overwrites "Crm Cd" with the 0/1 label. Running it a second time
# without reloading the data would label every row 0, because neither 0 nor 1 is
# one of the crime codes listed below.

# Drop records that have no crime code. dropna returns a new frame, so the result
# has to be assigned (the bare call left the frame untouched). The copy keeps the
# column assignment below from operating on a slice of the original frame.
dataframe = dataframe.dropna(subset=['Crm Cd']).copy()

# Crime codes of violent crimes
agregousCrimeCDs = [
    231,
    230,
    624,
    622,
    623,
    860,
    110,
    753,
    822,
    921,
    882,
    910,
    920,
    113,
    625,
    122,
    121,
    251,
    250,
]

# One vectorised membership test instead of one full-column replace per unique code:
# True/False is cast to 1/0.
col = dataframe["Crm Cd"].isin(agregousCrimeCDs).astype(int)

dataframe["Crm Cd"] = col

In [4]:
# Parse DATE OCC (month/day/year text) once, then store each component
# of the parsed date in its own column
occurrenceDate = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y')
for part in ('day', 'month', 'year'):
    dataframe[part] = getattr(occurrenceDate.dt, part)

In [5]:
# Parse TIME OCC (HHMM text) once, then store the hour and the minute
# in separate columns
occurrenceTime = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M')
dataframe['hour'] = occurrenceTime.dt.hour
dataframe['minute'] = occurrenceTime.dt.minute

In [6]:
# Encode victim sex as a number:
#   "M" -> 1, "F" -> 2, "X" -> 3
#   "-", "H" and missing values (NaN) -> 0
# Any other value is left as it is.
col = dataframe['Vict Sex']

# The three recognised codes are translated in a single pass
col = col.replace(["M", "F", "X"], [1, 2, 3])
# Placeholder / unexpected codes and missing values all collapse to 0
col = col.replace(["-", "H", math.nan], 0)

dataframe['Vict Sex'] = col

In [7]:
# Encode victim descent as a number
# TODO: drop the entries with nan, "-", etc. instead of folding them into 0
#       (the original note said "columns"; presumably rows were meant -- confirm)

# Descent letter code -> numeric id (0 = unknown / not recorded)
descentCode = {
    "A" : 1,    # Other Asian
    "B" : 2,    # Black
    "C" : 3,    # Chinese
    "D" : 4,    # Cambodian
    "F" : 5,    # Filipino
    "G" : 6,    # Guamanian
    "H" : 7,    # Hispanic/Latin/Mexican
    "I" : 8,    # American Indian/Alaskan Native
    "J" : 9,    # Japanese
    "K" : 10,   # Korean
    "L" : 11,   # Laotian
    "O" : 12,   # Other
    "P" : 13,   # Pacific Islander
    "S" : 14,   # Samoan
    "U" : 15,   # Hawaiian
    "V" : 16,   # Vietnamese
    "W" : 17,   # White
    "X" : 0,    # Unknown
    "Z" : 18,   # Asian Indian
    "nan" : 0,  # the literal text "nan"; real missing values (NaN) are handled below
    "-" : 0     # not recorded
}

# Replace demographic information with the numerical key described above.
# The whole table is applied in one pass instead of one replace call per code.
col = dataframe['Vict Descent'].replace(descentCode)

# A missing value (NaN) is not a dictionary key, so it is mapped to 0 separately
col = col.replace([math.nan], 0)

dataframe['Vict Descent'] = col

In [8]:
# Split the records into a training set (80%) and a testing set (the remaining 20%).
# A fixed seed makes the random sample -- and every score computed from it --
# repeatable across kernel restarts.
SPLIT_SEED = 42

trainingSet = dataframe.sample(frac=0.8, random_state=SPLIT_SEED)
# Every row whose index label was not drawn for training goes to the testing set
testingSet = dataframe[~dataframe.index.isin(trainingSet.index)]

# Attributes used for training:
#   AREA (area code for a given location where the crime occurred)
#   hour, day, year (times at which the crime occurred)
#   Vict Sex, Vict Descent (demographic information about the victim)
# The list is defined once so the train and test matrices cannot drift apart.
featureColumns = ['AREA', 'hour', 'Vict Sex', 'day', 'year', 'Vict Descent']
# Binary label column: 1 = egregious crime, 0 = other
labelColumn = 'Crm Cd'

XTrain = trainingSet[featureColumns]
yTrain = trainingSet[labelColumn]

XTest = testingSet[featureColumns]
yTest = testingSet[labelColumn]



In [9]:
# Create the model (RFC).
# The forest is built with randomness, so it is seeded to make the fitted
# model and its score repeatable.
rfcModel = ensemble.RandomForestClassifier(random_state=42)

# Train the model
rfcModel.fit(XTrain, yTrain)

# Make prediction off of test set
rfcTestPrediction = rfcModel.predict(XTest)

# Calculate accuracy of the test prediction.
# accuracy_score expects the true labels first and the predictions second.
RFCAccuracy = metrics.accuracy_score(yTest, rfcTestPrediction)
print("Random Forest Classifier Accuracy: ", RFCAccuracy)

Random Forest Classifier Accuracy:  0.7966199849306623


In [10]:
# Create the model (Naive Bayes)
# NOTE(review): GaussianNB treats every feature as a continuous value, while
# AREA, Vict Sex and Vict Descent are category codes -- confirm this model is intended.
nbModel = GaussianNB()

# Train model
nbModel.fit(XTrain, yTrain)

# Predict the test set
NByPred = nbModel.predict(XTest)

# Calculate model accuracy.
# accuracy_score expects the true labels first and the predictions second.
nbAccuracy = metrics.accuracy_score(yTest, NByPred)
print("Naive Bays Accuracy: ", nbAccuracy)

Naive Bays Accuracy:  0.8380046204332746
