In [1]:
# Set up for running the code
import pandas
import math
import numpy
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from yellowbrick.classifier import ClassificationReport,ConfusionMatrix


# Load the crime records from the CSV export.
# TIME OCC is forced to text because it is parsed with the '%H%M' format
# further down; an integer dtype would strip any leading zeros.
dataframe = pandas.read_csv(
    "Crime_Data_from_2020_to_Present.csv",
    dtype={"TIME OCC": str},
)


In [4]:
# Show the dtype pandas assigned to every column of the loaded frame.
print(dataframe.dtypes)

DR_NO               int64
Date Rptd          object
DATE OCC           object
TIME OCC           object
AREA                int64
AREA NAME          object
Rpt Dist No         int64
Part 1-2            int64
Crm Cd              int64
Crm Cd Desc        object
Mocodes            object
Vict Age            int64
Vict Sex           object
Vict Descent       object
Premis Cd         float64
Premis Desc        object
Weapon Used Cd    float64
Weapon Desc        object
Status             object
Status Desc        object
Crm Cd 1          float64
Crm Cd 2          float64
Crm Cd 3          float64
Crm Cd 4          float64
LOCATION           object
Cross Street       object
LAT               float64
LON               float64
dtype: object


In [2]:
# Keep just the first whitespace-separated token of DATE OCC (the date part)
date_tokens = dataframe['DATE OCC'].str.split()
dataframe['DATE OCC'] = date_tokens.str[0]

In [3]:
# Turn "Crm Cd" into a binary label: 1 = egregious crime, 0 = everything else.
#
# NOTE: this cell overwrites "Crm Cd" in place. Running it a second time on the
# already-binarised column maps every row to 0 (neither 0 nor 1 is in the list),
# so re-run the load cell first if this cell has to be executed again.

# dropna returns a new frame; the result must be assigned back to take effect.
dataframe = dataframe.dropna(subset=['Crm Cd'])

# Crime codes that are treated as egregious.
# (Identifier spelling kept unchanged in case other cells reference it.)
agregousCrimeCDs = [
    231,
    230,
    624,
    622,
    623,
    860,
    110,
    753,
    822,
    921,
    882,
    910,
    920,
    113,
    625,
    122,
    121,
    251,
    250,
]

# One vectorised membership test instead of a full-column replace() per
# distinct crime code.
col = dataframe["Crm Cd"].isin(agregousCrimeCDs).astype("int64")

dataframe["Crm Cd"] = col

In [9]:
# Derive day / month / year columns from DATE OCC.
# The column is parsed once and reused for all three parts. The format has no
# time component, so DATE OCC must already contain only the date portion.
dateOcc = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y')
dataframe['day'] = dateOcc.dt.day
dataframe['month'] = dateOcc.dt.month
dataframe['year'] = dateOcc.dt.year

In [10]:
# Derive hour / minute columns from TIME OCC (text in HHMM form).
# The column is parsed once and reused for both parts.
timeOcc = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M')
dataframe['hour'] = timeOcc.dt.hour
dataframe['minute'] = timeOcc.dt.minute

In [11]:
# Encode victim sex as integers:
#   1 = "M" (male), 2 = "F" (female), 3 = "X",
#   0 = everything else ("-", "H", missing, or any other unexpected value)
sexCode = {"M": 1, "F": 2, "X": 3}

col = dataframe['Vict Sex'].replace(sexCode)

# Do not rely on replace() inferring a numeric dtype: coerce explicitly.
# Whatever is still non-numeric (e.g. "-", "H", NaN) becomes NaN and then 0,
# so the column is guaranteed to be int64 and safe to feed to a classifier.
col = pandas.to_numeric(col, errors='coerce').fillna(0).astype('int64')

dataframe['Vict Sex'] = col

In [12]:
# Convert the victim descent letter codes into integer ids.
# TODO: drop the entries that are nan, -, etc. instead of folding them into 0
#
# Descent code -> id:
#   A - Other Asian - 1
#   B - Black - 2
#   C - Chinese - 3
#   D - Cambodian - 4
#   F - Filipino - 5
#   G - Guamanian - 6
#   H - Hispanic/Latin/Mexican - 7
#   I - American Indian/Alaskan Native - 8
#   J - Japanese - 9
#   K - Korean - 10
#   L - Laotian - 11
#   O - Other - 12
#   P - Pacific Islander - 13
#   S - Samoan - 14
#   U - Hawaiian - 15
#   V - Vietnamese - 16
#   W - White - 17
#   X - Unknown - 0
#   Z - Asian Indian - 18
#   nan - 0
#   -   - 0

descentCode = {
    "A" : 1,
    "B" : 2,
    "C" : 3,
    "D" : 4,
    "F" : 5,
    "G" : 6,
    "H" : 7,
    "I" : 8,
    "J" : 9,
    "K" : 10,
    "L" : 11,
    "O" : 12,
    "P" : 13,
    "S" : 14,
    "U" : 15,
    "V" : 16,
    "W" : 17,
    "X" : 0,
    "Z" : 18,
    "nan" : 0,
    "-" : 0
}

# Single dictionary-driven replace instead of one replace() pass per code.
# (The old loop also bound the name `id`, shadowing the builtin for the rest
# of the kernel session.)
col = dataframe['Vict Descent'].replace(descentCode)

# Missing values (float NaN, which the "nan" string key does not match) and any
# letter not listed above are coerced to NaN and then mapped to 0 (unknown).
# The explicit cast guarantees an int64 column regardless of how replace()
# infers the dtype.
col = pandas.to_numeric(col, errors='coerce').fillna(0).astype('int64')

dataframe['Vict Descent'] = col

In [18]:
# Split the rows into 80% training / 20% testing.
# A fixed seed keeps the split, and every metric computed from it, reproducible
# across kernel restarts.
SPLIT_SEED = 42
trainingSet = dataframe.sample(frac=0.8, random_state=SPLIT_SEED)
# Every row that was not sampled for training goes to the test set.
testingSet = dataframe.drop(trainingSet.index)

# Feature columns fed to the model, defined once so train and test cannot drift
# apart. Earlier commented-out variants also used 'DATE OCC', 'minute' and
# 'Vict Descent'.
featureColumns = ['AREA', 'hour', 'Vict Sex', 'day', 'year']

XTrain = trainingSet[featureColumns]
yTrain = trainingSet['Crm Cd']

XTest = testingSet[featureColumns]
yTest = testingSet['Crm Cd']



In [19]:
# Create and fit the random forest model.
# random_state is fixed so the fitted forest, and the accuracy reported from
# it, does not change from run to run.
rfcModel = ensemble.RandomForestClassifier(random_state=42)
rfcModel.fit(XTrain, yTrain)

In [20]:
# Predict the binary crime label for every held-out test row
rfcTestPrediction = rfcModel.predict(X=XTest)

In [22]:
# Share of test rows whose predicted label equals the true label
RFCAccuracy = metrics.accuracy_score(
    y_true=yTest,
    y_pred=rfcTestPrediction,
)
print(RFCAccuracy)

0.5016384697454657
