In [1]:
# Set up for running the code
import pandas
import math
import numpy
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from yellowbrick.classifier import ClassificationReport,ConfusionMatrix


# Read in the data from the CSV file
dataframe = pandas.read_csv("Crime_Data_from_2020_to_Present.csv", dtype={'TIME OCC': str})


In [2]:
# Get only the date in the DATE OCC column
dataframe['DATE OCC'] = dataframe['DATE OCC'].str.split().str.get(0)

In [8]:
# Trim down the data to only have agregous crimes
nonAgregousCrimeCDs = [
    648,
    220,
    840,
    948,
    485,
    480,
    432,
    487,
    755,
    761,
    942,
    666,
    662,
    664,
    310,
    330,
    410,
    320,
    870,
    235,
    627,
    813,
    237,
    814,
    922,
    944,
    903,
    954,
    660,
    654,
    653,
    930,
    812,
    943,
    951,
    950,
    345,
    444,
    445,
    880,
    886,
    649,
    652,
    651,
    433,
    865,
    353,
    668,
    670,
    940,
    884,
    890,
    434,
    439,
    904,
    906,
    349,
    347,
    949,
    830,
    850,
    236,
    626,
    956,
    762,
    760,
    435,
    436,
    820,
    946,
    806,
    932,
    446,
    352,
    452,
    805,
    933,
    351,
    451,
    438,
    931,
    437,
    210,
    845,
    810,
    815,
    443,
    442,
    343,
    821,
    763,
    924,
    421,
    331,
    420,
    450,
    354,
    441,
    440,
    475,
    473,
    474,
    350,
    341,
    928,
    647,
    470,
    471,
    888,
    661,
    740,
    745,
    520,
    510,
    522,
    900,
    901,
    902,
    756
]

for cd in nonAgregousCrimeCDs:
    dataframe = dataframe[dataframe["Crm Cd 1"] != cd]

dataframe.dropna(subset=['Crm Cd 1'])

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Crm Cd 4,LOCATION,Cross Street,LAT,LON,day,month,year,hour,minute
0,10304468,01/08/2020 12:00:00 AM,01/08/2020,2230,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,...,,1100 W 39TH PL,,34.0141,-118.2978,8,1,2020,22,30
1,190101086,01/02/2020 12:00:00 AM,01/01/2020,0330,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,...,,700 S HILL ST,,34.0459,-118.2545,1,1,2020,3,30
5,200100501,01/02/2020 12:00:00 AM,01/01/2020,0030,1,Central,163,1,121,"RAPE, FORCIBLE",...,,700 S BROADWAY,,34.0452,-118.2534,1,1,2020,0,30
24,200100546,01/15/2020 12:00:00 AM,01/15/2020,0700,1,Central,166,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,600 SAN JULIAN ST,,34.0428,-118.2461,15,1,2020,7,0
27,200100552,01/19/2020 12:00:00 AM,01/19/2020,2000,1,Central,111,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,ALAMEDA,LOS ANGELES,34.0578,-118.2371,19,1,2020,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802932,231300825,06/07/2023 12:00:00 AM,06/07/2023,2203,13,Newton,1322,2,624,BATTERY - SIMPLE ASSAULT,...,,2300 WALL ST,,34.0254,-118.2629,7,6,2023,22,3
802934,231608412,05/21/2023 12:00:00 AM,05/20/2023,2130,16,Foothill,1663,2,624,BATTERY - SIMPLE ASSAULT,...,,12100 SHELDON ST,,34.2374,-118.3964,20,5,2023,21,30
802935,230512110,08/09/2023 12:00:00 AM,08/09/2023,1310,5,Harbor,516,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,1200 N AVALON BL,,33.7868,-118.2658,9,8,2023,13,10
802952,231606525,03/22/2023 12:00:00 AM,03/22/2023,1000,16,Foothill,1602,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,12800 FILMORE ST,,34.2790,-118.4116,22,3,2023,10,0


In [9]:
# Change the type of the column to date
dataframe['day'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.day
dataframe['month'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.month
dataframe['year'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.year

In [10]:
# Change the TIME OCC column to time
dataframe['hour'] = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M').dt.hour
dataframe['minute'] = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M').dt.minute

In [11]:
# Convert sex into: 0 male, 1 female, 2 unkown
col = dataframe['Vict Sex']

col = col.replace("M", 1)
col = col.replace("F", 2)
col = col.replace("X", 3)
col = col.replace(["-", "H", math.nan], 0)

dataframe['Vict Sex'] = col

In [12]:
# convert victim descent
# TODO: drop the columns of nan, -, etc
col = dataframe['Vict Descent']

"""
Descent Code: 
A - Other Asian - 1
B - Black - 2
C - Chinese - 3
D - Cambodian - 4
F - Filipino - 5
G - Guamanian - 6
H - Hispanic/Latin/Mexican - 7
I - American Indian/Alaskan Native - 8
J - Japanese - 9
K - Korean - 10
L - Laotian - 11
O - Other - 12
P - Pacific Islander - 13
S - Samoan - 14
U - Hawaiian - 15
V - Vietnamese - 16
W - White - 17
X - Unknown - 0
Z - Asian Indian- 18
nan - 0
-   - 0
"""

descentCode = {
    "A" : 1,
    "B" : 2,
    "C" : 3,
    "D" : 4,
    "F" : 5,
    "G" : 6,
    "H" : 7,
    "I" : 8,
    "J" : 9,
    "K" : 10,
    "L" : 11,
    "O" : 12,
    "P" : 13,
    "S" : 14,
    "U" : 15,
    "V" : 16,
    "W" : 17,
    "X" : 0,
    "Z" : 18,
    "nan" : 0,
    "-" : 0
}

for value, id in descentCode.items():
    if id != 0:
        col = col.replace(value, id)

col = col.replace(["-", "X", math.nan], 0)

dataframe['Vict Descent'] = col

In [18]:
# get the sample
trainingSet = dataframe.sample(frac=0.8)
testingSet = dataframe[~dataframe.index.isin(trainingSet.index)]


# XTrain = trainingSet[['AREA', 'DATE OCC', 'hour', 'minute']]
# XTrain = trainingSet[['AREA', 'hour', 'Vict Sex', 'Vict Descent', 'day', 'year']]
XTrain = trainingSet[['AREA', 'hour', 'Vict Sex', 'day', 'year']]
yTrain = trainingSet['Crm Cd']

# XTest = trainingSet[['AREA', 'DATE OCC', 'hour', 'minute']]
XTest = testingSet[['AREA', 'hour', 'Vict Sex', 'day', 'year']]
yTest = testingSet['Crm Cd']



In [19]:
# Create the model

rfcModel = ensemble.RandomForestClassifier()
rfcModel.fit(XTrain, yTrain)
# dataframe['DATE OCC']

In [20]:
rfcTestPrediction = rfcModel.predict(XTest)

In [22]:
RFCAccuracy = metrics.accuracy_score(yTest, rfcTestPrediction)
print(RFCAccuracy)

0.5016384697454657
