In [171]:
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import patches as patches
import matplotlib.lines as mlines
import numpy as np

#Columns of the crime CSV that this notebook works with
columns_to_load = [
    "DR Number",
    "Date Occurred",
    "Time Occurred",
    "Area ID",
    "Crime Code",
    "Victim Age",
    "Victim Sex",
    "Victim Descent",
]

#Read only those columns, limited to the first 50,000 rows of the file
data = pd.read_csv(
    "Crime_Data_from_2010_to_Present.csv",
    usecols=columns_to_load,
    nrows=50000,
)

#Report dimensions and inferred dtypes, then preview the first rows
print(data.shape)
print(data.dtypes)
data.head()

(50000, 8)
DR Number           int64
Date Occurred      object
Time Occurred       int64
Area ID             int64
Crime Code          int64
Victim Age        float64
Victim Sex         object
Victim Descent     object
dtype: object


Unnamed: 0,DR Number,Date Occurred,Time Occurred,Area ID,Crime Code,Victim Age,Victim Sex,Victim Descent
0,1208575,03/11/2013,1800,12,626,30.0,F,W
1,102005556,01/22/2010,2300,20,510,,,
2,418,03/18/2013,2030,18,510,12.0,,
3,101822289,11/10/2010,1800,18,510,,,
4,42104479,01/04/2014,2300,21,745,84.0,M,W


In [172]:
#PREPROCESSING
#Data Transformation: Map nominal values to integers between 0 and 10 for easier implementation of algorithms
#Data Reduction: Create groups for values (e.g., create 4-hour intervals for time) 
#   in order to find more frequent patterns and clusters

#Crime Features: Time, Location, Type
#Dates in the CSV are written month/day/year (e.g. 03/11/2013), so parse them with an
#explicit format instead of relying on format inference plus a yearfirst hint that does
#not describe the data.
data['Date Occurred'] = pd.to_datetime(data['Date Occurred'], format="%m/%d/%Y")
#Derive Year, Month and Day-of-Week columns from the parsed date
#(dt.dayofweek numbers the days Monday=0 through Sunday=6)
data["Year Occurred"] = data["Date Occurred"].dt.year
data["Month Occurred"] = data["Date Occurred"].dt.month
data["Day of Week"] = data["Date Occurred"].dt.dayofweek

#Encode each calendar year as its offset from 2010 (2010 -> 0, ..., 2018 -> 8)
year_map = {year: year - 2010 for year in range(2010, 2019)}

#Group months into quarters: Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4
month_map = {month: (month - 1) // 3 + 1 for month in range(1, 13)}

#Group Time values (HHMM integers) into six 4-hour intervals.
#Each entry is (first key, one-past-last key, interval id). The bounds are plain integer
#ranges, so keys whose last two digits are 60-99 are included as well.
time_intervals = [
    (100, 460, 0),    #T0: 1am to 4:59am
    (500, 860, 1),    #T1: 5am to 8:59am
    (900, 1260, 2),   #T2: 9am to 12:59pm
    (1300, 1660, 3),  #T3: 1pm to 4:59pm
    (1700, 2060, 4),  #T4: 5pm to 8:59pm
    (2100, 2360, 5),  #T5: 9pm to 12:59am (part before midnight)
    (0, 60, 5),       #T5: 9pm to 12:59am (part after midnight)
]

time_map = {}
for first, stop, interval in time_intervals:
    for hhmm in range(first, stop):
        time_map[hhmm] = interval

#Group Crime Codes into two categories: Property (0) or Violent (1)
property_codes = [
    210, 220, 310, 320, 330, 331, 341, 343, 345, 347,
    349, 350, 351, 352, 353, 354, 410, 420, 421,
    432, 433, 434, 439, 440, 441, 442, 443, 444,
    445, 446, 450, 451, 452, 453,
    470, 471, 472, 473, 474, 475, 480, 485, 487, 510, 520, 649, 651,
    652, 653, 654, 660, 661, 662, 664, 666, 668, 670, 740, 745, 760,
    814, 822, 888, 890, 900, 903,
    906, 921, 924, 926, 942, 944, 946,
    948, 949, 950, 951, 954,
]
violent_codes = [
    110, 113, 121, 122, 230, 231, 235, 236, 237, 250, 251,
    435, 436, 437, 438, 622, 623, 624, 625, 626, 627, 647, 648,
    753, 755, 756, 761, 762, 763, 805, 806, 810, 812, 813, 815,
    820, 821, 830, 840, 850, 860, 865, 870, 880, 882, 884, 886,
    901, 902, 910, 920, 922, 928, 930, 931, 932, 933, 940, 943, 952, 956,
]

#Property codes are inserted first, then violent codes, so a code present in both
#lists would end up as Violent
crime_map = dict.fromkeys(property_codes, 0)
crime_map.update(dict.fromkeys(violent_codes, 1))

#Group LAPD Geographic Areas by their corresponding bureaus
#bureau id -> list of Area IDs assigned to that bureau
bureau_areas = {
    0: [1, 2, 4, 11, 13],              #A0: Central
    1: [3, 5, 12, 18],                 #A1: South
    2: [6, 7, 8, 14, 20],              #A2: West
    3: [9, 10, 15, 16, 17, 19, 21],    #A3: Valley
}

#Invert the table so each Area ID looks up its bureau id
area_map = {
    area_id: bureau
    for bureau, area_ids in bureau_areas.items()
    for area_id in area_ids
}

    
#Victim Features: Sex, Age, Descent
#Victim Sex: "M" -> 0, "F" -> 1; the codes "H", "X" and "-" are mapped to NaN
#so that they count as missing values.
#np.nan is used because the np.NaN alias was removed in NumPy 2.0.
sex_map = {
    "M": 0,
    "F": 1,
    "H": np.nan,
    "X": np.nan,
    "-": np.nan,
}

#Victim Age: bucket ages 0-99 by decade (0-9 -> 0, 10-19 -> 1, ..., 90-99 -> 9)
age_map = {age: age // 10 for age in range(100)}
    
#Victim Descent: collapse the single-letter descent codes into broader groups.
#np.nan is used for the Unknown code because the np.NaN alias was removed in NumPy 2.0.
descent_groups = [
    (["A", "C", "D", "F", "J", "K", "L", "V", "Z"], 0),  #Asian
    (["G", "P", "S", "U"], 1),                           #Pacific Islander
    (["B"], 2),                                          #Black
    (["H"], 3),                                          #Hispanic/Latinx
    (["I"], 4),                                          #American Indian/Alaska Native
    (["W"], 5),                                          #White
    (["O"], 6),                                          #Other
    (["X"], np.nan),                                     #Unknown -> treated as missing
]

descent_map = {
    code: group
    for codes, group in descent_groups
    for code in codes
}

In [173]:
#PREPROCESSING
#Apply mappings, clear out null values
#
#Each encoded column is assigned back to the frame. Calling .update() on data[col] only
#reaches the frame while data[col] is a view of it; under pandas Copy-on-Write the update
#lands on a temporary and the encoding is lost. .update() also skips NaN results, so a
#value missing from its map kept its raw code next to the encoded ones. With .map() such
#values become NaN and their rows are removed by dropna() below.
#
#Run this cell once per fresh load of the data: the maps are keyed on raw values.
crime_encodings = {
    "Year Occurred": year_map,
    "Month Occurred": month_map,
    "Time Occurred": time_map,
    "Area ID": area_map,
    "Crime Code": crime_map,
}
victim_encodings = {
    "Victim Age": age_map,
    "Victim Sex": sex_map,        #"H", "X", "-" map to NaN
    "Victim Descent": descent_map,  #"X" maps to NaN
}

for column, mapping in {**crime_encodings, **victim_encodings}.items():
    data[column] = data[column].map(mapping)

#Drop every row that has a missing or unmapped value in any column
data = data.dropna(axis=0)

#.map() returns floats whenever it produced a NaN; after dropna() restore the dtypes this
#cell produced before (int64 crime features, float64 victim features)
restored_dtypes = {column: "int64" for column in crime_encodings}
restored_dtypes.update({column: "float64" for column in victim_encodings})
data = data.astype(restored_dtypes)

print(data.shape)
print(data.dtypes)
data.head()

(41272, 11)
DR Number                  int64
Date Occurred     datetime64[ns]
Time Occurred              int64
Area ID                    int64
Crime Code                 int64
Victim Age               float64
Victim Sex               float64
Victim Descent           float64
Year Occurred              int64
Month Occurred             int64
Day of Week                int64
dtype: object


Unnamed: 0,DR Number,Date Occurred,Time Occurred,Area ID,Crime Code,Victim Age,Victim Sex,Victim Descent,Year Occurred,Month Occurred,Day of Week
0,1208575,2013-03-11,4,1,1,3.0,1.0,5.0,3,1,0
4,42104479,2014-01-04,5,3,0,8.0,0.0,5.0,4,1,5
5,120125367,2013-01-08,3,0,1,4.0,1.0,5.0,3,1,1
9,120908292,2013-01-15,1,3,0,2.0,1.0,6.0,3,1,1
12,121207315,2013-02-13,2,1,0,4.0,0.0,3.0,3,1,2


In [174]:
#Build the feature matrices: crime type, victim information, and time/location of crime
victim_columns = ["Victim Age", "Victim Sex", "Victim Descent"]
area_time_columns = ["Year Occurred", "Month Occurred", "Day of Week",
                     "Time Occurred", "Area ID"]

size = len(data.index)

#Crime type as a single-column 2-D array
crime_type = data["Crime Code"].values.reshape(size,1)
#One array column per listed DataFrame column, in the listed order
victim = np.column_stack([data[name] for name in victim_columns])
area_time = np.column_stack([data[name] for name in area_time_columns])

#Report the shape of each matrix
for matrix in (crime_type, victim, area_time):
    print(matrix.shape)

(41272, 1)
(41272, 3)
(41272, 5)
