In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import collections
from datetime import datetime

df = pd.read_csv("dataset/complaint_data.csv")
df_col_desc = pd.read_csv("dataset/column_desc.csv")

# Data cleansing - removing null columns and data points

dropcols = ["Lat_Lon","X_COORD_CD","Y_COORD_CD","CMPLNT_NUM"] # Redundant columns
for i in df.columns:
    if float(df[i].isnull().sum())/len(df)*100 > 20: # if more than 10% data is empty, clip the column
        dropcols.append(i)
df = df.drop(dropcols, axis=1)
df = df[~df.isnull().any(axis=1)]

# Parsing the DateTime column 

df["RPT_DT"] = df["RPT_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y')
                                  .strftime('%Y-%m-%d')).astype('datetime64[ns]')
df["CMPLNT_FR_DT"] = df["CMPLNT_FR_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y')
                                              .strftime('%Y-%m-%d')).astype('datetime64[ns]')
df["CMPLNT_TO_DT"] = df["CMPLNT_TO_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y')
                                              .strftime('%Y-%m-%d')).astype('datetime64[ns]')

df["RPT_DT"] = pd.to_datetime(df["RPT_DT"])
df["CMPLNT_FR_DT"] = pd.to_datetime(df["CMPLNT_FR_DT"])
df["CMPLNT_TO_DT"] = pd.to_datetime(df["CMPLNT_TO_DT"])

df["FR_TM_CAT"] = df["CMPLNT_FR_TM"].apply(lambda x: int(x.strip().split(':')[0])//6)
df["FR_TM_CAT"] = df["CMPLNT_TO_TM"].apply(lambda x: int(x.strip().split(':')[0])//6)

# Removing outliers

df = df[~(df['CMPLNT_TO_DT'] >= '2017-01-01')]

In [2]:
# Compiling list of Offenses, their categories and key codes

offense_code = {}
offlist = df["OFNS_DESC"].unique()
for each in offlist:
    key = df[df["OFNS_DESC"]==each]["KY_CD"].unique()[0]
    offense_code[key] = each

print "Offense Codes and Description " + str(len(offense_code))
print ""
offense_code = collections.OrderedDict(sorted(offense_code.items()))
for k,v in offense_code.items():
    print k,v
    
# Repeating the same for Police Codes and Descriptions

pd_code = {}
pdlist = df["PD_DESC"].unique()
for each in pdlist:
    key = df[df["PD_DESC"]==each]["PD_CD"].unique()[0]
    pd_code[key] = each

print ""
print "Police Department Codes and Descriptions " + str(len(pd_code))
pd_code = collections.OrderedDict(sorted(pd_code.items()))
for k,v in pd_code.items():
    print k,v

# INFERENCES

# Using only KY_CD, PD_CD for analysis

#df = df.drop(['PD_DESC','OFNS_DESC'], axis=1)

Offense Codes and Description 54

102 HOMICIDE-NEGLIGENT-VEHICLE
103 HOMICIDE-NEGLIGENT,UNCLASSIFIE
105 ROBBERY
106 FELONY ASSAULT
107 BURGLARY
109 GRAND LARCENY
110 GRAND LARCENY OF MOTOR VEHICLE
111 POSSESSION OF STOLEN PROPERTY
112 THEFT-FRAUD
113 FORGERY
114 ARSON
118 DANGEROUS WEAPONS
119 INTOXICATED/IMPAIRED DRIVING
120 ENDAN WELFARE INCOMP
124 KIDNAPPING
125 NYS LAWS-UNCLASSIFIED FELONY
126 MISCELLANEOUS PENAL LAW
230 JOSTLING
231 BURGLAR'S TOOLS
233 SEX CRIMES
235 DANGEROUS DRUGS
237 ESCAPE 3
238 FRAUDULENT ACCOSTING
340 FRAUDS
341 PETIT LARCENY
342 PETIT LARCENY OF MOTOR VEHICLE
343 THEFT OF SERVICES
344 ASSAULT 3 & RELATED OFFENSES
345 OFFENSES RELATED TO CHILDREN
346 ALCOHOLIC BEVERAGE CONTROL LAW
347 INTOXICATED & IMPAIRED DRIVING
348 VEHICLE AND TRAFFIC LAWS
349 DISRUPTION OF A RELIGIOUS SERV
350 GAMBLING
351 CRIMINAL MISCHIEF & RELATED OF
352 CRIMINAL TRESPASS
353 UNAUTHORIZED USE OF A VEHICLE
354 ANTICIPATORY OFFENSES
355 OFFENSES AGAINST THE PERSON
356 PROSTITUTION & RE

In [3]:
# NOTES

# CRM_ATPT_CPTD_CD - Generate Category variable (Successful, Failed, Attempted)
# Types of Crime across Zipcodes - Start with LAW_CAT_CD
# Precinct involved - look for patterns across this
# Categorical variables include - CRM_ATPT_CPTD_CD, KY_CD, PD_CD, LAW_CAT_CD, ADDR_PCT_CD, 
#                                 BORO_NM, PREM_TYP_DESC, ZIPCODE
# Numerical variables - Latitude, Longitude
# TRY - KMODES clustering for mixed data
# Check diff in dates of crime committed and reported
# Drop columns PD_CD, PD_DESC, OFNS_DESC
# Duration of crimes - 'CMPLNT_FR_TM','CMPLNT_TO_TM'

In [4]:
# MACHINE LEARNING - GENERATING CATEGORY VARIABLES

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['CRM_ATPT_CPTD_CD'])
df['CRM_CAT'] = le.transform(df['CRM_ATPT_CPTD_CD'])

le = preprocessing.LabelEncoder()
le.fit(df['LAW_CAT_CD'])
df['LAW_CAT'] = le.transform(df['LAW_CAT_CD'])

le = preprocessing.LabelEncoder()
le.fit(df['JURIS_DESC'])
df['JURIS_CAT'] = le.transform(df['JURIS_DESC'])

le = preprocessing.LabelEncoder()
le.fit(df['BORO_NM'])
df['BORO_CAT'] = le.transform(df['BORO_NM'])

le = preprocessing.LabelEncoder()
le.fit(df['PREM_TYP_DESC'])
df['PREM_CAT'] = le.transform(df['PREM_TYP_DESC'])

df['DATE_CAT'] = df['CMPLNT_FR_DT'].map(lambda x: x.month)
df = df.reset_index(drop=True)

In [5]:
# K-Means Clustering Algorithm
from sklearn.neighbors import NearestNeighbors
from kmodes.kprototypes import KPrototypes

# KMeans Algorithm

from sklearn.cluster import KMeans
def clustering(train, usecols, n):
    #kmeans = KMeans(n_clusters=n, random_state=0,).fit(train)
    kproto = KPrototypes(n_clusters=n, init='Cao', verbose=2)
    kproto.fit_predict(train, categorical=usecols)
    return kproto.cluster_centroids_, kproto.labels_

cols = ["FR_TM_CAT","DATE_CAT","KY_CD","CRM_CAT","LAW_CAT","BORO_CAT","ADDR_PCT_CD",
        "PREM_CAT","Latitude","Longitude"]
cat = [0,1,2,3,4,5,6,7] # as first n columns are categorical
num_clusters = 3
centers, labels = clustering(df[cols].as_matrix(),cat,num_clusters)
print centers

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 61537, ncost: 64529186.4909
Run: 1, iteration: 2/100, moves: 3148, ncost: 64529186.2363
Run: 1, iteration: 3/100, moves: 243, ncost: 64529186.2314
Run: 1, iteration: 4/100, moves: 44, ncost: 64529186.2311
Run: 1, iteration: 5/100, moves: 2, ncost: 64529186.2311
Run: 1, iteration: 6/100, moves: 0, ncost: 64529186.2311
[array([[ 40.70255807, -73.94656836],
       [ 40.76374052, -73.93925212],
       [ 40.74696025, -73.87418128]]), array([[   2.,   10.,  341.,    1.,    1.,    1.,   75.,   58.],
       [   3.,    9.,  109.,    1.,    0.,    2.,   14.,   49.],
       [   1.,   11.,  344.,    1.,    1.,    3.,  114.,   49.]])]


In [None]:
# K-Means Clustering Algorithm
from sklearn.neighbors import NearestNeighbors
from kmodes.kprototypes import KPrototypes

# KMeans Algorithm

from sklearn.cluster import KMeans
def clustering(train, usecols, n):
    #kmeans = KMeans(n_clusters=n, random_state=0,).fit(train)
    kproto = KPrototypes(n_clusters=n, init='Cao', verbose=2)
    kproto.fit_predict(train, categorical=usecols)
    return kproto.cluster_centroids_, kproto.labels_

cols = ["FR_TM_CAT","DATE_CAT","KY_CD","CRM_CAT","LAW_CAT","BORO_CAT","ADDR_PCT_CD",
        "PREM_CAT","Latitude","Longitude"]
cat = [0,1,2,3,4,5,6,7] # as first n columns are categorical
num_clusters = 2
centers2, labels22 = clustering(df[cols].as_matrix(),cat,num_clusters)
print centers2

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...


In [None]:
# K-Means Clustering Algorithm
from sklearn.neighbors import NearestNeighbors
from kmodes.kprototypes import KPrototypes

# KMeans Algorithm

from sklearn.cluster import KMeans
def clustering(train, usecols, n):
    #kmeans = KMeans(n_clusters=n, random_state=0,).fit(train)
    kproto = KPrototypes(n_clusters=n, init='Cao', verbose=2)
    kproto.fit_predict(train, categorical=usecols)
    return kproto.cluster_centroids_, kproto.labels_

cols = ["FR_TM_CAT","DATE_CAT","KY_CD","CRM_CAT","LAW_CAT","BORO_CAT","ADDR_PCT_CD",
        "PREM_CAT","Latitude","Longitude"]
cat = [0,1,2,3,4,5,6,7] # as first n columns are categorical
num_clusters = 4
centers4, labels4 = clustering(df[cols].as_matrix(),cat,num_clusters)
print centers4

In [None]:
# K-Means Clustering Algorithm
from sklearn.neighbors import NearestNeighbors
from kmodes.kprototypes import KPrototypes

# KMeans Algorithm

from sklearn.cluster import KMeans
def clustering(train, usecols, n):
    #kmeans = KMeans(n_clusters=n, random_state=0,).fit(train)
    kproto = KPrototypes(n_clusters=n, init='Cao', verbose=2)
    kproto.fit_predict(train, categorical=usecols)
    return kproto.cluster_centroids_, kproto.labels_

cols = ["FR_TM_CAT","DATE_CAT","KY_CD","CRM_CAT","LAW_CAT","BORO_CAT","ADDR_PCT_CD",
        "PREM_CAT","Latitude","Longitude"]
cat = [0,1,2,3,4,5,6,7] # as first n columns are categorical
num_clusters = 5
centers5, labels5 = clustering(df[cols].as_matrix(),cat,num_clusters)
print centers5

In [6]:
df['labels'] = labels
print df['labels'].value_counts()

# 1. Plot number of crimes vs clusters
# 2. Plot lat-long for every clusters and compare
# 3. Plot number of crimes foiled (%) vs clusters
# 4. Plot category of crimes for every clusters
# 5. Plot category of premises for every clusters

0    227241
1    149692
2    132397
Name: labels, dtype: int64


In [8]:
df['labels'].value_counts()

0    227241
1    149692
2    132397
Name: labels, dtype: int64

In [9]:
cluster0 = df[df['labels']==0]
cluster1 = df[df['labels']==1]
cluster2 = df[df['labels']==2]

In [None]:
for i in range(3):
    