In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import collections
from datetime import datetime

In [2]:
df = pd.read_csv("dataset/complaint_data.csv")
df.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,...,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,ZIPCODE
0,100006411,10/16/15,18:27:00,10/16/15,22:19:00,10/16/15,125,NYS LAWS-UNCLASSIFIED FELONY,844.0,CAUSE SPI/KILL ANIMAL,...,INSIDE,RESIDENCE - APT. HOUSE,,,1015953,248464,40.848603,-73.885411,"(40.848602657, -73.885410626)",10457.0
1,100006874,11/11/16,19:20:00,,,11/11/16,351,CRIMINAL MISCHIEF & RELATED OF,259.0,"CRIMINAL MISCHIEF,UNCLASSIFIED 4",...,INSIDE,RESIDENCE - PUBLIC HOUSING,,WEEKSVILLE GARDENS,1002388,185584,40.676052,-73.934611,"(40.676051596, -73.934611088)",11213.0
2,100007471,11/11/15,23:00:00,11/12/15,11:00:00,11/14/15,109,GRAND LARCENY,421.0,"LARCENY,GRAND FROM VEHICLE/MOTORCYCLE",...,FRONT OF,STREET,,,1039330,225596,40.785722,-73.801102,"(40.78572169, -73.801102117)",11357.0
3,100009724,4/12/16,14:00:00,4/12/16,14:45:00,4/12/16,351,CRIMINAL MISCHIEF & RELATED OF,259.0,"CRIMINAL MISCHIEF,UNCLASSIFIED 4",...,INSIDE,GROCERY/BODEGA,,,1006440,240227,40.826024,-73.919823,"(40.826023516, -73.919822692)",10451.0
4,100009898,8/29/16,21:01:00,8/29/16,21:01:00,8/29/16,235,DANGEROUS DRUGS,511.0,"CONTROLLED SUBSTANCE, POSSESSI",...,INSIDE,PUBLIC BUILDING,,,997580,207756,40.736917,-73.951901,"(40.736917049, -73.951900796)",11222.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618192 entries, 0 to 618191
Data columns (total 25 columns):
CMPLNT_NUM           618192 non-null int64
CMPLNT_FR_DT         618192 non-null object
CMPLNT_FR_TM         618192 non-null object
CMPLNT_TO_DT         511554 non-null object
CMPLNT_TO_TM         511834 non-null object
RPT_DT               618192 non-null object
KY_CD                618192 non-null int64
OFNS_DESC            617900 non-null object
PD_CD                617768 non-null float64
PD_DESC              617768 non-null object
CRM_ATPT_CPTD_CD     618192 non-null object
LAW_CAT_CD           618192 non-null object
JURIS_DESC           618192 non-null object
BORO_NM              618192 non-null object
ADDR_PCT_CD          618192 non-null int64
LOC_OF_OCCUR_DESC    492790 non-null object
PREM_TYP_DESC        615660 non-null object
PARKS_NM             6274 non-null object
HADEVELOPT           31443 non-null object
X_COORD_CD           618192 non-null int64
Y_COORD_CD     

In [4]:
df_col_desc = pd.read_csv("dataset/column_desc.csv")
df_col_desc

Unnamed: 0,Column,Description
0,CMPLNT_NUM,Randomly generated persistent ID for each comp...
1,CMPLNT_FR_DT,Exact date of occurrence for the reported even...
2,CMPLNT_FR_TM,Exact time of occurrence for the reported even...
3,CMPLNT_TO_DT,Ending date of occurrence for the reported eve...
4,CMPLNT_TO_TM,Ending time of occurrence for the reported eve...
5,RPT_DT,Date event was reported to police
6,KY_CD,Three digit offense classification code
7,OFNS_DESC,Description of offense corresponding with key ...
8,PD_CD,Three digit internal classification code (more...
9,PD_DESC,Description of internal classification corresp...


In [5]:
# Data cleansing - removing null columns and data points

dropcols = ["Lat_Lon","X_COORD_CD","Y_COORD_CD","CMPLNT_NUM"] # Redundant columns
for i in df.columns:
    if float(df[i].isnull().sum())/len(df)*100 > 20: # if more than 10% data is empty, clip the column
        dropcols.append(i)
df = df.drop(dropcols, axis=1)
df = df[~df.isnull().any(axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509331 entries, 0 to 618191
Data columns (total 18 columns):
CMPLNT_FR_DT        509331 non-null object
CMPLNT_FR_TM        509331 non-null object
CMPLNT_TO_DT        509331 non-null object
CMPLNT_TO_TM        509331 non-null object
RPT_DT              509331 non-null object
KY_CD               509331 non-null int64
OFNS_DESC           509331 non-null object
PD_CD               509331 non-null float64
PD_DESC             509331 non-null object
CRM_ATPT_CPTD_CD    509331 non-null object
LAW_CAT_CD          509331 non-null object
JURIS_DESC          509331 non-null object
BORO_NM             509331 non-null object
ADDR_PCT_CD         509331 non-null int64
PREM_TYP_DESC       509331 non-null object
Latitude            509331 non-null float64
Longitude           509331 non-null float64
ZIPCODE             509331 non-null float64
dtypes: float64(4), int64(2), object(12)
memory usage: 73.8+ MB


In [6]:
# Parsing the Date column 
df["RPT_DT"] = df["RPT_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y').strftime('%Y')).astype('datetime64[ns]')
df["CMPLNT_FR_DT"] = df["CMPLNT_FR_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y').strftime('%Y')).astype('datetime64[ns]')
df["CMPLNT_TO_DT"] = df["CMPLNT_TO_DT"].apply(lambda x: datetime.strptime(x, '%m/%d/%y').strftime('%Y')).astype('datetime64[ns]')

df["RPT_DT"] = pd.to_datetime(df["RPT_DT"])
df["CMPLNT_FR_DT"] = pd.to_datetime(df["CMPLNT_FR_DT"])
df["CMPLNT_TO_DT"] = pd.to_datetime(df["CMPLNT_TO_DT"])

df["CMPLNT_FR_TM"] = pd.to_datetime(df["CMPLNT_FR_TM"],format='%H:%M:%S').dt.time
df["CMPLNT_TO_TM"] = pd.to_datetime(df["CMPLNT_TO_TM"],format='%H:%M:%S').dt.time
df = df[~(df['CMPLNT_TO_DT'] >= '2017-01-01')]

In [7]:
df[["RPT_DT","CMPLNT_FR_DT","CMPLNT_TO_DT"]].max()

RPT_DT         2016-01-01
CMPLNT_FR_DT   2016-01-01
CMPLNT_TO_DT   2016-01-01
dtype: datetime64[ns]

In [8]:
df[["RPT_DT","CMPLNT_FR_DT","CMPLNT_TO_DT"]].min()

RPT_DT         2015-01-01
CMPLNT_FR_DT   2015-01-01
CMPLNT_TO_DT   2015-01-01
dtype: datetime64[ns]

In [9]:
# Compiling list of Offenses, their categories and key codes

offense_code = {}
offlist = df["OFNS_DESC"].unique()
for each in offlist:
    key = df[df["OFNS_DESC"]==each]["KY_CD"].unique()[0]
    offense_code[key] = each

print "Offense Codes and Description " + str(len(offense_code))
print ""
offense_code = collections.OrderedDict(sorted(offense_code.items()))
for k,v in offense_code.items():
    print k,v
    
# Repeating the same for Police Codes and Descriptions

pd_code = {}
pdlist = df["PD_DESC"].unique()
for each in pdlist:
    key = df[df["PD_DESC"]==each]["PD_CD"].unique()[0]
    pd_code[key] = each

print ""
print "Police Department Codes and Descriptions " + str(len(pd_code))
pd_code = collections.OrderedDict(sorted(pd_code.items()))
for k,v in pd_code.items():
    print k,v
    
# PD Code list is extremely desriptive and will detract the machine learning algorithm for clustering. 
# Using only OFNS_CODE, KY_CD

Offense Codes and Description 54

102 HOMICIDE-NEGLIGENT-VEHICLE
103 HOMICIDE-NEGLIGENT,UNCLASSIFIE
105 ROBBERY
106 FELONY ASSAULT
107 BURGLARY
109 GRAND LARCENY
110 GRAND LARCENY OF MOTOR VEHICLE
111 POSSESSION OF STOLEN PROPERTY
112 THEFT-FRAUD
113 FORGERY
114 ARSON
118 DANGEROUS WEAPONS
119 INTOXICATED/IMPAIRED DRIVING
120 ENDAN WELFARE INCOMP
124 KIDNAPPING
125 NYS LAWS-UNCLASSIFIED FELONY
126 MISCELLANEOUS PENAL LAW
230 JOSTLING
231 BURGLAR'S TOOLS
233 SEX CRIMES
235 DANGEROUS DRUGS
237 ESCAPE 3
238 FRAUDULENT ACCOSTING
340 FRAUDS
341 PETIT LARCENY
342 PETIT LARCENY OF MOTOR VEHICLE
343 THEFT OF SERVICES
344 ASSAULT 3 & RELATED OFFENSES
345 OFFENSES RELATED TO CHILDREN
346 ALCOHOLIC BEVERAGE CONTROL LAW
347 INTOXICATED & IMPAIRED DRIVING
348 VEHICLE AND TRAFFIC LAWS
349 DISRUPTION OF A RELIGIOUS SERV
350 GAMBLING
351 CRIMINAL MISCHIEF & RELATED OF
352 CRIMINAL TRESPASS
353 UNAUTHORIZED USE OF A VEHICLE
354 ANTICIPATORY OFFENSES
355 OFFENSES AGAINST THE PERSON
356 PROSTITUTION & RE

In [18]:
# NOTES

# CRM_ATPT_CPTD_CD - Generate Category variable (Successful, Failed, Attempted)
# Types of Crime across Zipcodes - Start with LAW_CAT_CD
# Precinct involved - look for patterns across this
# Categorical variables include - CRM_ATPT_CPTD_CD, KY_CD, PD_CD, LAW_CAT_CD, ADDR_PCT_CD, BORO_NM, PREM_TYP_DESC, ZIPCODE
# Numerical variables - Latitude, Longitude
# TRY - KMODES clustering for mixed data
# Check diff in dates of crime committed and reported
# Drop columns PD_CD, PD_DESC, OFNS_DESC
# Lengths of crimes - 'CMPLNT_FR_TM','CMPLNT_TO_TM'

df = df.drop(['PD_CD','PD_DESC','OFNS_DESC'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509330 entries, 0 to 618191
Data columns (total 16 columns):
CMPLNT_FR_DT        509330 non-null datetime64[ns]
CMPLNT_FR_TM        509330 non-null object
CMPLNT_TO_DT        509330 non-null datetime64[ns]
CMPLNT_TO_TM        509330 non-null object
RPT_DT              509330 non-null datetime64[ns]
KY_CD               509330 non-null int64
OFNS_DESC           509330 non-null object
CRM_ATPT_CPTD_CD    509330 non-null object
LAW_CAT_CD          509330 non-null object
JURIS_DESC          509330 non-null object
BORO_NM             509330 non-null object
ADDR_PCT_CD         509330 non-null int64
PREM_TYP_DESC       509330 non-null object
Latitude            509330 non-null float64
Longitude           509330 non-null float64
ZIPCODE             509330 non-null float64
dtypes: datetime64[ns](3), float64(3), int64(2), object(8)
memory usage: 66.1+ MB


In [None]:
# K-Means Clustering Algorithm
from sklearn.neighbors import NearestNeighbors
def findmyneighbor(train, n):
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(train)
    distances, indices = nbrs.kneighbors(X)
    return distances, indices

cols = ['KY_CD','Latitude','Longitude']

centers, labels = clustering(df[cols],3) 

# Segregating the data according to their labels

label0 = []
label1 = []
label2 = []

cols.append('Borough')
cols.append('Zipcode')
for i in range(len(df)):
    if labels[i]%3==0:
        label0.append(df.iloc[i][cols])
    elif labels[i]%3==1:
        label1.append(df.iloc[i][cols])
    else:
        label2.append(df.iloc[i][cols])

cluster0 = pd.DataFrame(label0)
cluster1 = pd.DataFrame(label1)
cluster2 = pd.DataFrame(label2)

print cluster0.mean()
print cluster1.mean()
print cluster2.mean()

In [28]:
from kmodes.kprototypes import KPrototypes

In [36]:
# stocks with their market caps, sectors and countries
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)
print X
kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))

[[738.5 'tech' 'USA']
 [369.5 'nrg' 'USA']
 [368.2 'tech' 'USA']
 [346.7 'tech' 'USA']
 [343.5 'fin' 'USA']
 [282.4 'fin' 'USA']
 [282.1 'tel' 'CN']
 [279.7 'cons' 'USA']
 [257.2 'cons' 'USA']
 [205.2 'tel' 'USA']
 [192.1 'tech' 'USA']
 [195.7 'nrg' 'NL']]
Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Init: initializing centroids
Init: initializing clusters
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 2, ncost: 1664.47085495
Run: 1, iteration: 2/100, moves: 0, ncost: 1664.47085495
[array([[ 197.66666667],
       [ 275.35      ],
       [ 738.5       ],
       [ 356.975     ]]), array([['nrg', 'USA'],
       ['cons', 'USA'],
       ['tech', 'USA'],
       ['tech', 'USA']],
      dtype='|S4')]
1664.47085495
2
Symbol: AAPL, cluster:2
Symbol: XOM, cluster:3
Symbol: GOOGL, cluster:3
Symbol: MSFT, cluster:3
Symbol: BRK-A, cluster:3
Symbol: