In [1]:
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn.metrics import confusion_matrix

In [2]:
def CreateTable(col_n, series_in, continu=False):
    TupleStructure = namedtuple(col_n,["uniqType","true_p","false_p"])
    
    if(continu):
        f_table = TupleStructure(
            uniqType=np.asarray(["mean", "sd"]),
            true_p=np.zeros(2),
            false_p=np.zeros(2)
            )
    else:
        u_type = series_in.unique()
        f_table = TupleStructure(
            uniqType=np.asarray(u_type),
            true_p=np.zeros(len(u_type)),
            false_p=np.zeros(len(u_type))
            )
    return f_table

In [3]:
def fillTable(i_type, series_in, series_t, truefalse=True):
    index_in = np.where(series_in == i_type)
    index_t = np.where(series_t == truefalse)
    molecular = len(np.intersect1d(index_in[0], index_t[0]))+1
    denominator = len(index_t[0])+len(series_in.unique())
    return molecular/denominator

In [4]:
def fillTableNorm(series_in, series_t, truefalse=True):
    index_t = np.where(series_t == truefalse)[0]
    print(index_t)
    mean = np.mean(series_in.iloc[index_t])
    sd = np.std(series_in.iloc[index_t])
    print(mean, sd)
    return (mean,sd)

In [5]:
def LaplacianSmoothing(i_type, series_in, series_t, truefalse=True, k=0):
    index_in = np.where(series_in == i_type)
    index_t = np.where(series_t == truefalse)
    molecular = len(np.intersect1d(index_in[0], index_t[0])) + k
    denominator = len(index_t[0]) + k*len(series_in.unique())
    return molecular/denominator

In [6]:
def CalculateNaiveBayes(table, cts_table, features, ctsfeatures, queries):
    tp = 1
    fp = 1
    for f in features:
        if not np.where(table[f].uniqType == queries[f]) or not np.where(table[f].uniqType == queries[f])[0]:
            #print(np.where(table[f].uniqType == queries[f]))
            tp = tp * 0.5
            fp = fp * 0.5
        else:
            #print(np.where(table[f].uniqType == queries[f]))
            idx = int(np.where(table[f].uniqType == queries[f])[0])
            tp = tp * list(table[f].true_p)[idx]
            fp = fp * list(table[f].false_p)[idx]
    for f in ctsfeatures:
        if pd.isnull(queries[f]):
            tp = tp * 0.5
            fp = fp * 0.5
        else:
            tmean = cts_table[f].true_p[0]
            tstd = cts_table[f].true_p[1]
            fmean = cts_table[f].false_p[0]
            fstd = cts_table[f].false_p[1]
            #print(f,":",tmean,tstd,fmean,fstd)
            tp = tp * np.exp(-(queries[f]-tmean)**2/(2*(tstd**2)))/np.sqrt(2*np.pi*(tstd**2))
            fp = fp * np.exp(-(queries[f]-fmean)**2/(2*(fstd**2)))/np.sqrt(2*np.pi*(fstd**2))
        
    return tp >= fp

In [7]:
data = pd.DataFrame.from_csv("test.csv")
print("Original Length: ", len(data))
data = data.dropna(axis=0)
print("Dropna Length: ", len(data))
data.head(1)

Original Length:  6305988
Dropna Length:  5621365


Unnamed: 0_level_0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,Community Area,Year,Latitude,Longitude,distance,Month,Day,Hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10404989,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,STREET,True,False,28.0,25.0,2016,41.888659,-87.748942,0.00168,2,5,11


In [8]:
#Cut off outliers
data = data[data['Latitude'] > 41.644]
data = data[data['Latitude'] < 42.020]
data = data[data['Longitude'] > -87.940]
data = data[data['Longitude'] < -87.521]
data = data[data['distance'] < 100000]
print("Cut off outliers:", len(data))

Cut off outliers: 5602755


In [9]:
#data = data[data['Primary Type']=="BATTERY"]

In [10]:
print(len(data))
features = ['Primary Type','Description','Location Description', 'Domestic', 'Ward', 'Hour']
ctsfeatures = ['distance','Latitude','Longitude']
for col in features:
    data=data[data[col].notnull()]
print(len(data))
for col in ctsfeatures:
    data=data[data[col].notnull()]
print(len(data))

5602755
5602755
5602755


In [11]:
data = data.rename(columns={"Location Description": "LocationDescription"})
data = data.rename(columns={"Primary Type": "PrimaryType"})
data_test = data[data['Year']==2017]
data1 = data[data['Year'] < 2017]

target = "Arrest"


cts_table ={}
ctsfeatures = ['distance','Latitude','Longitude']
for col_n in ctsfeatures:
    small_table = CreateTable(col_n, data1[col_n], continu=True)
    tr_arr = fillTableNorm(data1[col_n], data1[target], True)
    fa_arr = fillTableNorm(data1[col_n], data1[target], False)
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    
    cts_table[col_n]=small_table

print("CTS Done")

big_table = {}
features = ['PrimaryType','Description','LocationDescription', 'Domestic', 'Ward', 'Hour']

# Use np.where(condition) to look for indice
for col_n in features:
    small_table = CreateTable(col_n, data1[col_n])
    tr_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], True), data1[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], False), data1[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    #small_table.true_p = 
    #small_table.false_p = 
    big_table[col_n]=small_table



[      0       1       2 ..., 5543065 5543071 5543072]
0.00611311125808 0.00533359405214
[      3       4       5 ..., 5543068 5543069 5543070]
0.00643528568478 0.00557854144056
[      0       1       2 ..., 5543065 5543071 5543072]
41.8417727123 0.0810852127479
[      3       4       5 ..., 5543068 5543069 5543070]
41.8409043035 0.0881223374476
[      0       1       2 ..., 5543065 5543071 5543072]
-87.6751272437 0.0578493569233
[      3       4       5 ..., 5543068 5543069 5543070]
-87.6706332414 0.0595413570756
CTS Done


In [12]:
print('Discrete case:')
for col in features:
    answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, [col], [], data_test.loc[x]), data_test.index))  
    print(col)
    print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
    print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

print('Cts case:')
for col in ctsfeatures:
    answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, [], [col], data_test.loc[x]), data_test.index))  
    print(col)
    print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
    print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

Discrete case:
PrimaryType
AC:  0.8704969672598103
[[ 5171  6495]
 [ 1234 46782]]
Description
AC:  0.8696424382560907
[[ 7766  3900]
 [ 3880 44136]]
LocationDescription
AC:  0.589239636741396
[[ 7427  4239]
 [20276 27740]]
Domestic
AC:  0.30335779632049864
[[10033  1633]
 [39944  8072]]
Ward
AC:  0.6113233470728193
[[ 5214  6452]
 [16745 31271]]
Hour
AC:  0.5903287423343722
[[ 5459  6207]
 [18243 29773]]
Cts case:
distance
AC:  0.33505914681143395
[[ 9312  2354]
 [37331 10685]]
Latitude
AC:  0.43689889749003047
[[ 7645  4021]
 [29586 18430]]
Longitude
AC:  0.45298414932475456
[[ 6902  4764]
 [27883 20133]]


In [13]:
answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, data_test.loc[x]), data_test.index))  

print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

print("AC: ",len(np.where(data_test['Arrest']==False)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], [False]*len(data_test['Arrest']),labels=[True, False]))

AC:  0.855967353492106
[[ 7483  4198]
 [ 4414 43697]]
AC:  0.8046394166443671
[[    0 11681]
 [    0 48111]]


In [14]:
big_table

{'Description': Description(uniqType=array(['UNLAWFUL POSS OF HANDGUN', '$500 AND UNDER',
       'POSS: HEROIN(BRN/TAN)', 'DOMESTIC BATTERY SIMPLE',
       'ANIMAL ABUSE/NEGLECT', 'RETAIL THEFT', 'SIMPLE', 'AGGRAVATED',
       'FORCIBLE ENTRY', 'FROM BUILDING', 'COUNTERFEITING DOCUMENT',
       'UNLAWFUL ENTRY', 'GUN OFFENDER: DUTY TO REGISTER',
       'POSS: HEROIN(WHITE)', 'TO PROPERTY', 'AGG CRIMINAL SEXUAL ABUSE',
       'ATTEMPT FORCIBLE ENTRY', 'BY FIRE', 'STRONGARM - NO WEAPON',
       'TO VEHICLE', 'FINANCIAL IDENTITY THEFT OVER $ 300',
       'FRAUD OR CONFIDENCE GAME', 'POSS: CANNABIS 30GMS OR LESS',
       'TO RESIDENCE', 'MANU/DEL:CANNABIS 10GM OR LESS',
       'HARASSMENT BY TELEPHONE', 'HARASSMENT BY ELECTRONIC MEANS',
       'FIRST DEGREE MURDER', 'CHILD ABUSE', 'NON-AGGRAVATED',
       'CREDIT CARD FRAUD', 'ARMED: HANDGUN', 'POCKET-PICKING',
       'POSS: CRACK', 'ARSON THREAT', 'AGGRAVATED: HANDGUN',
       'PRO EMP HANDS NO/MIN INJURY', 'TO LAND', 'GAME/DICE', 'AUTOMO