In [1]:
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn.metrics import confusion_matrix

### CreateTable(col_n, series_in)

    col_n: column name(selected feature)
    series_in: pandas series
    
    f_table: naive bayes table for the selected feature

#### Example:
    small_table = CreateTable("Description", theft["Description"])

In [2]:
def CreateTable(col_n, series_in):
    TupleStructure = namedtuple(col_n,["uniqType","true_p","false_p"])
    u_type = series_in.unique()

    f_table = TupleStructure(
              uniqType=np.asarray(u_type),
              true_p=np.zeros(len(u_type)),
              false_p=np.zeros(len(u_type))
              )
    return f_table

### FillTable(i_type, series_in, series_t, truefalse=True)

    i_type: a category in a feature
    series_in: pandas series
    series_t: target pandas series(in our case it's "Arrest")
    truefalse: True or False.
    
    molecular/denominator: an array of conditional probability for each category in a feature
    
#### Example:
    col_n = "Description"
    tr_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], True), theft[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], False), theft[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)

In [3]:
def fillTable(i_type, series_in, series_t, truefalse=True):
    index_in = np.where(series_in == i_type)
    index_t = np.where(series_t == truefalse)
    molecular = len(np.intersect1d(index_in[0], index_t[0]))+1
    denominator = len(index_t[0])+len(series_in.unique())
    return molecular/denominator

In [4]:
def fillTableNorm(series, series_in, series_t, truefalse=True):
    tb = series_in[(series_t == truefalse)&(series_in.notnull())&(series['distance']<100000)]
    mean = np.mean(tb)
    sd = np.std(tb)
    return (mean,sd)

In [5]:
def LaplacianSmoothing(i_type, series_in, series_t, truefalse=True, k=0):
    index_in = np.where(series_in == i_type)
    index_t = np.where(series_t == truefalse)
    molecular = len(np.intersect1d(index_in[0], index_t[0])) + k
    denominator = len(index_t[0]) + k*len(series_in.unique())
    return molecular/denominator

### CalculateNaiveBayes(table, features, queries)
    
    table: a list of small tabel
    features: an array of features (the order of the features should be the same as the table) 
    queries: a row of dataframe

#### Example
    features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']
    answer = list(map(lambda x: CalculateNaiveBayes(big_table, features, theft_test.loc[x]), theft_test.index))

In [40]:
def CalculateNaiveBayes(table, cts_table, features, ctsfeatures, queries):
    i = 0
    tp = 1
    fp = 1
    for f in features:
        if not np.where(table[i].uniqType == queries[f])[0]:
            tp = tp * 0.5
            fp = fp * 0.5
        else:
            idx = int(np.where(table[i].uniqType == queries[f])[0])
            tp = tp * list(table[i].true_p)[idx]
            fp = fp * list(table[i].false_p)[idx]
        i=i+1
    for f in ctsfeatures:
        if pd.isnull(queries[f]):
            tp = tp * 0.5
            fp = fp * 0.5
        else:
            tmean = cts_table[f]['tr_arr'][0]
            tstd = cts_table[f]['tr_arr'][1]
            fmean = cts_table[f]['fa_arr'][0]
            fstd = cts_table[f]['fa_arr'][1]
            tp = tp * np.exp(-(queries[f]-tmean)**2/(2*(tstd**2)))/np.sqrt(2*np.pi*(tstd**2))
            fp = fp * np.exp(-(queries[f]-fmean)**2/(2*(fstd**2)))/np.sqrt(2*np.pi*(fstd**2))
        
    return tp > fp

# Complete Example

In [7]:
data = pd.DataFrame.from_csv("test.csv")
data.head()

Unnamed: 0_level_0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,Community Area,Year,Latitude,Longitude,distance,Month,Day,Hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10404989,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,STREET,True,False,28.0,25.0,2016,41.888659,-87.748942,0.00168,2,5,11
1928012,MOTOR VEHICLE THEFT,"TRUCK, BUS, MOTOR HOME",OTHER COMMERCIAL TRANSPORTATION,False,False,,,2001,41.911495,-87.746059,0.009138,1,7,12
10404991,THEFT,$500 AND UNDER,OTHER,True,False,27.0,28.0,2016,41.877144,-87.647217,0.000968,2,5,12
10404994,NARCOTICS,POSS: HEROIN(BRN/TAN),STREET,True,False,45.0,12.0,2016,41.9781,-87.755168,0.003541,2,5,11
1928017,DECEPTIVE PRACTICE,FORGERY,OTHER,False,False,,,2001,41.922596,-87.697101,0.009915,12,8,12


In [8]:
theft = data[data['Primary Type']=="THEFT"]
theft.head()

Unnamed: 0_level_0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,Community Area,Year,Latitude,Longitude,distance,Month,Day,Hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10404991,THEFT,$500 AND UNDER,OTHER,True,False,27.0,28.0,2016,41.877144,-87.647217,0.000968,2,5,12
10405000,THEFT,$500 AND UNDER,SIDEWALK,False,False,3.0,35.0,2016,41.830751,-87.626644,0.001785,2,5,14
10405001,THEFT,RETAIL THEFT,SMALL RETAIL STORE,False,False,46.0,3.0,2016,41.963009,-87.65476,0.00608,2,5,14
10405007,THEFT,RETAIL THEFT,GAS STATION,True,False,9.0,49.0,2016,41.707095,-87.628834,0.00215,2,5,13
10405013,THEFT,FROM BUILDING,BANK,False,False,42.0,32.0,2016,41.88644,-87.632468,0.001041,2,5,13


In [9]:
#remove blank in column name
theft = theft.rename(columns={"Location Description": "LocationDescription"})
theft_test = theft[theft['Year']==2017]
theft = theft[theft['Year'] < 2017]

In [10]:
big_table = []

target = "Arrest"
features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']

# Use np.where(condition) to look for indice
for col_n in features:
    small_table = CreateTable(col_n, theft[col_n])
    tr_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], True), theft[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], False), theft[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    #small_table.true_p = 
    #small_table.false_p = 
    big_table.append(small_table)
    


In [11]:
cts_table ={}
ctsfeatures = ['distance','Latitude','Longitude']
for col_n in ctsfeatures:
    cts_table[col_n] = {}
    cts_table[col_n]['tr_arr'] = fillTableNorm(theft, theft[col_n], theft[target], True)
    cts_table[col_n]['fa_arr'] = fillTableNorm(theft, theft[col_n], theft[target], False)


In [14]:
answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, theft_test.loc[x]), theft_test.index))

In [15]:
print("AC: ",len(np.where(theft_test['Arrest']==answer)[0])/len(theft_test))
confusion_matrix(theft_test['Arrest'], answer,labels=[True, False])

AC:  0.8763594733829422


array([[ 1260,   219],
       [ 1509, 10988]])

In [16]:
print("AC: ",len(np.where(theft_test['Arrest']==False)[0])/len(theft_test))
confusion_matrix(theft_test['Arrest'], [False]*len(theft_test['Arrest']),labels=[True, False])

AC:  0.894175729822553


array([[    0,  1479],
       [    0, 12497]])

In [17]:
big_table = []

target = "Arrest"
features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']
# Use np.where(condition) to look for indice
for col_n in features:
    small_table = CreateTable(col_n, theft[col_n])
    tr_arr = list(map(lambda x: LaplacianSmoothing(x, theft[col_n], theft[target], True, k=5), theft[col_n].unique()))
    fa_arr = list(map(lambda x: LaplacianSmoothing(x, theft[col_n], theft[target], False, k=5), theft[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    big_table.append(small_table)
    
answer =  list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, theft_test.loc[x]), theft_test.index))

print("AC after smoothing: ",len(np.where(theft_test['Arrest']==answer)[0])/len(theft_test))
confusion_matrix(theft_test['Arrest'], answer,labels=[True, False])

AC after smoothing:  0.8760017172295363


array([[ 1260,   219],
       [ 1514, 10983]])

In [18]:
data = data.rename(columns={"Location Description": "LocationDescription"})
data_test = data[data['Year']==2017]
data1 = data[data['Year'] < 2017]
big_table = []

target = "Arrest"
features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']

# Use np.where(condition) to look for indice
for col_n in features:
    small_table = CreateTable(col_n, data1[col_n])
    tr_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], True), data1[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], False), data1[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    #small_table.true_p = 
    #small_table.false_p = 
    big_table.append(small_table)

cts_table ={}
ctsfeatures = ['distance','Latitude','Longitude']
for col_n in ctsfeatures:
    cts_table[col_n] = {}
    cts_table[col_n]['tr_arr'] = fillTableNorm(data1, data1[col_n], data1[target], True)
    cts_table[col_n]['fa_arr'] = fillTableNorm(data1, data1[col_n], data1[target], False)

In [41]:
answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, data_test.loc[x]), data_test.index))  

print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

print("AC: ",len(np.where(data_test['Arrest']==False)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], [False]*len(data_test['Arrest']),labels=[True, False]))

AC:  0.8429452658376579
[[ 7652  4078]
 [ 5546 44002]]
AC:  0.8085773034367962
[[    0 11730]
 [    0 49548]]


In [None]:
data = data.rename(columns={"Location Description": "LocationDescription"})
data_test = data[data['Year']==2017]
data1 = data[data['Year'] < 2017]
big_table = []

target = "Arrest"
features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']

# Use np.where(condition) to look for indice
for col_n in features:
    small_table = CreateTable(col_n, data1[col_n])
    tr_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], True), data1[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, data1[col_n], data1[target], False), data1[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)
    #small_table.true_p = 
    #small_table.false_p = 
    big_table.append(small_table)

In [42]:
cts_table ={}
ctsfeatures = ['distance']
for col_n in ctsfeatures:
    cts_table[col_n] = {}
    cts_table[col_n]['tr_arr'] = fillTableNorm(data1, data1[col_n], data1[target], True)
    cts_table[col_n]['fa_arr'] = fillTableNorm(data1, data1[col_n], data1[target], False)

In [43]:
answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, data_test.loc[x]), data_test.index))  

print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

print("AC: ",len(np.where(data_test['Arrest']==False)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], [False]*len(data_test['Arrest']),labels=[True, False]))

AC:  0.8440712816997944
[[ 7645  4085]
 [ 5470 44078]]
AC:  0.8085773034367962
[[    0 11730]
 [    0 49548]]


In [44]:
cts_table ={}
ctsfeatures = []
for col_n in ctsfeatures:
    cts_table[col_n] = {}
    cts_table[col_n]['tr_arr'] = fillTableNorm(data1, data1[col_n], data1[target], True)
    cts_table[col_n]['fa_arr'] = fillTableNorm(data1, data1[col_n], data1[target], False)

In [45]:
answer = list(map(lambda x: CalculateNaiveBayes(big_table, cts_table, features, ctsfeatures, data_test.loc[x]), data_test.index))  

print("AC: ",len(np.where(data_test['Arrest']==answer)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], answer,labels=[True, False]))

print("AC: ",len(np.where(data_test['Arrest']==False)[0])/len(data_test))
print(confusion_matrix(data_test['Arrest'], [False]*len(data_test['Arrest']),labels=[True, False]))

AC:  0.8453931264075198
[[ 7636  4094]
 [ 5380 44168]]
AC:  0.8085773034367962
[[    0 11730]
 [    0 49548]]
