In [1]:
import numpy as np
import pandas as pd
from collections import namedtuple

### CreateTable(col_n, series_in)

    col_n: column name(selected feature)
    series_in: pandas series
    
    Returns f_table: the naive Bayes table (a namedtuple) for the selected feature, with both probability arrays initialised to zero

#### Example:
    small_table = CreateTable("Description", theft["Description"])

In [2]:
def CreateTable(col_n, series_in):
    """Build an empty per-feature probability table.

    Parameters
    ----------
    col_n : str
        Feature name; also used as the type name of the returned namedtuple.
    series_in : pandas.Series
        Column holding the feature values.

    Returns
    -------
    namedtuple
        Fields ``uniqType`` (distinct values of ``series_in``), ``true_p`` and
        ``false_p`` (zero-filled arrays, one slot per distinct value).
    """
    table_type = namedtuple(col_n, ["uniqType", "true_p", "false_p"])
    categories = series_in.unique()
    n_categories = len(categories)

    # Two separate zero arrays so the fields never share storage.
    return table_type(
        np.asarray(categories),
        np.zeros(n_categories),
        np.zeros(n_categories),
    )

### fillTable(i_type, series_in, series_t, truefalse=True)

    i_type: a category in a feature
    series_in: pandas series
    series_t: target pandas series(in our case it's "Arrest")
    truefalse: True or False.
    
    Returns: the conditional probability P(feature == i_type | target == truefalse) for a single category, i.e. (rows matching both) / (rows matching the target value)
    
#### Example:
    col_n = "Description"
    tr_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], True), theft[col_n].unique()))
    fa_arr = list(map(lambda x: fillTable(x, theft[col_n], theft[target], False), theft[col_n].unique()))
    small_table=small_table._replace(true_p=tr_arr)
    small_table=small_table._replace(false_p=fa_arr)

In [3]:
def fillTable(i_type, series_in, series_t, truefalse=True):
    """Estimate P(feature == i_type | target == truefalse) from two columns.

    Parameters
    ----------
    i_type : scalar
        The category value of the feature to count.
    series_in : pandas.Series
        Feature column.
    series_t : pandas.Series
        Target column; rows are matched with ``series_in`` by position.
    truefalse : scalar, default True
        The target value to condition on.

    Returns
    -------
    float
        (# rows where both conditions hold) / (# rows where the target equals
        ``truefalse``). Returns 0.0 when no row has that target value, instead
        of raising ZeroDivisionError.
    """
    # Positional boolean masks; np.asarray drops any pandas index so the two
    # columns are combined row-by-row.
    in_mask = np.asarray(series_in == i_type)
    target_mask = np.asarray(series_t == truefalse)

    denominator = int(np.count_nonzero(target_mask))
    if denominator == 0:
        # The conditioning class never occurs, so there is nothing to estimate.
        return 0.0

    numerator = int(np.count_nonzero(in_mask & target_mask))
    return numerator / denominator

### CalculateNaiveBayes(table, features, queries)
    
    table: a list of small tables, one per feature
    features: an array of features (the order of the features should be the same as the table) 
    queries: a single row of the dataframe (the record to classify)

#### Example
    features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']
    answer = list(map(lambda x: CalculateNaiveBayes(big_table, features, theft_test.loc[x]), theft_test.index))

In [4]:
def CalculateNaiveBayes(table, features, queries):
    """Classify one row by comparing products of per-feature probabilities.

    Parameters
    ----------
    table : sequence
        One entry per feature, in the same order as ``features``. Each entry
        exposes ``uniqType`` (category values) plus ``true_p`` and ``false_p``
        (per-category probabilities aligned with ``uniqType``).
    features : iterable of str
        Keys used to read the feature values out of ``queries``.
    queries : mapping-like
        A single row; ``queries[f]`` must return the value of feature ``f``.

    Returns
    -------
    bool
        True when the product of the matching ``true_p`` entries is strictly
        greater than the product of the matching ``false_p`` entries.

    Notes
    -----
    * Only the probabilities stored in ``table`` are multiplied; no class
      prior is applied inside this function.
    * A query value that is absent from ``uniqType`` (a category not present
      in the table, or NaN, which never compares equal) is skipped, i.e. it
      contributes a factor of 1 to both products rather than raising. If no
      feature matches at all the result is False (1 > 1).
    """
    tp = 1
    fp = 1
    for i, f in enumerate(features):
        feature_table = table[i]
        # flatnonzero copes with an empty match set, unlike int(np.where(...)[0]).
        matches = np.flatnonzero(np.asarray(feature_table.uniqType) == queries[f])
        if matches.size == 0:
            continue
        idx = int(matches[0])
        tp = tp * feature_table.true_p[idx]
        fp = fp * feature_table.false_p[idx]

    return tp > fp

# Complete Example

In [5]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first CSV column (the record ID) as the
# index, which is what from_csv did by default.
data = pd.read_csv("Modified_Crimes.csv", index_col=0)
data.head()

Unnamed: 0_level_0,Date,Primary Type,Description,Location Description,Arrest,Domestic,Ward,Year,Hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10504504,04/30/2016 12:00:00 AM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,6.0,2016,0
10504512,04/30/2016 12:00:00 AM,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,SIDEWALK,True,False,29.0,2016,0
10504523,04/30/2016 01:05:00 AM,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,True,6.0,2016,1
10504536,04/30/2016 12:31:00 AM,ROBBERY,ARMED: HANDGUN,RESTAURANT,False,False,8.0,2016,0
10504538,04/30/2016 12:14:00 AM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,True,35.0,2016,0


In [6]:
# Keep only the rows whose primary crime type is THEFT.
is_theft = data['Primary Type'] == "THEFT"
theft = data[is_theft]
theft.head()

Unnamed: 0_level_0,Date,Primary Type,Description,Location Description,Arrest,Domestic,Ward,Year,Hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10504543,04/30/2016 12:30:00 AM,THEFT,OVER $500,CTA BUS,False,False,4.0,2016,0
10505337,04/30/2016 10:45:00 PM,THEFT,$500 AND UNDER,RESIDENCE PORCH/HALLWAY,False,True,7.0,2016,22
10505351,04/30/2016 11:30:00 PM,THEFT,$500 AND UNDER,STREET,False,False,27.0,2016,23
10477562,03/31/2016 11:40:00 AM,THEFT,$500 AND UNDER,PARKING LOT/GARAGE(NON.RESID.),True,False,32.0,2016,11
10504520,04/29/2016 09:00:00 PM,THEFT,POCKET-PICKING,RESTAURANT,False,False,2.0,2016,21


In [7]:
# Derive the renamed THEFT frame from `data` instead of from `theft`: this cell
# rebinds `theft` to the training years, so reading `theft` here would leave
# theft_test empty whenever the cell is run a second time.
theft_all = (
    data[data['Primary Type'] == "THEFT"]
    # remove blank in column name
    .rename(columns={"Location Description": "LocationDescription"})
)
theft_test = theft_all[theft_all['Year'] == 2017]  # held-out rows
theft = theft_all[theft_all['Year'] < 2017]        # training rows

In [8]:
target = "Arrest"
features = ['LocationDescription', 'Description', 'Domestic', 'Hour', 'Ward']

# One table per feature, appended in the same order as `features`.
big_table = []
for col_n in features:
    column = theft[col_n]
    labels = theft[target]
    categories = column.unique()

    small_table = CreateTable(col_n, column)
    # Per-category probabilities conditioned on Arrest == True / False,
    # listed in the same order as the categories returned by unique().
    tr_arr = [fillTable(category, column, labels, True) for category in categories]
    fa_arr = [fillTable(category, column, labels, False) for category in categories]
    small_table = small_table._replace(true_p=tr_arr, false_p=fa_arr)
    big_table.append(small_table)

In [9]:
# Predict Arrest for every held-out row, one index label at a time.
answer = [
    CalculateNaiveBayes(big_table, features, theft_test.loc[row_id])
    for row_id in theft_test.index
]

In [10]:
# Accuracy: share of held-out rows whose predicted Arrest equals the recorded one.
n_correct = np.count_nonzero(theft_test['Arrest'] == answer)
n_correct / len(theft_test)

0.8752560998323711