# Naive Bayes on Race

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn
import pandas as pd
from sklearn import preprocessing
%matplotlib inline

In [2]:
def getClassProbability(labels, classId,nClasses):
    return sum([1 for i in labels if i == classId]) / len(labels)

def getNBEstimate(data, labels, classLabel, attId, attValue, nValues):
    indices, = np.where(labels == classLabel)
    subset = data[indices,attId]
    return sum([1 for i in subset if i == attValue]) / len(subset)

def estimateNBModel(data, labels, nAttributes, attributeRanges, nClasses):
    ## Naive Bayes Model consists of two types of estimators
    ## First, we estimate the probability of seeing an object from a specific class

    classProbabilities = [getClassProbability(labels,l, nClasses) for l in range(nClasses)]
    #print(classProbabilities, nClasses)
    
    ## now we estimate the probabilities of seeing a specific value of a specific attribute in
    ## a data point from a given class
    
    ## for each class create the appropriate estimates
    model = []                  # model is the list of estimates for all classes
    for i in range(nClasses):   # for each class
        ## for each attribute
        classDistr = []         # classDistr is the collection of estimates for one class
        for j in range(nAttributes):
            estimates = []                    # estimates is a distribution of estimates for a single attribute
            for k in range(attributeRanges[j]):
                est = getNBEstimate(data, labels, i,j,k, attributeRanges[j])

                estimates.append(est)
            classDistr.append(estimates)
        model.append(classDistr)
    
    return classProbabilities, model
    

def predictNBClass(point, classProb, classModel):
    ans = 1
    for i in range(len(classModel)):
        ans *= classModel[i][int(point[i])]
    return ans * classProb

def predictNB(point, classProbabilities, model,nClasses):
    
    predictions= np.array([predictNBClass(point, classProbabilities[i],model[i]) for i in range(nClasses)])
    
    predictedClass = np.argmax(predictions)
    return predictedClass

def predictNBLabels(data, classProbabilities, model, nClasses):
    predicted = [predictNB(point, classProbabilities, model, nClasses) for point in data]
    return predicted

# Load data

In [3]:
bad = ["Unnamed: 0", "id", "name", "Ethnicity", "date", "city"]
path = "C:\\Users\\Sam\\Documents\\cal poly\\3\\spring\\cs466\\project\\KDD_Group_Project\\datasets\\fatal.csv"
rawData = pd.read_csv(path)
rawData = rawData[[x for x in rawData.columns if x not in bad]]
rawData = rawData.dropna()
data = rawData[[x for x in rawData if x != "race"]].values
for i in range(data.shape[1]):
    le = preprocessing.LabelEncoder()
    le.fit(data[:,i])
    data[:,i] = le.transform(data[:,i]) 
nAttributes = data.shape[1] - 1
labels = rawData["race"].values
le = preprocessing.LabelEncoder()
le.fit(labels)
labels = le.transform(labels)

# Train Model

In [4]:
### Let us compute how many unique values each attribute has.
### all attributes have values 0,1,.., k-1 where k is the number of unique values for that attribute.

attributeRanges= [np.unique(data[:,i]).shape[0] for i in range(nAttributes)]

## number of classes

nClasses = np.unique(labels).shape[0]

In [5]:
d, m = estimateNBModel(data ,labels, nAttributes, attributeRanges, nClasses)

In [6]:
predicted = predictNBLabels(data, d, m, nClasses)

80 - 20 split

In [7]:
np.random.seed(0)
indices = list(range(len(data)))
np.random.shuffle(indices)
diff = np.array(list(set(list(range(len(data)))).difference(indices[:int(len(data) * 0.8)])))
X_train, y_train, X_test, y_test = data[indices,:], labels[indices], data[diff,:], labels[diff]

In [8]:
attributeRanges= [np.unique(X_train[:,i]).shape[0] for i in range(nAttributes)]
nClasses = np.unique(y_train).shape[0]
d, m = estimateNBModel(X_train, y_train, nAttributes, attributeRanges, nClasses)

In [9]:
predicted_train = predictNBLabels(X_train, d, m, nClasses) 
predicted_test = predictNBLabels(X_test, d, m, nClasses)

Overall accuracy

In [10]:
print("Accuracy on train")
print(sum([1 for i in range(len(y_train)) if y_train[i] == predicted_train[i]]) / len(y_train))
print()
print("Accuracy on test")
print(sum([1 for i in range(len(y_test)) if y_test[i] == predicted_test[i]]) / len(y_test))

Accuracy on train
0.6399735274652548

Accuracy on test
0.6760330578512397
