In [1]:
import csv

# Path to the cleaned survey extract; the data-loading cell reuses this name.
file = 'cleaned2.csv'

# Print every column's position next to its name so the hard-coded column
# indices used when loading the rows can be checked by eye.
# newline='' is what the csv module asks for when it is handed a file object.
with open(file, 'r', newline='') as fi:
    reader = csv.reader(fi)
    headers = next(reader)  # the first row holds the column names
    for i, name in enumerate(headers):
        print(i, name)

0 HRHHID
1 HETENURE
2 HETELHHD
3 HEFAMINC
4 HRNUMHOU
5 GEREG
6 GEDIV
7 GTINDVPC
8 PRTAGE
9 PEMARITL
10 PESEX
11 PEAFEVER
12 PEEDUCA
13 PTDTRACE
14 PUCHINHH
15 PEHSPNON
16 PENATVTY
17 PRCITSHP
18 PUWK
19 PUBUS1
20 PUBUS2OT
21 PUDIS
22 PERET1
23 PRIOELG
24 PRCOW1
25 PRCOW2
26 PRCOWPG
27 PEERNPER
28 PEERNHRY
29 PEERNLAB
30 PEERNCOV
31 PRCHLD
32 PRNMCHLD
33 race-binary


In [2]:
# Column positions, as shown by the header listing printed above.
ID_COL = 0             # HRHHID
INCOME_COL = 3         # HEFAMINC, used as the label
RACE_COL = 13          # PTDTRACE
RACE_BINARY_COL = 33   # race-binary; '1' is routed to the white group
EXCLUDED_COLS = {ID_COL, INCOME_COL, RACE_COL, RACE_BINARY_COL}

# Feature rows and matching income labels, one pair of lists per group.
white = []
nonwhite = []
income_w = []
income_n = []

# newline='' is what the csv module asks for when it is handed a file object.
with open(file, 'r', newline='') as fi:
    reader = csv.reader(fi)
    headers = next(reader)  # consume the header row so only data rows remain

    for line in reader:
        # csv.reader yields an empty list for a blank line (e.g. a trailing
        # newline at end of file); there is nothing to index in that case.
        if not line:
            continue

        # remove id, income, race, and race_binary; everything else is a feature
        data = [int(value) for col, value in enumerate(line)
                if col not in EXCLUDED_COLS]

        # put in the correct racial category
        if line[RACE_BINARY_COL] == '1':
            white.append(data)
            income_w.append(int(line[INCOME_COL]))
        else:
            nonwhite.append(data)
            income_n.append(int(line[INCOME_COL]))

In [3]:
# Possible source of trouble: the white group has more than twice as many
# rows as the nonwhite group.
for group in (white, nonwhite):
    print(len(group))

104841
47354


I am going to randomly divide each dataset into a training set and an experimental (held-out) set by flipping a fair coin for every row.

In [4]:
import random

# Seed the RNG so the coin-flip split, and every number computed from it,
# is the same on each Restart & Run All.
SPLIT_SEED = 0
random.seed(SPLIT_SEED)


def _coin_flip_split(rows, labels):
    """Randomly route each (row, label) pair to a training or experimental set.

    One fair coin (``random.randint(0, 1)``) is flipped per row: 0 sends the
    pair to the training lists, 1 sends it to the experimental lists.

    Args:
        rows: sequence of feature rows.
        labels: sequence of labels, aligned with ``rows``.

    Returns:
        ``(train_rows, train_labels, exp_rows, exp_labels)`` as four lists.
    """
    train_rows, train_labels = [], []
    exp_rows, exp_labels = [], []
    for row, label in zip(rows, labels):
        # flip a coin to place in the training dataset
        if random.randint(0, 1) == 0:
            train_rows.append(row)
            train_labels.append(label)
        else:
            exp_rows.append(row)
            exp_labels.append(label)
    return train_rows, train_labels, exp_rows, exp_labels


# white people: training sets and experimental sets
training_w, training_iw, exp_w, exp_iw = _coin_flip_split(white, income_w)

# nonwhite people: training sets and experimental sets
training_n, training_in, exp_n, exp_in = _coin_flip_split(nonwhite, income_n)


Let's make a model for white people

In [10]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the white training split.
clf_w = GaussianNB()
features_w = np.array(training_w)
labels_w = np.array(training_iw)
clf_w.fit(features_w, labels_w)

# Predict on the experimental split and mark each prediction
# 1 (matches the true label) or 0 (does not).
res_w = clf_w.predict(exp_w)
accuracy_w = [int(predicted == actual) for predicted, actual in zip(res_w, exp_iw)]
print("accuracy rate: ", sum(accuracy_w) / len(accuracy_w))


accuracy rate:  0.6608596856655936


And one for nonwhite people

In [6]:
# Fit a Gaussian naive Bayes classifier on the nonwhite training split.
clf_n = GaussianNB()
features_n = np.array(training_n)
labels_n = np.array(training_in)
clf_n.fit(features_n, labels_n)

# Predict on the experimental split and mark each prediction
# 1 (matches the true label) or 0 (does not).
res_n = clf_n.predict(exp_n)
accuracy_n = [int(predicted == actual) for predicted, actual in zip(res_n, exp_in)]
print("accuracy rate: ", sum(accuracy_n) / len(accuracy_n))

accuracy rate:  0.7550078346673442


That's not very accurate! Let's use 10-fold cross-validation to see how much the accuracy varies across different train/test splits, and keep the split that scores best. Let's start with white people.

In [32]:
from sklearn.model_selection import KFold

def run_kfold(fields, labels, n_splits=10):
    """Score a Gaussian naive Bayes model on every fold and return the best split.

    For each fold produced by ``KFold(n_splits=n_splits)`` a fresh
    ``GaussianNB`` is fitted on the training part and scored on the test part.
    The accuracy of every fold is printed as it is computed.

    Args:
        fields: indexable sequence of feature rows.
        labels: indexable sequence of labels, aligned with ``fields``.
        n_splits: number of folds handed to ``KFold``. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        The ``(train_index, test_index)`` pair of the fold with the highest
        accuracy, or a pair of empty lists if no fold scored above zero.

    NOTE(review): the returned split is chosen *because* it scored best, so
    any accuracy later measured on it is an optimistic figure rather than an
    unbiased estimate.
    """
    kf = KFold(n_splits=n_splits)
    best = [], []
    best_accuracy = 0

    for train_index, test_index in kf.split(fields):
        train_fields = [fields[i] for i in train_index]
        train_labels = [labels[i] for i in train_index]
        test_fields = [fields[i] for i in test_index]
        test_labels = [labels[i] for i in test_index]

        clf = GaussianNB()
        clf.fit(train_fields, train_labels)

        res = clf.predict(test_fields).tolist()

        # fraction of test rows whose predicted label equals the true label
        hits = sum(1 for predicted, actual in zip(res, test_labels)
                   if predicted == actual)
        acc = hits / len(res)

        # strict '>' keeps the earliest fold when several tie for best
        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best

# Keep the train/test index arrays of the best-scoring fold for the white group.
# NOTE: this rebinds training_w, which the coin-flip split cell used for a
# list of feature rows; from here on it holds row indices into `white`.
training_w, testing_w = run_kfold(fields=white, labels=income_w)

accuracy rate:  0.6401525989508822
accuracy rate:  0.6266692102251049
accuracy rate:  0.6318199160625715
accuracy rate:  0.6787485692483785
accuracy rate:  0.6728347958794353
accuracy rate:  0.6633918351774132
accuracy rate:  0.6809423884013736
accuracy rate:  0.6805608546356352
accuracy rate:  0.6398321251430752
accuracy rate:  0.6716901945822206


And now for nonwhite people

In [26]:
# Keep the train/test index arrays of the best-scoring fold for the nonwhite group.
# NOTE: this rebinds training_n, which the coin-flip split cell used for a
# list of feature rows; from here on it holds row indices into `nonwhite`.
training_n, testing_n = run_kfold(fields=nonwhite, labels=income_n)

accuracy rate:  0.7204391891891891
accuracy rate:  0.7584459459459459
accuracy rate:  0.7132601351351351
accuracy rate:  0.7362753378378378
accuracy rate:  0.7387539598732841
accuracy rate:  0.7662090813093981
accuracy rate:  0.7531151003167899
accuracy rate:  0.7607180570221753
accuracy rate:  0.7465681098204857
accuracy rate:  0.8259767687434002


Let's examine the false positive and false negative rates on the best fold, starting with white people:

In [41]:
# Rebuild the best white fold from the index arrays returned by run_kfold.
train_fields = [white[idx] for idx in training_w]
train_labels = [income_w[idx] for idx in training_w]
test_fields = [white[idx] for idx in testing_w]
test_labels = [income_w[idx] for idx in testing_w]

clf = GaussianNB()
clf.fit(train_fields, train_labels)

res = clf.predict(test_fields).tolist()

# Code every prediction: 1 = false positive (predicted 1, actual 0),
# -1 = false negative (predicted 0, actual 1), 0 = anything else.
accuracy = []
for predicted, actual in zip(res, test_labels):
    if (predicted, actual) == (1, 0):
        accuracy.append(1)
    elif (predicted, actual) == (0, 1):
        accuracy.append(-1)
    else:
        accuracy.append(0)

# Both rates are fractions of ALL test rows (not of the actual negatives or
# positives), which is why 1 - (fp + fn) is reported as the accuracy.
fp = accuracy.count(1) / len(accuracy)
fn = accuracy.count(-1) / len(accuracy)
print("false positive rate: %4f" % fp)
print("false negative rate: %4f" % fn)
print("accuracy: %4f" % (1-(fp + fn)))

false positive rate: 0.148989
false negative rate: 0.170069
accuracy: 0.680942


And now for nonwhite people

In [None]:
# Rebuild the best nonwhite fold from the index arrays returned by run_kfold.
train_fields = [nonwhite[idx] for idx in training_n]
train_labels = [income_n[idx] for idx in training_n]
test_fields = [nonwhite[idx] for idx in testing_n]
test_labels = [income_n[idx] for idx in testing_n]

clf = GaussianNB()
clf.fit(train_fields, train_labels)

res = clf.predict(test_fields).tolist()

# Code every prediction: 1 = false positive (predicted 1, actual 0),
# -1 = false negative (predicted 0, actual 1), 0 = anything else.
accuracy = []
for predicted, actual in zip(res, test_labels):
    if (predicted, actual) == (1, 0):
        accuracy.append(1)
    elif (predicted, actual) == (0, 1):
        accuracy.append(-1)
    else:
        accuracy.append(0)

# Both rates are fractions of ALL test rows (not of the actual negatives or
# positives), which is why 1 - (fp + fn) is reported as the accuracy.
fp = accuracy.count(1) / len(accuracy)
fn = accuracy.count(-1) / len(accuracy)
print("false positive rate: %4f" % fp)
print("false negative rate: %4f" % fn)
print("accuracy: %4f" % (1-(fp + fn)))