# 2-NB Models

In [1]:
import csv

file = 'cleaned_acs_2.csv'

# Show every column name next to its position so that later cells can
# address columns by index.
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

for i, column_name in enumerate(headers):
    print(i, column_name)

0 ST
1 AGEP
2 CIT
3 COW
4 DDRS
5 DEAR
6 DEYE
7 DOUT
8 DREM
9 ENG
10 FER
11 HINS1
12 HINS2
13 HINS3
14 HINS4
15 HINS5
16 HINS6
17 HINS7
18 JWMNP
19 JWTR
20 LANX
21 MAR
22 MARHD
23 MARHYP
24 MIG
25 PAP
26 SCH
27 SCHL
28 SEX
29 ESP
30 FOD1P
31 INDP
32 LANP
33 NATIVITY
34 NOP
35 PINCP
36 POBP
37 POWPUMA
38 RAC1P
39 race-binary


## Prep: Reading the data in

In [2]:
race = 38
race_b = 39
income = 35
sex = 28
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    white = []
    nonwhite = []
    income_w = []
    income_n = []

    for line in reader:

        # Feature row: every column except income, race, and race-binary,
        # parsed as int, with -1 replaced by a large placeholder value.
        data = []
        for i, cell in enumerate(line):
            if i in (race, race_b, income):
                continue
            value = int(cell)
            data.append(1000000000 if value == -1 else value)

        # File the row, together with its income label, under its
        # race-binary group (1 -> white, anything else -> nonwhite).
        if int(line[race_b]) == 1:
            group_rows, group_income = white, income_w
        else:
            group_rows, group_income = nonwhite, income_n
        group_rows.append(data)
        group_income.append(int(line[income]))


In [3]:
# something that might cause problems - there's WAY more data for white people
# (number of rows per group: white first, then nonwhite)
print(len(white))
print(len(nonwhite))

1005027
320470


## Making the Models

I am going to make the models using 10-fold cross-validation to try to improve the accuracy, keeping the train/test split of the best-scoring fold. Let's start with white people.

In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import KFold

# returns indexes into fields and labels
def run_kfold(fields, labels):
    """Run 10-fold cross-validation and return the best-scoring split.

    A Gaussian naive Bayes classifier is fit on each training fold and scored
    on the matching held-out fold; every fold's accuracy is printed.

    Args:
        fields: sequence of feature rows (one list of numbers per entry).
        labels: sequence of class labels aligned with ``fields``.

    Returns:
        ``(train_index, test_index)`` of the fold with the highest accuracy.
        Both index into ``fields`` and ``labels``. If no fold scores above
        zero, a pair of empty lists is returned.
    """
    kf = KFold(n_splits=10)
    best = [], []
    best_accuracy = 0

    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = [fields[i] for i in train_index]
        train_labels = [labels[i] for i in train_index]
        test_fields = [fields[i] for i in test_index]
        test_labels = [labels[i] for i in test_index]

        # Rank folds with GaussianNB because run_model() refits the chosen
        # split with GaussianNB; ranking with a different classifier would
        # pick a "best" fold for a model that is never evaluated.
        # NOTE(review): per the scikit-learn docs CategoricalNB expects each
        # feature encoded as small non-negative category indices, which the
        # 1000000000 placeholder written in place of -1 is not - confirm
        # before switching back.
        clf = GaussianNB()
        clf.fit(train_fields, train_labels)

        res = clf.predict(test_fields).tolist()

        # fraction of held-out entries whose prediction matches the label
        hits = sum(1 for predicted, actual in zip(res, test_labels) if predicted == actual)
        acc = hits/len(res)

        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best

# best-scoring fold for the white group: index lists into white / income_w
training_w, testing_w = run_kfold(white, income_w)

And now for nonwhite people

In [34]:
# best-scoring fold for the nonwhite group: index lists into nonwhite / income_n
training_n, testing_n = run_kfold(nonwhite, income_n)

accuracy rate:  0.6214310231846975
accuracy rate:  0.6186538521546479
accuracy rate:  0.6185914438168939
accuracy rate:  0.623396885823946
accuracy rate:  0.6108216057665304
accuracy rate:  0.5383031172964708
accuracy rate:  0.5632040440602865
accuracy rate:  0.6254875651387025
accuracy rate:  0.6396854619777202
accuracy rate:  0.6535713171279683


## Accuracy

### Differential Accuracy by Race Binary

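Let's examine the false positive and false negative rates for each group in turn. Both rates are reported as a fraction of all test entries (not of the actual negatives or positives). **White people**: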

In [35]:
#training and testing index into fields and labels 
#so max(training) < len(fields) 
def run_model(training, testing, fields, labels):
    """Fit GaussianNB on one split and report its error breakdown.

    Args:
        training: indexes into ``fields``/``labels`` used for fitting.
        testing: indexes into ``fields``/``labels`` used for evaluation.
        fields: sequence of feature rows.
        labels: sequence of class labels aligned with ``fields``.

    Returns:
        ``(res, acc, fp, fn)`` where ``res`` is the list of predictions for
        the test entries and the other three are fractions of all test
        entries: correct or otherwise unflagged (``acc``), predicted 1 with
        label 0 (``fp``), and predicted 0 with label 1 (``fn``).
    """
    train_fields = [fields[i] for i in training]
    train_labels = [labels[i] for i in training]
    test_fields = [fields[i] for i in testing]
    test_labels = [labels[i] for i in testing]

    clf = GaussianNB()
    clf.fit(train_fields, train_labels)

    res = clf.predict(test_fields).tolist()

    # One pass over the predictions, counting the two kinds of error.
    false_pos = 0
    false_neg = 0
    for predicted, actual in zip(res, test_labels):
        if predicted == 1 and actual == 0:
            false_pos += 1
        elif predicted == 0 and actual == 1:
            false_neg += 1

    n_tested = len(res)
    fp = false_pos/n_tested
    fn = false_neg/n_tested
    # everything that is neither a false positive nor a false negative
    acc = (n_tested - false_pos - false_neg)/n_tested

    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn

# evaluate the selected white-group split; keep its predictions and rates
print("Results of running the model for white people:")
res_w, acc_w, fp_w, fn_w = run_model(training_w, testing_w, white, income_w)


Results of running the model for white people:
false positive rate: 0.279285
false negative rate: 0.046267
accuracy: 0.674448


And now for **nonwhite people:**

In [36]:
# evaluate the selected nonwhite-group split; keep its predictions and rates
print("Results of running the model for nonwhite people:")
res_n, acc_n, fp_n, fn_n = run_model(training_n, testing_n, nonwhite, income_n)

Results of running the model for nonwhite people:
false positive rate: 0.330795
false negative rate: 0.015633
accuracy: 0.653571


**Overall Accuracy**

In [37]:
# Pool the two groups' rates, weighting each by the size of its test fold.
n_test_n, n_test_w = len(res_n), len(res_w)
acc = (acc_n*n_test_n + acc_w*n_test_w)/(n_test_n + n_test_w)
fp = (fp_n*n_test_n + fp_w*n_test_w)/(n_test_n + n_test_w)
fn = (fn_n*n_test_n + fn_w*n_test_w)/(n_test_n + n_test_w)
for caption, value in (("false positive rate: %4f", fp),
                       ("false negative rate: %4f", fn),
                       ("accuracy: %4f", acc)):
    print(caption % value)

false positive rate: 0.291739
false negative rate: 0.038861
accuracy: 0.669400


### Differential Accuracy by Sex

Let's examine differential accuracy for different subgroups. Let's initially break it down by sex.

In [38]:
def calc_accuracy(res, labels):
    """Score each prediction against its label.

    Returns a list with one code per entry of ``res``: 1 for a false
    positive (predicted 1, label 0), -1 for a false negative (predicted 0,
    label 1), and 0 for every other combination.
    """
    def outcome(predicted, actual):
        if predicted == 1 and actual == 0:
            return 1
        if predicted == 0 and actual == 1:
            return -1
        return 0

    return [outcome(predicted, labels[idx]) for idx, predicted in enumerate(res)]


def _print_sex_group(name, predicted, actual):
    """Print false positive / false negative / accuracy rates for one sex group."""
    print("Accuracy for %s:" % name)
    if not predicted:
        # an empty group would otherwise divide by zero below
        print("    no entries in this group")
        return
    accuracy = calc_accuracy(predicted, actual)
    fp = accuracy.count(1)/len(accuracy)
    fn = accuracy.count(-1)/len(accuracy)
    print("    false positive rate: %4f" % fp)
    print("    false negative rate: %4f" % fn)
    print("    accuracy: %4f" % (1-(fp + fn)))


def sex_accuracy(testing, fields, labels, res):
    """Print the error rates of ``res`` separately for male and female entries.

    Args:
        testing: indexes into ``fields``/``labels`` of the evaluated entries.
        fields: sequence of feature rows; position ``sex`` of a row holds the
            sex code (1 is reported as male, 2 as female, anything else is
            left out of both groups).
        labels: sequence of class labels aligned with ``fields``.
        res: predictions, aligned with ``testing``.
    """
    test_labels = [labels[i] for i in testing]

    # positions within res / test_labels of each sex group
    male = []
    female = []
    for i in range(len(testing)):
        code = fields[testing[i]][sex]
        if code == 1:
            male.append(i)
        elif code == 2:
            female.append(i)

    _print_sex_group("male", [res[i] for i in male], [test_labels[i] for i in male])
    _print_sex_group("female", [res[i] for i in female], [test_labels[i] for i in female])
    

**Accuracy broken down by sex for nonwhite people**

In [39]:
# per-sex breakdown of the nonwhite-group predictions from run_model
sex_accuracy(testing_n, nonwhite, income_n, res_n)

Accuracy for male:
    false positive rate: 0.310132
    false negative rate: 0.019408
    accuracy: 0.670461
Accuracy for female:
    false positive rate: 0.349439
    false negative rate: 0.012228
    accuracy: 0.638333


**Accuracy broken down by sex for white people**

In [40]:
# per-sex breakdown of the white-group predictions from run_model
sex_accuracy(testing_w, white, income_w, res_w)

Accuracy for male:
    false positive rate: 0.234184
    false negative rate: 0.057163
    accuracy: 0.708652
Accuracy for female:
    false positive rate: 0.322796
    false negative rate: 0.035755
    accuracy: 0.641448


### Differential Accuracy By Race Categorical

This is a little trickier. I first need some way to look up the race of each entry. Since I've removed unique identifiers, I'm going to have to go back and collect them. Let's make a list of just the racial categories.

In [63]:
# Collect the RAC1P race code of every nonwhite row. The rows are read in
# file order with the same race-binary test used to fill `nonwhite`, so
# race_list[i] is the race code of nonwhite[i].
race_list = []
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    race_list.extend(int(line[race]) for line in reader if int(line[race_b]) != 1)
print(race_list.count(4))

1468


**Accuracy by specific racial group for nonwhite people**

In [61]:
import pandas as pd
def race_accuracy(testing, labels, res):
    """Print and tabulate the error rates of ``res`` per race code.

    Race codes come from the module-level ``race_list``, indexed by the
    entries of ``testing``.

    Args:
        testing: indexes of the evaluated entries (into ``labels`` and
            ``race_list``).
        labels: sequence of class labels.
        res: predictions, aligned with ``testing``.

    Returns:
        A DataFrame with one row per race code that has at least one test
        entry, holding the columns race, false positive, false negative,
        accuracy and count.
    """
    test_labels = [labels[i] for i in testing]
    data = {"race":[], "false positive":[], "false negative":[], "accuracy":[], 'count':[]}
    n_codes = max(race_list)+1

    # indices within res of the test entries belonging to each race code
    positions = {}
    for pos, entry in enumerate(testing):
        positions.setdefault(race_list[entry], []).append(pos)

    for r in range(n_codes):
        current = positions.get(r, [])
        if not current:
            print("No results for race %d" % r)
            continue

        # calculate accuracy
        accuracy = calc_accuracy([res[i] for i in current], [test_labels[i] for i in current])
        fp = accuracy.count(1)/len(accuracy)
        fn = accuracy.count(-1)/len(accuracy)
        print("Results for race %d:" % r)
        print("    %d ENTRIES TOTAL" % len(current))
        print("    false positive rate: %4f" % fp)
        print("    false negative rate: %4f" % fn)
        print("    accuracy: %4f" % (1-(fp + fn)))
        for column, value in (("race", r),
                              ("false positive", fp),
                              ("false negative", fn),
                              ("accuracy", (1-(fp + fn))),
                              ("count", len(current))):
            data[column].append(value)

    return pd.DataFrame(data)

# Per-race error rates for the nonwhite 2-NB model, saved to 2nb.csv.
df = race_accuracy(testing_n, income_n, res_n)
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it some platforms emit an extra blank line after every row.
with open("2nb.csv", "w", newline='') as fo:
    writer = csv.writer(fo)
    # the header must name every column written per row, including count
    writer.writerow(['race', 'false positive', 'false negative', 'accuracy', 'count'])
    for index, row in df.iterrows():
        writer.writerow([row['race'], row['false positive'], row['false negative'], row['accuracy'], row['count']])

No results for race 0
No results for race 1
Results for race 2:
    19731 ENTRIES TOTAL
    false positive rate: 0.335715
    false negative rate: 0.013633
    accuracy: 0.650651
Results for race 3:
    1002 ENTRIES TOTAL
    false positive rate: 0.250499
    false negative rate: 0.044910
    accuracy: 0.704591
Results for race 4:
    8 ENTRIES TOTAL
    false positive rate: 0.250000
    false negative rate: 0.000000
    accuracy: 0.750000
Results for race 5:
    212 ENTRIES TOTAL
    false positive rate: 0.259434
    false negative rate: 0.009434
    accuracy: 0.731132
Results for race 6:
    5646 ENTRIES TOTAL
    false positive rate: 0.360432
    false negative rate: 0.014346
    accuracy: 0.625221
Results for race 7:
    47 ENTRIES TOTAL
    false positive rate: 0.382979
    false negative rate: 0.000000
    accuracy: 0.617021
Results for race 8:
    2424 ENTRIES TOTAL
    false positive rate: 0.312706
    false negative rate: 0.016914
    accuracy: 0.670380
Results for race 9:
   

In [65]:
# size of the held-out fold selected for the white group
print(len(testing_w))

100503


In [64]:
# number of test entries per race code, as collected by race_accuracy
print(df['count'])

0    19731
1     1002
2        8
3      212
4     5646
5       47
6     2424
7     2977
Name: count, dtype: int64


# n-NB Models

In [46]:
# race_fields[r] / race_labels[r]: feature rows / income labels of the
# nonwhite entries whose race code is r
race_fields = [[] for _ in range(max(race_list)+1)]
race_labels = [[] for _ in range(max(race_list)+1)]

# hand every nonwhite row (and its label) to the bucket of its race code
for row_idx, row in enumerate(nonwhite):
    code = race_list[row_idx]
    race_fields[code].append(row)
    race_labels[code].append(income_n[row_idx])

# bucket sizes, in race-code order
for race_i in race_fields:
    print(len(race_i))

0
0
141760
8818
1468
1690
80366
2777
53477
30114


In [59]:
def run_race(race, fields, labels):
    """Select, evaluate and break down by sex the model for one race group.

    Runs run_kfold to pick a split, run_model on that split, then
    sex_accuracy on the resulting predictions, printing a banner before
    each stage.

    Args:
        race: race code of the group (not used by the body).
        fields: feature rows of the group.
        labels: class labels aligned with ``fields``.

    Returns:
        The ``(res, acc, fp, fn)`` tuple produced by run_model.
    """
    def banner(title, leading_blank=True):
        # section title, underline, blank line
        if leading_blank:
            print()
        print(title)
        print("-----------------------")
        print()

    banner("TRAINING", leading_blank=False)
    training, testing = run_kfold(fields, labels)

    banner("RUNNING THE BEST MODEL")
    res, acc, fp, fn = run_model(training, testing, fields, labels)

    banner("ACCURACY BY SEX")
    sex_accuracy(testing, fields, labels, res)

    return res, acc, fp, fn

In [60]:
# Train and evaluate one model per race group, write each group's rates to
# nNB.csv, and pool the rates weighted by the number of test entries.
fp_overall = 0
fn_overall = 0
accuracy_overall = 0
total = 0
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it some platforms emit an extra blank line after every row.
with open("nNB.csv", "w", newline='') as fo:
    writer = csv.writer(fo)
    writer.writerow(['race', 'false positive', 'false negative', 'accuracy'])

    for i in range(len(race_fields)):
        # groups with 350 or fewer entries are skipped
        if len(race_fields[i]) > 350:
            print()
            print("RACE %d " % i)
            print()
            res, acc, fp, fn = run_race(i, race_fields[i], race_labels[i])
            writer.writerow([i, fp, fn, acc])
            fp_overall += fp*len(res)
            fn_overall += fn*len(res)
            accuracy_overall += acc*len(res)
            total += len(res)
print()
if total:
    print("false positive rate: %4f" % (fp_overall/total))
    print("false negative rate: %4f" % (fn_overall/total))
    print("accuracy: %4f" % (accuracy_overall/total))
else:
    # no group passed the size threshold, so there is nothing to average
    print("no race group had more than 350 entries; nothing to aggregate")


RACE 2 

TRAINING
-----------------------

accuracy rate:  0.6093397291196389
accuracy rate:  0.6676777652370203
accuracy rate:  0.6187923250564334
accuracy rate:  0.5359762979683973
accuracy rate:  0.5759734762979684
accuracy rate:  0.6173814898419865
accuracy rate:  0.6563910835214447
accuracy rate:  0.6233069977426636
accuracy rate:  0.661117381489842
accuracy rate:  0.638191309255079

RUNNING THE BEST MODEL
-----------------------

false positive rate: 0.313699
false negative rate: 0.018623
accuracy: 0.667678
0.6676777652370203 0.3136992099322799 0.018623024830699775

ACCURACY BY SEX
-----------------------

Accuracy for male:
    false positive rate: 0.299844
    false negative rate: 0.020576
    accuracy: 0.679580
Accuracy for female:
    false positive rate: 0.327395
    false negative rate: 0.016692
    accuracy: 0.655912

RACE 3 

TRAINING
-----------------------

accuracy rate:  0.6224489795918368
accuracy rate:  0.6893424036281179
accuracy rate:  0.6961451247165533
accuracy