# 2-NB Models

In [7]:
import csv

file = 'cleaned_acs_2.csv'

# List every column name next to its position so that later cells can refer
# to columns by index.
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)
    for position, column in enumerate(headers):
        print(position, column)

0 ST
1 AGEP
2 CIT
3 COW
4 DDRS
5 DEAR
6 DEYE
7 DOUT
8 DREM
9 ENG
10 FER
11 HINS1
12 HINS2
13 HINS3
14 HINS4
15 HINS5
16 HINS6
17 HINS7
18 JWMNP
19 JWTR
20 LANX
21 MAR
22 MARHD
23 MARHYP
24 MIG
25 PAP
26 SCH
27 SCHL
28 SEX
29 ESP
30 FOD1P
31 INDP
32 LANP
33 NATIVITY
34 NOP
35 PINCP
36 POBP
37 POWPUMA
38 RAC1P
39 race-binary


## Prep: Reading the data in

In [8]:
# Column positions of the fields that must not be used as model features.
# NOTE(review): the header listing printed earlier in this notebook shows
# POWPUMA at 37, RAC1P at 38 and race-binary at 39, so `race` and
# `race_binary` may be off by one for the current file -- confirm.
race = 37
race_binary = 38
income = 35

with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    # Feature rows and income labels, split by the value in column `race_binary`.
    white = []
    nonwhite = []
    income_w = []
    income_n = []
    counts = [0] * 38

    for line in reader:

        # keep every column except income, race, and race_binary
        features = [
            int(value)
            for position, value in enumerate(line)
            if position not in (race, race_binary, income)
        ]

        # a value of 1 in the race_binary column sends the row to the white group
        if int(line[race_binary]) == 1:
            features_out, labels_out = white, income_w
        else:
            features_out, labels_out = nonwhite, income_n
        features_out.append(features)
        labels_out.append(int(line[income]))


In [9]:
# Something that might cause problems: the two groups are very different in
# size (white count is printed first, then nonwhite). In the recorded run the
# nonwhite group is the much smaller one.
print(len(white), len(nonwhite), sep="\n")

1005027
320470


## Making the Models

I am going to make the models using k-fold cross validation to try to improve the accuracy. Let's start with white people.

In [85]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold

# returns indexes into fields and labels
def run_kfold(fields, labels, n_splits=10):
    """Fit a GaussianNB on every K-fold split and return the best-scoring split.

    Parameters
    ----------
    fields : sequence of equal-length numeric rows, one row per sample.
    labels : sequence of class labels aligned with ``fields``.
    n_splits : int, number of folds (defaults to 10, the value that used to be
        hard-coded).

    Returns
    -------
    (train_index, test_index) : the index arrays yielded by ``KFold.split``
        for the fold with the highest held-out accuracy. Ties keep the
        earliest fold.

    Side effects
    ------------
    Prints one "accuracy rate" line per fold.

    NOTE(review): the fold is selected by its own test accuracy, so a score
    later reported on that same test fold is an optimistic estimate. KFold is
    also used without shuffling, so the folds follow the row order of the input.
    """
    # Convert once so each fold can be gathered with array indexing instead of
    # rebuilding Python lists element by element.
    all_fields = np.asarray(fields)
    all_labels = np.asarray(labels)

    kf = KFold(n_splits=n_splits)
    best = [], []
    # Start below any reachable accuracy so that a real fold is always
    # returned, even when every fold scores 0.
    best_accuracy = float("-inf")

    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(all_fields):
        clf = GaussianNB()
        clf.fit(all_fields[train_index], all_labels[train_index])

        predicted = clf.predict(all_fields[test_index])

        # share of held-out rows whose predicted label equals the true label
        hits = int(np.count_nonzero(predicted == all_labels[test_index]))
        acc = hits / len(test_index)

        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best

# Keep the train/test index arrays of the best-scoring fold for the white group.
training_w, testing_w = run_kfold(white, income_w)

accuracy rate:  0.6297567954220314
accuracy rate:  0.6452689813048454
accuracy rate:  0.6009156810377718
accuracy rate:  0.581934376192293
accuracy rate:  0.5891835177413202
accuracy rate:  0.6288630293781
accuracy rate:  0.5890881342998855
accuracy rate:  0.5914727203357497
accuracy rate:  0.6184662342617322
accuracy rate:  0.584128195345288


And now for nonwhite people

In [86]:
# Keep the train/test index arrays of the best-scoring fold for the nonwhite group.
training_n, testing_n = run_kfold(nonwhite, income_n)

accuracy rate:  0.6578751857355126
accuracy rate:  0.5973254086181278
accuracy rate:  0.7035661218424963
accuracy rate:  0.5995542347696879
accuracy rate:  0.6927934621099554
accuracy rate:  0.7069093610698366
accuracy rate:  0.6266716196136701
accuracy rate:  0.6849925705794948
accuracy rate:  0.7094017094017094
accuracy rate:  0.6228167967298402


## Accuracy

### Differential Accuracy by Race Binary

Let's examine the false positive and false negative rates for each group, starting with **white people**:

In [87]:
#training and testing index into fields and labels 
#so max(training) < len(fields) 
def run_model(training, testing, fields, labels):
    """Train a GaussianNB on one split and report its errors on the test rows.

    Parameters
    ----------
    training, testing : iterables of row indexes into ``fields``/``labels``.
    fields : indexable collection of feature rows.
    labels : indexable collection of labels aligned with ``fields``.

    Returns
    -------
    (res, acc, fp, fn) where ``res`` is the list of predictions for the test
    rows, ``fp`` is the share of test rows predicted 1 with true label 0,
    ``fn`` is the share predicted 0 with true label 1, and ``acc`` is the
    share of all remaining rows. All three shares use the total number of
    test rows as the denominator.

    Side effects
    ------------
    Prints the three shares.
    """
    train_fields = [fields[idx] for idx in training]
    train_labels = [labels[idx] for idx in training]
    test_fields = [fields[idx] for idx in testing]
    test_labels = [labels[idx] for idx in testing]

    model = GaussianNB()
    model.fit(train_fields, train_labels)

    res = model.predict(test_fields).tolist()

    # Encode each test row: 1 = false positive, -1 = false negative, 0 = neither.
    outcomes = []
    for predicted, actual in zip(res, test_labels):
        if predicted == 1 and actual == 0:
            code = 1
        elif predicted == 0 and actual == 1:
            code = -1
        else:
            code = 0
        outcomes.append(code)

    n_rows = len(outcomes)
    fp = outcomes.count(1)/n_rows
    fn = outcomes.count(-1)/n_rows
    acc = outcomes.count(0)/n_rows
    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn

# Retrain on the fold chosen by run_kfold for the white group and keep its
# predictions and error shares for the later breakdowns.
print("Results of running the model for white people:")
res_w, acc_w, fp_w, fn_w = run_model(training_w, testing_w, white, income_w)


Results of running the model for white people:
false positive rate: 0.190671
false negative rate: 0.164060
accuracy: 0.645269


And now for **nonwhite people:**

In [88]:
# Retrain on the fold chosen by run_kfold for the nonwhite group and keep its
# predictions and error shares for the later breakdowns.
print("Results of running the model for nonwhite people:")
res_n, acc_n, fp_n, fn_n = run_model(training_n, testing_n, nonwhite, income_n)

Results of running the model for nonwhite people:
false positive rate: 0.144928
false negative rate: 0.145671
accuracy: 0.709402


**Overall Accuracy**

In [89]:
# Pool the two per-group results into one figure by weighting each group's
# share by the number of test rows it was measured on.
n_nonwhite = len(res_n)
n_white = len(res_w)
n_pooled = n_nonwhite + n_white

acc = (acc_n*n_nonwhite + acc_w*n_white)/n_pooled
fp = (fp_n*n_nonwhite + fp_w*n_white)/n_pooled
fn = (fn_n*n_nonwhite + fn_w*n_white)/n_pooled

for template, value in (
    ("false positive rate: %4f", fp),
    ("false negative rate: %4f", fn),
    ("accuracy: %4f", acc),
):
    print(template % value)

false positive rate: 0.181328
false negative rate: 0.160304
accuracy: 0.658368


### Differential Accuracy by Sex

Let's examine differential accuracy for different subgroups. Let's initially break it down by sex.

In [90]:
def calc_accuracy(res, labels):
    """Classify every prediction as false positive, false negative or neither.

    Parameters
    ----------
    res : sequence of predicted labels.
    labels : sequence of true labels, read at the same positions as ``res``.

    Returns
    -------
    A list with one entry per element of ``res``: 1 when the prediction is 1
    and the label is 0, -1 when the prediction is 0 and the label is 1, and
    0 in every other case.
    """
    return [
        1 if predicted == 1 and labels[position] == 0
        else -1 if predicted == 0 and labels[position] == 1
        else 0
        for position, predicted in enumerate(res)
    ]


def sex_accuracy(testing, fields, labels, res, sex_col=8):
    """Print false positive/negative shares of the test rows, split by sex.

    Parameters
    ----------
    testing : sequence of row indexes into ``fields``/``labels``; ``res[k]``
        is the prediction for row ``testing[k]``.
    fields : indexable collection of feature rows.
    labels : indexable collection of true labels aligned with ``fields``.
    res : predictions for the test rows, in the same order as ``testing``.
    sex_col : position of the sex code inside a feature row (defaults to 8,
        the value that used to be hard-coded). A code of 1 is reported as
        male and 2 as female; rows with any other code are ignored.
        NOTE(review): the header listing printed at the top of the notebook
        shows SEX at index 28 and DREM at index 8 -- confirm that 8 is the
        right column for the current file.

    Returns
    -------
    None. Results are printed. A group with no test rows prints a
    "No results" line instead of raising ZeroDivisionError.
    """
    test_labels = [labels[i] for i in testing]

    # Positions (within `testing`, and therefore within `res`) of each group.
    male = []
    female = []
    for position, row_index in enumerate(testing):
        code = fields[row_index][sex_col]
        if code == 1:
            male.append(position)
        elif code == 2:
            female.append(position)

    for group_name, members in (("male", male), ("female", female)):
        if not members:
            # Nothing to divide by; say so rather than crash.
            print("No results for %s" % group_name)
            continue

        outcomes = calc_accuracy([res[i] for i in members],
                                 [test_labels[i] for i in members])
        # calc_accuracy encodes 1 = false positive, -1 = false negative.
        fp = outcomes.count(1)/len(outcomes)
        fn = outcomes.count(-1)/len(outcomes)
        print("Accuracy for %s:" % group_name)
        print("    false positive rate: %4f" % fp)
        print("    false negative rate: %4f" % fn)
        print("    accuracy: %4f" % (1-(fp + fn)))
    

**Accuracy broken down by sex for nonwhite people**

In [91]:
# Break the nonwhite model's test-fold errors down by sex.
sex_accuracy(testing_n, nonwhite, income_n, res_n)

Accuracy for male:
    false positive rate: 0.135852
    false negative rate: 0.143891
    accuracy: 0.720257
Accuracy for female:
    false positive rate: 0.152730
    false negative rate: 0.147201
    accuracy: 0.700069


**Accuracy broken down by sex for white people**

In [92]:
# Break the white model's test-fold errors down by sex.
sex_accuracy(testing_w, white, income_w, res_w)

Accuracy for male:
    false positive rate: 0.192112
    false negative rate: 0.161070
    accuracy: 0.646818
Accuracy for female:
    false positive rate: 0.189295
    false negative rate: 0.166915
    accuracy: 0.643790


### Differential Accuracy By Race Categorical

This is a little trickier. I first need some way to look up the race of each entry. Since I've removed unique identifiers, I'm going to have to go back and collect them. Let's make a list of just the racial categories.

In [93]:
# Collect the race code of every row whose code is not 1, in file order.
# This rebinds `race`, which the data-loading cell used as a column index.
# NOTE(review): column 13 is hard-coded; the header listing printed at the top
# of the notebook shows HINS3 at index 13 and RAC1P at 38 -- confirm the index.
# NOTE(review): later cells read race[i] as the race of nonwhite[i], but the
# nonwhite split tests column `race_binary`, not this column -- verify that
# both filters select the same rows.
race = []
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    for line in reader:
        code = int(line[13])
        if code != 1:
            race.append(code)

**Accuracy by specific racial group for nonwhite people**

In [94]:
import pandas as pd
def race_accuracy(testing, labels, res):
    """Print and tabulate false positive/negative shares per race code.

    Parameters
    ----------
    testing : sequence of row indexes; ``res[k]`` is the prediction for row
        ``testing[k]``. The same indexes are used to read the module-level
        ``race`` list.
    labels : indexable collection of true labels.
    res : predictions for the test rows, in the same order as ``testing``.

    Returns
    -------
    A pandas DataFrame with the columns "race", "false positive",
    "false negative", "accuracy" and "count", holding one row for every race
    code that has at least one test row.
    """
    test_labels = [labels[idx] for idx in testing]
    data = {"race":[], "false positive":[], "false negative":[], "accuracy":[], 'count':[]}

    # every code from 0 through 26 is tried
    for r in range(0, 27):

        # positions within res that belong to the current group
        current = [pos for pos in range(len(testing)) if race[testing[pos]] == r]

        # an empty group has no rates to compute
        if not current:
            print("No results for race %d" % r)
            continue

        # calculate accuracy
        outcomes = calc_accuracy([res[pos] for pos in current],
                                 [test_labels[pos] for pos in current])
        fp = outcomes.count(1)/len(outcomes)
        fn = outcomes.count(-1)/len(outcomes)
        group_accuracy = (1-(fp + fn))

        print("Results for race %d:" % r)
        print("    %d ENTRIES TOTAL" % len(current))
        print("    false positive rate: %4f" % fp)
        print("    false negative rate: %4f" % fn)
        print("    accuracy: %4f" % group_accuracy)

        data["race"].append(r)
        data["false positive"].append(fp)
        data["false negative"].append(fn)
        data["accuracy"].append(group_accuracy)
        data["count"].append(len(current))

    return pd.DataFrame(data)

# Per-race error breakdown of the nonwhite model, saved for later comparison.
df = race_accuracy(testing_n, income_n, res_n)

# One list drives both the header row and the data rows so they cannot drift
# apart (the header used to omit 'count' although every row carried it).
columns = ['race', 'false positive', 'false negative', 'accuracy', 'count']

# newline="" lets the csv module control line endings itself, as its
# documentation requires; without it extra blank rows appear on Windows.
with open("2nb.csv", "w", newline="") as fo:
    writer = csv.writer(fo)
    writer.writerow(columns)
    for index, row in df.iterrows():
        writer.writerow([row[column] for column in columns])

No results for race 0
No results for race 1
Results for race 2:
    1776 ENTRIES TOTAL
    false positive rate: 0.128941
    false negative rate: 0.114865
    accuracy: 0.756194
Results for race 3:
    183 ENTRIES TOTAL
    false positive rate: 0.092896
    false negative rate: 0.038251
    accuracy: 0.868852
Results for race 4:
    436 ENTRIES TOTAL
    false positive rate: 0.266055
    false negative rate: 0.204128
    accuracy: 0.529817
Results for race 5:
    21 ENTRIES TOTAL
    false positive rate: 0.380952
    false negative rate: 0.190476
    accuracy: 0.428571
Results for race 6:
    112 ENTRIES TOTAL
    false positive rate: 0.098214
    false negative rate: 0.312500
    accuracy: 0.589286
Results for race 7:
    80 ENTRIES TOTAL
    false positive rate: 0.062500
    false negative rate: 0.250000
    accuracy: 0.687500
Results for race 8:
    35 ENTRIES TOTAL
    false positive rate: 0.028571
    false negative rate: 0.600000
    accuracy: 0.371429
Results for race 9:
    11 

# n-NB Models

In [95]:
# race_fields[code] = feature rows for race `code`
# race_labels[code] = income labels for those same rows, in the same order
race_fields = [[] for _ in range(27)]
race_labels = [[] for _ in range(27)]

# Distribute each nonwhite row to the bucket of its race code.
# NOTE(review): this reads race[k] as the race of nonwhite[k]; verify that the
# two lists were built from the same rows in the same order.
for row_number, feature_row in enumerate(nonwhite):
    code = race[row_number]
    race_fields[code].append(feature_row)
    race_labels[code].append(income_n[row_number])

# number of rows collected per race code
for bucket in race_fields:
    print(len(bucket))

0
0
14541
1944
6790
791
824
767
540
164
133
45
20
7
2
88
129
8
2
10
4
87
3
2
0
1
16


In [99]:
def run_race(race, fields, labels):
    """Pick the best fold for one group, retrain on it and report the result.

    Parameters
    ----------
    race : accepted for the caller's convenience; not read by this function.
    fields : feature rows of the group.
    labels : labels aligned with ``fields``.

    Returns
    -------
    (res, acc, fp, fn) exactly as produced by ``run_model`` for the fold
    selected by ``run_kfold``.

    Side effects
    ------------
    Prints section banners around the output of the two helper calls.
    """
    rule = "-----------------------"

    print("TRAINING", rule, "", sep="\n")
    best_train, best_test = run_kfold(fields, labels)

    print("", "RUNNING THE BEST MODEL", rule, "", sep="\n")
    predictions, accuracy, false_pos, false_neg = run_model(best_train, best_test, fields, labels)

    # The per-sex breakdown is currently disabled. To restore it, print an
    # "ACCURACY BY SEX" banner in the same layout as the ones above and call
    #     sex_accuracy(best_test, fields, labels, predictions)
    return predictions, accuracy, false_pos, false_neg

In [102]:
# A race code gets its own model only when it has more rows than this.
MIN_GROUP_SIZE = 350

# Running sums of each share weighted by the number of test rows it covers,
# so the pooled figures can be computed once every group has been processed.
fp_overall = 0
fn_overall = 0
accuracy_overall = 0
total = 0

# newline="" lets the csv module control line endings itself, as its
# documentation requires; without it extra blank rows appear on Windows.
with open("nNB.csv", "w", newline="") as fo:
    writer = csv.writer(fo)
    writer.writerow(['race', 'false positive', 'false negative', 'accuracy'])

    for i in range(len(race_fields)):
        if len(race_fields[i]) > MIN_GROUP_SIZE:
            print()
            print("RACE %d " % i)
            print()
            res, acc, fp, fn = run_race(i, race_fields[i], race_labels[i])
            writer.writerow([i, fp, fn, acc])
            fp_overall += fp*len(res)
            fn_overall += fn*len(res)
            accuracy_overall += acc*len(res)
            total += len(res)
print()
print("OVERALL RESULTS")
print("-------------------------")
print()
if total:
    print("false positive rate: %4f" % (fp_overall/total))
    print("false negative rate: %4f" % (fn_overall/total))
    print("accuracy: %4f" % (accuracy_overall/total))
else:
    # No group passed the size threshold, so there is nothing to average.
    print("No race group had more than %d rows; no overall results." % MIN_GROUP_SIZE)


RACE 2 

TRAINING
-----------------------

accuracy rate:  0.7353951890034365
accuracy rate:  0.7077028885832187
accuracy rate:  0.6808803301237965
accuracy rate:  0.7049518569463549
accuracy rate:  0.6822558459422283
accuracy rate:  0.7462173314993122
accuracy rate:  0.6306740027510316
accuracy rate:  0.734525447042641
accuracy rate:  0.7324621733149931
accuracy rate:  0.6781292984869326

RUNNING THE BEST MODEL
-----------------------

false positive rate: 0.152682
false negative rate: 0.101100
accuracy: 0.746217

RACE 3 

TRAINING
-----------------------

accuracy rate:  0.36923076923076925
accuracy rate:  0.49230769230769234
accuracy rate:  0.48717948717948717
accuracy rate:  0.3128205128205128
accuracy rate:  0.26804123711340205
accuracy rate:  0.4175257731958763
accuracy rate:  0.30927835051546393
accuracy rate:  0.4020618556701031
accuracy rate:  0.24742268041237114
accuracy rate:  0.4896907216494845

RUNNING THE BEST MODEL
-----------------------

false positive rate: 0.420513
