# 2-NB Models

In [1]:
import csv

file = 'cleaned3.csv'

with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)
    for i in range(len(headers)):
        print(i, headers[i])     

0 HOUSEHOLD IDENTIFIER
1 ARE YOUR LIVING QUARTERS...
2 IS THERE A TELEPHONE IN THIS 
3 FAMILY INCOME
4 TOTAL NUMBER OF PERSONS LIVING 
5 REGION
6 DIVISION
7 INDIVIDUAL PRINCIPAL CITY
8 PERSONS AGE  
9 MARITAL STATUS 
10 SEX
11 DID YOU EVER SERVE ON ACTIVE 
12 HIGHEST LEVEL OF SCHOOL 
13 RACE
14 CHANGE IN HOUSEHOLD COMPOSITION
15 HISPANIC OR NON-HISPANIC
16 COUNTRY OF BIRTH
17 CITIZENSHIP STATUS
18 LAST WEEK, DID YOU DO ANY WORK
19 LAST WEEK, DID YOU DO ANY
20 DO YOU RECEIVE ANY PAYMENTS
21 DISABILITY STATUS
22 DO YOU CURRENTLY WANT A JOB, EITHER 
23 INDUSTRY AND OCCUPATION 
24 CLASS OF WORKER
25 CLASS OF WORKER
26 COW - PRIVATE OR GOVERNMENT
27 PERIODICITY
28 HOURLY/NONHOURLY STATUS
29 ON THIS JOB, ARE YOU A MEMBER OF A 
30 ON THIS JOB ARE YOU COVERED BY A UNION 
31 PRESENCE OF OWN CHILDREN <18 YEARS 
32 Number of own children <18 years of age 
33 race-binary


## Prep: Reading the data in

In [2]:
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)
    
    white = []
    nonwhite = []
    income_w = []
    income_n = []
    counts = [0 for i in range(27)]
    
    for line in reader:
        
        # remove id, income, race, and race_binary
        data = []
        for i in range(len(line)):
            if i != 0 and i != 13 and i != 3 and i != 33:
                data.append(int(line[i]))

        # put in the correct racial category
        if int(line[13]) == 1:
            white.append(data)
            income_w.append(int(line[3]))
        else:
            nonwhite.append(data)
            income_n.append(int(line[3]))


In [3]:
# something that might cause problems - there's WAY more data for white people
print(len(white))
print(len(nonwhite))

104841
26918


## Making the Models

I am going to make the models using k-fold cross validation to try to improve the accuracy. Let's start with white people.

In [4]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold

# returns indexes into fields and labels
def run_kfold(fields, labels):
    kf = KFold(n_splits=10)
    best = [], []
    best_accuracy = 0

    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = [fields[i] for i in train_index]
        train_labels = [labels[i] for i in train_index]
        test_fields = [fields[i] for i in test_index]
        test_labels = [labels[i] for i in test_index]

        clf = GaussianNB()
        clf.fit(train_fields, train_labels)

        res = clf.predict(test_fields).tolist()
        
        accuracy = []
        for i in range(len(res)):
            if res[i] == test_labels[i]:
                accuracy.append(1)
            else:
                accuracy.append(0)
        #accuracy = [1 if res[i] == test_labels[i] else 0 for i in range(len(res))]
        acc = sum(accuracy)/len(accuracy)

        if (acc > best_accuracy):
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best

training_w, testing_w = run_kfold(white, income_w)

accuracy rate:  0.8371006199332379
accuracy rate:  0.8151468904998093
accuracy rate:  0.8441434566959176
accuracy rate:  0.8863983212514307
accuracy rate:  0.8987981686379245
accuracy rate:  0.8512018313620755
accuracy rate:  0.8915490270888974
accuracy rate:  0.8908813429988554
accuracy rate:  0.8283098054177794
accuracy rate:  0.9009919877909195


And now for nonwhite people

In [5]:
training_n, testing_n = run_kfold(nonwhite, income_n)

accuracy rate:  0.6861069836552749
accuracy rate:  0.736627043090639
accuracy rate:  0.6485884101040119
accuracy rate:  0.6853640416047548
accuracy rate:  0.6523031203566122
accuracy rate:  0.6530460624071323
accuracy rate:  0.6797919762258544
accuracy rate:  0.6225854383358098
accuracy rate:  0.6387959866220736
accuracy rate:  0.6250464511334076


## Accuracy

### Differential Accuracy by Race Binary

Let's examine the false positive and false negative rates respectively. **White people**:

In [42]:
#training and testing index into fields and labels 
#so max(training) < len(fields) 
def run_model(training, testing, fields, labels):
    train_fields = [fields[i] for i in training]
    train_labels = [labels[i] for i in training]
    test_fields = [fields[i] for i in testing]
    test_labels = [labels[i] for i in testing]

    clf = GaussianNB()
    clf.fit(train_fields, train_labels)

    res = clf.predict(test_fields).tolist()

    accuracy = []
    for i in range(len(res)):
        if res[i] == 1 and test_labels[i] == 0:
            accuracy.append(1)
        elif res[i] == 0 and test_labels[i] == 1:
            accuracy.append(-1)
        else:
            accuracy.append(0)

    fp = sum([1 if accuracy[i] == 1 else 0 for i in range(len(accuracy))])/len(accuracy)
    fn = sum([1 if accuracy[i] == -1 else 0 for i in range(len(accuracy))])/len(accuracy)
    acc = sum([1 if accuracy[i] == 0 else 0 for i in range(len(accuracy))])/len(accuracy)
    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn

print("Results of running the model for white people:")
res_w, acc_w, fp_w, fn_w = run_model(training_w, testing_w, white, income_w)


Results of running the model for white people:
false positive rate: 0.022797
false negative rate: 0.076211
accuracy: 0.900992


And now for **nonwhite people:**

In [43]:
print("Results of running the model for nonwhite people:")
res_n, acc_n, fp_n, fn_n = run_model(training_n, testing_n, nonwhite, income_n)

Results of running the model for nonwhite people:
false positive rate: 0.026374
false negative rate: 0.236999
accuracy: 0.736627


**Overall Accuracy**

In [44]:
acc = (acc_n*len(res_n)+acc_w*len(res_w))/(len(res_n)+len(res_w))
fp = (fp_n*len(res_n)+fp_w*len(res_w))/(len(res_n)+len(res_w))
fn = (fn_n*len(res_n)+fn_w*len(res_w))/(len(res_n)+len(res_w))
print("false positive rate: %4f" % fp)
print("false negative rate: %4f" % fn)
print("accuracy: %4f" % acc)

false positive rate: 0.023528
false negative rate: 0.109062
accuracy: 0.867410


### Differential Accuracy by Sex

Let's examine differential accuracy for different subgroups. Let's initially break it down by sex.

In [12]:
def calc_accuracy(res, labels):
    
    accuracy = []
    for i in range(len(res)):
        if res[i] == 1 and labels[i] == 0:
            accuracy.append(1)
        elif res[i] == 0 and labels[i] == 1:
            accuracy.append(-1)
        else:
            accuracy.append(0)
    return accuracy


def sex_accuracy(testing, fields, labels, res):
    test_fields = [fields[i] for i in testing]
    test_labels = [labels[i] for i in testing]
        
    male = []
    female = []
    
    for i in range(len(testing)):
        if fields[testing[i]][8] == 1:
            male.append(i)
        if fields[testing[i]][8] == 2:
            female.append(i)
            
    accuracy = calc_accuracy([res[i] for i in male], [test_labels[i] for i in male])
    fp = sum([1 if accuracy[i] == 1 else 0 for i in range(len(accuracy))])/len(accuracy)
    fn = sum([1 if accuracy[i] == -1 else 0 for i in range(len(accuracy))])/len(accuracy)
    print("Accuracy for male:")
    print("    false positive rate: %4f" % fp)
    print("    false negative rate: %4f" % fn)
    print("    accuracy: %4f" % (1-(fp + fn)))
    
    accuracy = calc_accuracy([res[i] for i in female], [test_labels[i] for i in female])
    fpf = sum([1 if accuracy[i] == 1 else 0 for i in range(len(accuracy))])/len(accuracy)
    fnf = sum([1 if accuracy[i] == -1 else 0 for i in range(len(accuracy))])/len(accuracy)
    print("Accuracy for female:")
    print("    false positive rate: %4f" % fpf)
    print("    false negative rate: %4f" % fnf)
    print("    accuracy: %4f" % (1-(fpf + fnf)))
    

**Accuracy broken down by sex for nonwhite people**

In [9]:
sex_accuracy(testing_n, nonwhite, income_n, res_n)

Accuracy for male:
    false positive rate: 0.022533
    false negative rate: 0.243978
    accuracy: 0.733489
Accuracy for female:
    false positive rate: 0.029893
    false negative rate: 0.230605
    accuracy: 0.739502


**Accuracy broken down by sex for white people**

In [10]:
sex_accuracy(testing_w, white, income_w, res_w)

Accuracy for male:
    false positive rate: 0.018324
    false negative rate: 0.074060
    accuracy: 0.907616
Accuracy for female:
    false positive rate: 0.027264
    false negative rate: 0.078360
    accuracy: 0.894376


### Differential Accuracy By Race Categorical

This is a little tricker. I first need some way to lookup up the race of each entry. Since I've removed unique identifiers, I'm going to have to go back and collect them. Let's make a list of just the racial categories.

In [13]:
race = []
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)
    
    for line in reader:
        if (int(line[13]) != 1):
            race.append(int(line[13]))

**Accuracy by specific racial group for nonwhite people**

In [30]:
import pandas as pd
def race_accuracy(testing, labels, res):
    test_labels = [labels[i] for i in testing]
    data = {"race":[], "false positive":[], "false negative":[], "accuracy":[], 'count':[]}
    
    # racial codes go from 1 to 26
    for r in range(0, 27):

        # indeces within res for the current group
        current = []
        for i in range(len(testing)):
            if (race[testing[i]] == r):
                current.append(i)

        # calculate accuracy
        accuracy = calc_accuracy([res[i] for i in current], [test_labels[i] for i in current])
        try:
            fp = sum([1 if accuracy[i] == 1 else 0 for i in range(len(accuracy))])/len(accuracy)
            fn = sum([1 if accuracy[i] == -1 else 0 for i in range(len(accuracy))])/len(accuracy)
            print("Results for race %d:" % r)
            print("    %d ENTRIES TOTAL" % len(current))
            print("    false positive rate: %4f" % fp)
            print("    false negative rate: %4f" % fn)
            print("    accuracy: %4f" % (1-(fp + fn)))
            data["race"].append(r)
            data["false positive"].append(fp)
            data["false negative"].append(fn)
            data["accuracy"].append((1-(fp + fn)))
            data["count"].append(len(current))
        except ZeroDivisionError:
            print("No results for race %d" % r)
    
    return pd.DataFrame(data)

df = race_accuracy(testing_n, income_n, res_n)
with open("2nb.csv", "w") as fo:
    writer = csv.writer(fo)
    writer.writerow(['race', 'false positive', 'false negative', 'accuracy'])
    for index, row in df.iterrows():
        writer.writerow([row['race'], row['false positive'], row['false negative'], row['accuracy'], row['count']])

No results for race 0
No results for race 1
Results for race 2:
    795 ENTRIES TOTAL
    false positive rate: 0.030189
    false negative rate: 0.304403
    accuracy: 0.665409
Results for race 3:
    112 ENTRIES TOTAL
    false positive rate: 0.026786
    false negative rate: 0.142857
    accuracy: 0.830357
Results for race 4:
    1367 ENTRIES TOTAL
    false positive rate: 0.024140
    false negative rate: 0.177762
    accuracy: 0.798098
Results for race 5:
    106 ENTRIES TOTAL
    false positive rate: 0.028302
    false negative rate: 0.179245
    accuracy: 0.792453
Results for race 6:
    45 ENTRIES TOTAL
    false positive rate: 0.000000
    false negative rate: 0.644444
    accuracy: 0.355556
Results for race 7:
    77 ENTRIES TOTAL
    false positive rate: 0.064935
    false negative rate: 0.233766
    accuracy: 0.701299
Results for race 8:
    110 ENTRIES TOTAL
    false positive rate: 0.027273
    false negative rate: 0.509091
    accuracy: 0.463636
Results for race 9:
    14

# n-NB Models

In [18]:
# race_fields[i] = 2d array of fields for race i
race_fields = [[] for i in range(27)]
race_labels = [[] for i in range(27)]
for i in range(len(nonwhite)):
    race_fields[race[i]].append(nonwhite[i])
    race_labels[race[i]].append(income_n[i])
for race_i in race_fields:
    print(len(race_i))

0
0
14541
1944
6790
791
824
767
540
164
133
45
20
7
2
88
129
8
2
10
4
87
3
2
0
1
16


In [46]:
def run_race(race, fields, labels):
    
    print("TRAINING")
    print("-----------------------")
    print()
    
    training, testing = run_kfold(fields, labels)
    
    print()
    print("RUNNING THE BEST MODEL")
    print("-----------------------")
    print()
    
    res, acc, fp, fn = run_model(training, testing, fields, labels)
    
  '''  print()
    print("ACCURACY BY SEX")
    print("-----------------------")
    print()
    
    sex_accuracy(testing, fields, labels, res) '''
    return res, acc, fp, fn 

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 16)

In [41]:
fp_overall = 0
fn_overall = 0
accuracy_overall = 0
total = 0
with open("nNB.csv", "w") as fo:
    writer = csv.writer(fo)
    writer.writerow(['race', 'false positive', 'false negative', 'accuracy'])
    
    for i in range(len(race_fields)):
        if len(race_fields[i]) > 350:
            print()
            print("RACE %d " % i)
            print()
            res, acc, fp, fn = run_race(i, race_fields[i], race_labels[i])
            writer.writerow([i, fp, fn, acc])
            fp_overall += fp*len(res)
            fn_overall += fn*len(res)
            accuracy_overall += acc*len(res)
            total += len(res)
print()
print("false positive rate: %4f" % (fp_overall/total))
print("false negative rate: %4f" % (fn_overall/total))
print("accuracy: %4f" % (accuracy_overall/total))


RACE 2 

TRAINING
-----------------------

accuracy rate:  0.6
accuracy rate:  0.625171939477304
accuracy rate:  0.6141678129298487
accuracy rate:  0.5873452544704264
accuracy rate:  0.6368638239339752
accuracy rate:  0.5845942228335625
accuracy rate:  0.702200825309491
accuracy rate:  0.6045392022008254
accuracy rate:  0.579092159559835
accuracy rate:  0.6176066024759285

RUNNING THE BEST MODEL
-----------------------

false positive rate: 0.038514
false negative rate: 0.259285
accuracy: 0.702201

ACCURACY BY SEX
-----------------------

Accuracy for male:
    false positive rate: 0.035439
    false negative rate: 0.268105
    accuracy: 0.696456
Accuracy for female:
    false positive rate: 0.040994
    false negative rate: 0.252174
    accuracy: 0.706832

RACE 3 

TRAINING
-----------------------

accuracy rate:  0.6871794871794872
accuracy rate:  0.7076923076923077
accuracy rate:  0.7487179487179487
accuracy rate:  0.5230769230769231
accuracy rate:  0.7731958762886598
accuracy rate