# 2-NB Models

In [1]:
import csv

# Path of the cleaned survey extract; later cells reopen the file through this name.
file = 'cleaned2.csv'

# Show every column name next to its positional index, since the rest of the
# notebook addresses columns by number.
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)
    for i, header in enumerate(headers):
        print(i, header)

0 HOUSEHOLD IDENTIFIER
1 ARE YOUR LIVING QUARTERS...
2 IS THERE A TELEPHONE IN THIS 
3 FAMILY INCOME
4 TOTAL NUMBER OF PERSONS LIVING 
5 REGION
6 DIVISION
7 INDIVIDUAL PRINCIPAL CITY
8 PERSONS AGE  
9 MARITAL STATUS 
10 SEX
11 DID YOU EVER SERVE ON ACTIVE 
12 HIGHEST LEVEL OF SCHOOL 
13 RACE
14 CHANGE IN HOUSEHOLD COMPOSITION
15 HISPANIC OR NON-HISPANIC
16 COUNTRY OF BIRTH
17 CITIZENSHIP STATUS
18 LAST WEEK, DID YOU DO ANY WORK
19 LAST WEEK, DID YOU DO ANY
20 DO YOU RECEIVE ANY PAYMENTS
21 DISABILITY STATUS
22 DO YOU CURRENTLY WANT A JOB, EITHER 
23 INDUSTRY AND OCCUPATION 
24 CLASS OF WORKER
25 CLASS OF WORKER
26 COW - PRIVATE OR GOVERNMENT
27 PERIODICITY
28 HOURLY/NONHOURLY STATUS
29 ON THIS JOB, ARE YOU A MEMBER OF A 
30 ON THIS JOB ARE YOU COVERED BY A UNION 
31 PRESENCE OF OWN CHILDREN <18 YEARS 
32 Number of own children <18 years of age 
33 race-binary


## Prep: Reading the data in

In [16]:
# Column positions in the CSV (they match the header listing printed by the first cell).
ID_COL = 0
INCOME_COL = 3
RACE_COL = 13
RACE_BINARY_COL = 33

# Columns that must not be fed to the classifier: the identifier, the label
# (income) and both encodings of race (race decides which model a row goes to).
EXCLUDED_COLS = {ID_COL, INCOME_COL, RACE_COL, RACE_BINARY_COL}

# Value of the race column that routes a row to the "white" group.
WHITE_CODE = 1

# The csv module asks for files to be opened with newline='' so that it can do
# its own newline handling (e.g. newlines embedded in quoted fields).
with open(file, 'r', newline='') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    white = []      # feature rows for the white group
    nonwhite = []   # feature rows for everyone else
    income_w = []   # income labels aligned with `white`
    income_n = []   # income labels aligned with `nonwhite`
    # NOTE(review): `counts` is not read by any cell visible here; it is kept
    # in case a cell outside this view depends on it.
    counts = [0 for i in range(27)]

    for line in reader:

        # keep every column except id, income, race, and race_binary
        data = [int(value) for i, value in enumerate(line) if i not in EXCLUDED_COLS]

        # put the row in the correct racial category
        if int(line[RACE_COL]) == WHITE_CODE:
            white.append(data)
            income_w.append(int(line[INCOME_COL]))
        else:
            nonwhite.append(data)
            income_n.append(int(line[INCOME_COL]))


In [5]:
# Class-imbalance check: print the number of rows in each group (white first,
# then nonwhite). A large gap between the two may affect how the models compare.
print(len(white), len(nonwhite), sep="\n")

104841
26918


## Making the Models

I am going to build the models using 10-fold cross-validation, keeping the fold that scores best, to try to improve the accuracy. Let's start with white people.

In [6]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold

def run_kfold(fields, labels):
    """Fit a GaussianNB on each of 10 folds and return the best-scoring split.

    Args:
        fields: sequence of feature rows, indexable by integer position.
        labels: sequence of class labels aligned with ``fields``.

    Returns:
        The ``(train_index, test_index)`` pair produced by ``KFold.split`` for
        the fold with the highest held-out accuracy (the earliest such fold on
        ties). If no fold scores above 0 the initial ``([], [])`` is returned.

    Side effects:
        Prints the held-out accuracy of every fold.

    Note:
        The winning split is chosen by its own held-out accuracy, so that
        accuracy is the maximum over the folds rather than an average.
    """
    splitter = KFold(n_splits=10)
    top_split = ([], [])
    top_score = 0

    for train_index, test_index in splitter.split(fields):
        model = GaussianNB()
        model.fit(
            [fields[j] for j in train_index],
            [labels[j] for j in train_index],
        )

        held_out_labels = [labels[j] for j in test_index]
        predicted = model.predict([fields[j] for j in test_index]).tolist()

        # Fraction of held-out rows whose predicted label equals the true label.
        hits = sum(1 for k, guess in enumerate(predicted) if guess == held_out_labels[k])
        acc = hits / len(predicted)

        # Strict comparison: a later fold only replaces the current best if it is better.
        if acc > top_score:
            top_split, top_score = (train_index, test_index), acc

        print("accuracy rate: ", acc)
    return top_split

# Cross-validate on the white group and keep the index split of its best-scoring fold.
training_w, testing_w = run_kfold(white, income_w)

accuracy rate:  0.6395803528850739
accuracy rate:  0.6350629530713469
accuracy rate:  0.6240938573063716
accuracy rate:  0.6565242273941244
accuracy rate:  0.6539488744753911
accuracy rate:  0.6576688286913391
accuracy rate:  0.6619610835558947
accuracy rate:  0.6602441816100725
accuracy rate:  0.631056848531095
accuracy rate:  0.6458412819534529


And now for nonwhite people

In [7]:
# Cross-validate on the nonwhite group and keep the index split of its best-scoring fold.
training_n, testing_n = run_kfold(nonwhite, income_n)

accuracy rate:  0.6110698365527489
accuracy rate:  0.5913818722139673
accuracy rate:  0.6080980683506686
accuracy rate:  0.587667161961367
accuracy rate:  0.6218424962852898
accuracy rate:  0.6437592867756315
accuracy rate:  0.5976968796433878
accuracy rate:  0.6419019316493314
accuracy rate:  0.6510590858416946
accuracy rate:  0.5715347454477889


## Accuracy

### Differential Accuracy by Race Binary

Let's examine the false positive and false negative rates for each group. **White people**:

In [8]:

def run_model(training, testing, fields, labels):
    """Train a GaussianNB on one index split and report its error breakdown.

    Args:
        training: indices into ``fields``/``labels`` used to fit the model.
        testing: indices into ``fields``/``labels`` used to evaluate it.
        fields: sequence of feature rows.
        labels: sequence of class labels aligned with ``fields``.

    Returns:
        The list of predictions for the ``testing`` rows, in ``testing`` order.

    Side effects:
        Prints the false positive rate, false negative rate and accuracy.
        Both rates are fractions of *all* test rows: a false positive is a
        prediction of 1 for a true label of 0, a false negative is a
        prediction of 0 for a true label of 1.
    """
    model = GaussianNB()
    model.fit(
        [fields[j] for j in training],
        [labels[j] for j in training],
    )

    actual = [labels[j] for j in testing]
    res = model.predict([fields[j] for j in testing]).tolist()

    # Tally the two kinds of mistakes; everything else counts as correct.
    false_pos = 0
    false_neg = 0
    for k, guess in enumerate(res):
        truth = actual[k]
        if guess == 1 and truth == 0:
            false_pos += 1
        elif guess == 0 and truth == 1:
            false_neg += 1

    total = len(res)
    fp = false_pos / total
    fn = false_neg / total
    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % (1-(fp + fn)))
    return res

# Refit on the best split for the white group; the returned test-fold
# predictions are reused by the per-sex breakdown further down.
print("Results of running the model for white people:")
res_w = run_model(training_w, testing_w, white, income_w)


Results of running the model for white people:
false positive rate: 0.198588
false negative rate: 0.139451
accuracy: 0.661961


And now for **nonwhite people:**

In [9]:
# Refit on the best split for the nonwhite group; the returned test-fold
# predictions are reused by the per-sex and per-race breakdowns further down.
print("Results of running the model for nonwhite people:")
res_n = run_model(training_n, testing_n, nonwhite, income_n)

Results of running the model for nonwhite people:
false positive rate: 0.139353
false negative rate: 0.209588
accuracy: 0.651059


### Differential Accuracy by Sex

Let's examine differential accuracy for different subgroups. Let's initially break it down by sex.

In [10]:
def calc_accuracy(res, labels):
    """Mark each prediction as a false positive, a false negative, or neither.

    Args:
        res: sequence of predicted labels.
        labels: sequence of true labels, indexed in step with ``res``.

    Returns:
        A list with one entry per prediction: ``1`` for a false positive
        (predicted 1, actual 0), ``-1`` for a false negative (predicted 0,
        actual 1), and ``0`` for every other combination.
    """
    def outcome(predicted, actual):
        if predicted == 1 and actual == 0:
            return 1
        if predicted == 0 and actual == 1:
            return -1
        return 0

    return [outcome(predicted, labels[pos]) for pos, predicted in enumerate(res)]


def sex_accuracy(testing, fields, labels, res):
    """Print false positive rate, false negative rate and accuracy per sex.

    Args:
        testing: indices into ``fields``/``labels`` that make up the test fold.
        fields: sequence of feature rows; element 8 of a row is read as the
            sex code (1 is reported as male, 2 as female).
            NOTE(review): presumably this is the SEX column after the loading
            cell drops the id and income columns — verify against that cell.
        labels: sequence of true labels aligned with ``fields``.
        res: predictions for the test fold, in the same order as ``testing``.

    Side effects:
        Prints one report per sex. Rates are fractions of the rows in that
        subgroup. A subgroup with no rows in the test fold is reported as
        "No results for <sex>" instead of raising ZeroDivisionError.
    """
    test_labels = [labels[i] for i in testing]

    # Positions within the test fold (i.e. indices into `res`) for each sex.
    male = []
    female = []
    for pos, row_index in enumerate(testing):
        sex = fields[row_index][8]
        if sex == 1:
            male.append(pos)
        elif sex == 2:
            female.append(pos)

    for name, members in (("male", male), ("female", female)):
        # Small folds can miss a subgroup entirely; there is nothing to divide by then.
        if not members:
            print("No results for %s" % name)
            continue

        accuracy = calc_accuracy([res[i] for i in members], [test_labels[i] for i in members])
        fp = sum(1 for flag in accuracy if flag == 1)/len(accuracy)
        fn = sum(1 for flag in accuracy if flag == -1)/len(accuracy)
        print("Accuracy for %s:" % name)
        print("    false positive rate: %4f" % fp)
        print("    false negative rate: %4f" % fn)
        print("    accuracy: %4f" % (1-(fp + fn)))
    

**Accuracy broken down by sex for nonwhite people**

In [11]:
# Break the nonwhite model's test-fold predictions down by sex.
sex_accuracy(testing_n, nonwhite, income_n, res_n)

Accuracy for male:
    false positive rate: 0.126206
    false negative rate: 0.212219
    accuracy: 0.661576
Accuracy for female:
    false positive rate: 0.150657
    false negative rate: 0.207326
    accuracy: 0.642018


**Accuracy broken down by sex for white people**

In [12]:
# Break the white model's test-fold predictions down by sex.
sex_accuracy(testing_w, white, income_w, res_w)

Accuracy for male:
    false positive rate: 0.206658
    false negative rate: 0.136810
    accuracy: 0.656533
Accuracy for female:
    false positive rate: 0.190656
    false negative rate: 0.142047
    accuracy: 0.667297


### Differential Accuracy By Race Categorical

This is a little trickier. I first need some way to look up the race of each entry. Since I've removed unique identifiers, I'm going to have to go back and collect them. Let's make a list of just the racial categories.

In [29]:
# Race codes of the non-white rows only (race column != 1), in file order.
# The loading cell applies the same filter to the same file, so race[k]
# describes the same row as nonwhite[k] and income_n[k].
with open(file, 'r') as fi:
    reader = csv.reader(fi)
    headers = next(reader)

    race = [code for code in (int(row[13]) for row in reader) if code != 1]

**Accuracy by specific racial group for nonwhite people**

In [30]:
def race_accuracy(testing, labels, res):
    """Print false positive rate, false negative rate and accuracy per race code.

    Args:
        testing: indices of the test-fold rows; also used to index the
            module-level ``race`` list to find each row's race code.
        labels: sequence of true labels, indexed by the values in ``testing``.
        res: predictions for the test fold, in the same order as ``testing``.

    Side effects:
        For every code from 0 to 26 prints either a report (entry count and
        rates as fractions of that group's rows) or "No results for race N"
        when the test fold has no row with that code.
    """
    test_labels = [labels[i] for i in testing]

    # every code from 0 through 26 is scanned
    for r in range(0, 27):

        # positions within res that belong to the current group
        current = [pos for pos, row_index in enumerate(testing) if race[row_index] == r]

        accuracy = calc_accuracy([res[i] for i in current], [test_labels[i] for i in current])

        # an empty group has no rates to report
        if not accuracy:
            print("No results for race %d" % r)
            continue

        total = len(accuracy)
        fp = sum(1 for flag in accuracy if flag == 1)/total
        fn = sum(1 for flag in accuracy if flag == -1)/total
        print("Results for race %d:" % r)
        print("    %d ENTRIES TOTAL" % len(current))
        print("    false positive rate: %4f" % fp)
        print("    false negative rate: %4f" % fn)
        print("    accuracy: %4f" % (1-(fp + fn)))

# Break the nonwhite model's test-fold predictions down by race code.
race_accuracy(testing_n, income_n, res_n)

No results for race 0
No results for race 1
Results for race 2:
    1776 ENTRIES TOTAL
    false positive rate: 0.134572
    false negative rate: 0.196509
    accuracy: 0.668919
Results for race 3:
    183 ENTRIES TOTAL
    false positive rate: 0.131148
    false negative rate: 0.163934
    accuracy: 0.704918
Results for race 4:
    436 ENTRIES TOTAL
    false positive rate: 0.185780
    false negative rate: 0.211009
    accuracy: 0.603211
Results for race 5:
    21 ENTRIES TOTAL
    false positive rate: 0.285714
    false negative rate: 0.142857
    accuracy: 0.571429
Results for race 6:
    112 ENTRIES TOTAL
    false positive rate: 0.071429
    false negative rate: 0.312500
    accuracy: 0.616071
Results for race 7:
    80 ENTRIES TOTAL
    false positive rate: 0.125000
    false negative rate: 0.262500
    accuracy: 0.612500
Results for race 8:
    35 ENTRIES TOTAL
    false positive rate: 0.057143
    false negative rate: 0.542857
    accuracy: 0.400000
Results for race 9:
    11 

# n-NB Models

In [31]:
# One bucket of feature rows and one bucket of labels per race code (0..26),
# filled from the nonwhite data using the code stored at the same position in `race`.
race_fields = [[] for _ in range(27)]
race_labels = [[] for _ in range(27)]
for idx, row in enumerate(nonwhite):
    code = race[idx]
    race_fields[code].append(row)
    race_labels[code].append(income_n[idx])

# Number of rows available for each race code, one per line.
print(*(len(group) for group in race_fields), sep="\n")

0
0
14541
1944
6790
791
824
767
540
164
133
45
20
7
2
88
129
8
2
10
4
87
3
2
0
1
16


In [45]:
def run_race(race, fields, labels):
    """Run the full pipeline for one group: cross-validate, refit, break down by sex.

    Args:
        race: code of the group being processed; not used inside the function.
        fields: feature rows for the group.
        labels: labels aligned with ``fields``.

    Side effects:
        Prints a heading before each stage, followed by whatever ``run_kfold``,
        ``run_model`` and ``sex_accuracy`` print.
    """
    divider = "-----------------------"

    def heading(title):
        # Title, underline, then one blank line.
        print(title)
        print(divider)
        print()

    heading("TRAINING")
    training, testing = run_kfold(fields, labels)

    print()
    heading("RUNNING THE BEST MODEL")
    res = run_model(training, testing, fields, labels)

    print()
    heading("ACCURACY BY SEX")
    sex_accuracy(testing, fields, labels, res)

In [47]:
# Train and evaluate a separate model for every race code that has more than 350 rows.
for i, group_fields in enumerate(race_fields):
    if len(group_fields) <= 350:
        continue
    print()
    print("RACE %d " % i)
    print()
    run_race(i, group_fields, race_labels[i])


RACE 2 

TRAINING
-----------------------

accuracy rate:  0.6336769759450172
accuracy rate:  0.5845942228335625
accuracy rate:  0.5914718019257221
accuracy rate:  0.6306740027510316
accuracy rate:  0.640990371389271
accuracy rate:  0.6898211829436038
accuracy rate:  0.6114167812929848
accuracy rate:  0.671939477303989
accuracy rate:  0.6657496561210454
accuracy rate:  0.6004126547455296

RUNNING THE BEST MODEL
-----------------------

false positive rate: 0.090784
false negative rate: 0.219395
accuracy: 0.689821

ACCURACY BY SEX
-----------------------

Accuracy for male:
    false positive rate: 0.081571
    false negative rate: 0.229607
    accuracy: 0.688822
Accuracy for female:
    false positive rate: 0.098485
    false negative rate: 0.210859
    accuracy: 0.690657

RACE 3 

TRAINING
-----------------------

accuracy rate:  0.5538461538461539
accuracy rate:  0.5538461538461539
accuracy rate:  0.6256410256410256
accuracy rate:  0.6205128205128205
accuracy rate:  0.61855670103092