In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Run-wide settings.
# NGRAMS: size of the character n-grams used to encode names (bigrams).
# EPOCHS: number of training epochs passed to model.fit further down.
NGRAMS = 2
EPOCHS = 15

# Read the Harvard first-name workbook.
# NOTE(review): this is an absolute path on one developer's machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
io = '/Users/boraozaltun/Documents/GitHub/ethnicolr/ethnicolr/data/harvard/firstnames.xlsx'
# Sheet 0 is loaded as `datadesc` (presumably the data description - confirm),
# sheet 1 holds the per-name table used for the rest of the notebook.
datadesc = pd.read_excel(io, sheet_name=0, header=0)
df = pd.read_excel(io, sheet_name=1, header=0)
# Drop the "All other names" row.
# NOTE(review): the row is identified by the hard-coded index label 4250;
# verify it still points at that row if the workbook changes.
df = df.drop(4250)
df

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace
0,AARON,3646,2.880,91.607,3.264,2.057,0.055,0.137
1,ABBAS,59,0.000,71.186,3.390,25.424,0.000,0.000
2,ABBEY,57,0.000,96.491,3.509,0.000,0.000,0.000
3,ABBIE,74,1.351,95.946,2.703,0.000,0.000,0.000
4,ABBY,262,1.527,94.656,1.527,2.290,0.000,0.000
5,ABDUL,221,0.452,44.344,6.787,47.511,0.452,0.452
6,ABDULLAH,44,2.273,65.909,9.091,22.727,0.000,0.000
7,ABE,31,6.452,77.419,12.903,3.226,0.000,0.000
8,ABEL,327,82.569,12.844,2.141,2.446,0.000,0.000
9,ABELARDO,76,78.947,9.211,0.000,11.842,0.000,0.000


## Resampling with weight

In [2]:
# Draw 200,000 rows with replacement, weighting each name by its `obs` count.
# random_state is fixed so a fresh "Restart & Run All" reproduces the same
# sample (21 matches the seed used for the train/test splits below).
sdf = df.sample(200000, weights=df['obs'], replace=True, random_state=21)

In [3]:
# Discard every sampled row that has a missing value in any column.
sdf = sdf.dropna(axis=0, how='any')

## Assign race by percentage

In [4]:
from numpy.random import choice

# Race labels, in the same order as the percentage columns handed to to_race.
races = ['white', 'black', 'api', 'aian', 'prace','hispanic']


def to_race(c):
    """Randomly pick one race label using ``c`` as relative weights.

    ``c`` holds one weight per entry of ``races`` (same order). The weights
    are normalised to probabilities before a single label is drawn.
    """
    weights = np.asarray(c, dtype=float)
    total = weights.sum()
    return choice(races, p=weights / total)

# Draw one race label per sampled row from that row's percentage columns
# (column order matches the order of `races`).
sdf['race'] = sdf[
    ['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']
].apply(to_race, axis=1)
sdf

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace,race
3933,TRAVIS,1908,0.734,94.235,3.564,0.996,0.419,0.052,white
2782,MICHAEL,59532,1.693,94.363,2.142,1.592,0.092,0.118,white
1674,IAN,1156,0.779,93.166,2.855,2.855,0.000,0.346,white
2186,KIMBERLY,9637,1.090,93.982,3.756,0.892,0.104,0.176,white
1934,JOHN,51696,1.729,94.384,1.729,1.932,0.124,0.101,white
2753,MELODY,634,2.997,89.748,2.997,3.785,0.473,0.000,white
1774,JAMES,45722,1.000,94.016,3.279,1.472,0.116,0.118,white
1971,JOSEPH,22172,2.936,92.139,2.553,2.120,0.144,0.108,white
3256,REBECCA,7229,3.486,94.052,0.830,1.369,0.124,0.138,white
2527,LYNN,3651,0.931,94.412,1.616,2.821,0.164,0.055,white


## Check the correctness of race assignment

In [5]:
# Spot-check the source percentages for a single name.
df.loc[df['firstname'] == 'FRANCIS']

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace
1343,FRANCIS,2114,2.271,88.269,3.075,6.197,0.095,0.095


In [6]:
# Title-cased copy of the first name, then the number of sampled rows per race.
sdf['name_first'] = sdf['firstname'].str.title()
sdf.groupby('race')[['name_first']].count()

Unnamed: 0_level_0,name_first
race,Unnamed: 1_level_1
aian,319
api,8631
black,6999
hispanic,13625
prace,278
white,170148


In [7]:
# Number of rows left in the resampled frame.
sdf.shape[0]

200000

## Preprocessing the input data

In [8]:
# The Harvard data carries first names only, so the model input column is
# simply the title-cased first name.
sdf['name_last_name_first'] = sdf['name_first']

# Character n-gram vocabulary: case-sensitive, keeping n-grams that occur in
# at least 3 names and in at most 30% of them.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    lowercase=False,
    min_df=3,
    max_df=0.3,
)
a = vect.fit_transform(sdf['name_last_name_first'])
vocab = vect.vocabulary_
len(vocab)

611

In [9]:
import operator
# Order the (n-gram, column index) pairs by column index, then keep just the
# n-grams so they can label the columns of the count matrix.
sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])
cols = [ngram for ngram, _ in sorted_vocab]

In [10]:
# Count table with one row per sampled name and one column per n-gram.
# NOTE(review): a.todense() materialises the whole sparse matrix in memory;
# with the 200,000 sampled rows and the 611-term vocabulary this is large.
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df

Unnamed: 0,Aa,Ab,Ad,Af,Ag,Ah,Ai,Aj,Ak,Al,...,ze,zh,zi,zk,zl,zo,zr,zt,zu,zy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Summary statistics of the total count of each n-gram (order-independent).
count_df.sum().describe()

count      611.000000
mean      1573.911620
std       3158.062455
min          3.000000
25%         37.000000
50%        249.000000
75%       1598.500000
max      28735.000000
dtype: float64

In [12]:
# Cap the number of rows pandas prints, then list the n-gram totals from most
# to least frequent.
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)

ar    28735
an    25340
er    19609
ri    17845
en    17085
el    16247
ic    15888
ha    13695
th    12903
in    12802
      ...  
zh        3
sv        3
zb        3
Zi        3
Us        3
yj        3
Hs        3
Ub        3
qb        3
uv        3
Length: 611, dtype: int64

In [13]:
# Rank the n-grams from most to least frequent. Each entry is a
# (total count, n-gram) pair, so ties on the count fall back to the n-gram
# itself in descending order.
words = sorted(
    ((a[:, col].sum(), ngram) for ngram, col in vocab.items()),
    reverse=True,
)
words_list = [ngram for _, ngram in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 611


In [14]:
def find_ngrams(text, n):
    """Encode ``text`` as a list of vocabulary indexes, one per character n-gram.

    Every run of ``n`` consecutive characters in ``text`` is looked up in the
    module-level ``words_list``; its position in that list is the index used.

    Parameters
    ----------
    text : str
        The name to encode.
    n : int
        Size of the character n-grams.

    Returns
    -------
    list of int
        One index per n-gram, in order of appearance. Empty when ``text`` is
        shorter than ``n``.

    Notes
    -----
    N-grams that are not in ``words_list`` are encoded as 0. That value is
    also the index of the first entry of ``words_list``, so the two cannot be
    told apart downstream.
    """
    indexes = []
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            idx = words_list.index(ngram)
        except ValueError:
            # Only "not in the vocabulary" is treated as out-of-vocabulary.
            # Any other failure (e.g. words_list not defined yet) must surface
            # instead of silently turning every n-gram into index 0.
            idx = 0
        indexes.append(idx)
    return indexes

# Encode every name as its sequence of n-gram vocabulary indexes.
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# Longest and (truncated) mean sequence length, to guide the padding length.
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Integer class codes of the race labels.
y = np.array(sdf.race.astype('category').cat.codes)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)

Max feature len = 11, Avg. feature len = 4


In [15]:
# Names and frame index labels split with the same settings (test size, seed,
# stratification) as the X/y split, for the in-house sampling accuracy.
# Splitting both arrays in one call applies one shared partition to both.
train_names, test_names, train_indexes, test_indexes = train_test_split(
    sdf.name_last_name_first.to_numpy(),
    sdf.index.values,
    test_size=0.2, random_state=21, stratify=y,
)

## Train an LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [16]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Prepare the model inputs: pad the index sequences to a fixed length and
# one-hot encode the class labels.
# NOTE(review): X_train/X_test and y_train/y_test are rebound in place, so this
# cell is not safe to run twice without re-running the split cell first.
max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Bring every sequence to exactly feature_len entries.
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Class codes start at 0, so the largest code plus one is the class count.
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

Using TensorFlow backend.


160000 train sequences
40000 test sequences
Pad sequences (samples x time)
X_train shape: (160000, 20)
X_test shape: (40000, 20)
6 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (160000, 6)
y_test shape: (40000, 6)


In [17]:
print('Build model...')

# Embedding -> LSTM -> softmax classifier with one output unit per class.
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            19552     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 102,758
Trainable params: 102,758
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Fit the model for EPOCHS epochs, holding out the last 10% of the training
# data for validation, then report loss and accuracy on the test split.
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 144000 samples, validate on 16000 samples
Epoch 1/15
 - 104s - loss: 0.4873 - acc: 0.8736 - val_loss: 0.4701 - val_acc: 0.8773
Epoch 2/15
 - 105s - loss: 0.4468 - acc: 0.8835 - val_loss: 0.4604 - val_acc: 0.8789
Epoch 3/15
 - 101s - loss: 0.4369 - acc: 0.8853 - val_loss: 0.4520 - val_acc: 0.8822
Epoch 4/15
 - 107s - loss: 0.4296 - acc: 0.8875 - val_loss: 0.4487 - val_acc: 0.8822
Epoch 5/15
 - 95s - loss: 0.4242 - acc: 0.8889 - val_loss: 0.4437 - val_acc: 0.8837
Epoch 6/15
 - 91s - loss: 0.4199 - acc: 0.8906 - val_loss: 0.4390 - val_acc: 0.8851
Epoch 7/15
 - 92s - loss: 0.4159 - acc: 0.8916 - val_loss: 0.4361 - val_acc: 0.8872
Epoch 8/15
 - 91s - loss: 0.4129 - acc: 0.8925 - val_loss: 0.4367 - val_acc: 0.8869
Epoch 9/15
 - 91s - loss: 0.4098 - acc: 0.8927 - val_loss: 0.4330 - val_acc: 0.8878
Epoch 10/15
 - 91s - loss: 0.4081 - acc: 0.8931 - val_loss: 0.4324 - val_acc: 0.8885
Epoch 11/15
 - 91s - loss: 0.4065 - acc: 0.8938 - val_loss: 0.4319 - val_acc: 0.8882
Epoch 12/1

### Inhouse accuracy measure. NEW!!

In [22]:
# Evaluation subsample of 40,000 rows.
# NOTE(review): it is drawn from all of `sdf`, so it can include rows the model
# was trained on. The alternative below restricts it to the test split.
sdf_subsample = sdf.sample(n=40000, random_state=2) #0.2
# sdf_subsample = sdf.loc[test_indexes]

# Encode and pad the subsampled names exactly like the training inputs.
X_inhouse_test = np.array(sdf_subsample.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))
X_inhouse_test = sequence.pad_sequences(X_inhouse_test, maxlen=feature_len)

# Predicted class probabilities; the column order follows the category codes
# of `race` (alphabetical: aian, api, black, hispanic, prace, white).
y_inhouse_predicted = pd.DataFrame(
    model.predict(X_inhouse_test),
    columns=['pctaian_predict', 'pctapi_predict',
             'pctblack_predict', 'pcthispanic_predict',
             'pct2prace_predict', 'pctwhite_predict'])

# True percentages rescaled from 0-100 to 0-1 and rounded to two decimals.
y_inhouse = sdf_subsample[[ 'pctaian','pctapi',  'pctblack','pcthispanic', 'pct2prace','pctwhite']]
y_inhouse = y_inhouse.apply(lambda x: round(x/100, 2))

# Give the predictions the same index as the true values so the two frames
# line up row for row.
y_inhouse_predicted = y_inhouse_predicted.set_index(y_inhouse.index)

# Both frames now share an identical index, so a plain column-wise concat is
# enough. The former `join_axes=` argument was removed in pandas 1.0 and makes
# this call raise TypeError on current pandas.
inhouse = pd.concat([y_inhouse, y_inhouse_predicted], axis=1)

In [23]:
from numpy.random import choice


def sampling_accuracy_4_one(row):
    """Estimate how often draws from the true and predicted distributions agree.

    ``row`` must carry the six true percentage columns and the six matching
    ``*_predict`` columns. Each set is normalised to sum to one, 100 class
    indexes are drawn from each, and the fraction of positions where the two
    draws coincide is returned.
    """
    true_cols = ['pctaian', 'pctapi', 'pctblack', 'pcthispanic', 'pct2prace', 'pctwhite']
    predicted_cols = ['pctaian_predict', 'pctapi_predict', 'pctblack_predict',
                      'pcthispanic_predict', 'pct2prace_predict', 'pctwhite_predict']

    def _as_probabilities(values):
        # Cast to float and rescale so the weights sum to one.
        weights = np.array(values.to_numpy()).astype(float)
        return weights / weights.sum()

    probs_true = _as_probabilities(row[true_cols])
    probs_predicted = _as_probabilities(row[predicted_cols])

    # Draw order (true first, then predicted) is kept so the random stream is
    # consumed in the same sequence.
    draws_true = choice(6, 100, p=probs_true)
    draws_predicted = choice(6, 100, p=probs_predicted)

    matches = draws_true == draws_predicted
    return matches.sum() / 100
 

In [None]:
# Per-row sampling accuracy between the true and predicted distributions.
inhouse['inhouse_acc'] = inhouse.apply(sampling_accuracy_4_one, axis=1)

In [None]:
# Average of the per-row sampling accuracies over the whole subsample.
inhouse['inhouse_acc'].mean()