In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 15
#YEAR = '2000'
YEAR = '2010'

io = '/Users/ozaltun/Documents/GitHub/ethnicolr/ethnicolr/data/harvard/firstnames.xlsx'
datadesc = pd.read_excel(io, sheet_name=0, header=0)

df = pd.read_excel(io, sheet_name=1, header=0)
df

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace
0,AARON,3646,2.880,91.607,3.264,2.057,0.055,0.137
1,ABBAS,59,0.000,71.186,3.390,25.424,0.000,0.000
2,ABBEY,57,0.000,96.491,3.509,0.000,0.000,0.000
3,ABBIE,74,1.351,95.946,2.703,0.000,0.000,0.000
4,ABBY,262,1.527,94.656,1.527,2.290,0.000,0.000
5,ABDUL,221,0.452,44.344,6.787,47.511,0.452,0.452
6,ABDULLAH,44,2.273,65.909,9.091,22.727,0.000,0.000
7,ABE,31,6.452,77.419,12.903,3.226,0.000,0.000
8,ABEL,327,82.569,12.844,2.141,2.446,0.000,0.000
9,ABELARDO,76,78.947,9.211,0.000,11.842,0.000,0.000


## Resampling with weight

In [2]:
sdf = df.sample(400000, weights=df['obs'], replace=True)

In [3]:
sdf = sdf.dropna()

## Assign race by pertcentage

In [4]:
from numpy.random import choice

races = ['white', 'black', 'api', 'aian', 'prace','hispanic']

def to_race(c):
    w = np.array(c).astype(float)
    probs = w/w.sum()
    return choice(races, p=probs)

sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace','pcthispanic']].apply(lambda c: to_race(c), axis=1)
sdf

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace,race
2248,KYLE,2487,0.362,96.381,1.407,1.568,0.121,0.161,white
3327,ROBERT,48260,2.138,94.511,2.161,0.924,0.128,0.137,white
2402,LINDA,16217,2.097,92.674,3.114,1.899,0.080,0.136,white
2705,MATTHEW,15316,1.019,97.349,0.679,0.783,0.078,0.091,white
3840,TERESA,5068,11.089,83.268,2.802,2.506,0.237,0.099,white
2186,KIMBERLY,9637,1.090,93.982,3.756,0.892,0.104,0.176,white
3943,TRICIA,596,2.013,92.282,3.020,2.181,0.168,0.336,white
879,DANIEL,20876,6.189,90.491,1.150,1.859,0.172,0.139,white
4250,ALL OTHER FIRST NAMES,214124,8.226,51.422,11.541,28.117,0.396,0.298,white
3119,PERRY,595,1.345,88.571,7.059,2.857,0.000,0.168,white


## Check the correctness of race assignment

In [5]:
df[df.firstname == 'FRANCIS']

Unnamed: 0,firstname,obs,pcthispanic,pctwhite,pctblack,pctapi,pctaian,pct2prace
1343,FRANCIS,2114,2.271,88.269,3.075,6.197,0.095,0.095


In [6]:
# Additional features
sdf['name_first'] = sdf.firstname.str.title()
sdf.groupby('race').agg({'name_first': 'count'})

Unnamed: 0_level_0,name_first
race,Unnamed: 1_level_1
aian,677
api,25224
black,16646
hispanic,27339
prace,681
white,329433


In [7]:
len(sdf)

400000

## Preprocessing the input data

In [8]:
# only last name in Census data
sdf['name_last_name_first'] = sdf['name_first']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
#vect = CountVectorizer(analyzer='char', ngram_range=(2, 2), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_
len(vocab)

642

In [9]:
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))

In [10]:
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df

Unnamed: 0,F,N,O,Aa,Ab,Ad,Af,Ag,Ah,Ai,...,ze,zh,zi,zk,zl,zo,zr,zt,zu,zy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
count_df.sum().sort_values(ascending=False).describe()

count      642.000000
mean      3749.129283
std       8578.366415
min          3.000000
25%         50.250000
50%        439.000000
75%       3009.750000
max      68058.000000
dtype: float64

In [12]:
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)

er    68058
th    55835
ar    52323
am    50164
he    49458
ll    48196
es    47657
an    46296
st    42306
me    41737
      ...  
wt        3
Sn        3
Pu        3
Xo        3
Ob        3
tj        3
cr        3
qi        3
Lj        3
Ih        3
Length: 642, dtype: int64

In [13]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 642


In [14]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

Max feature len = 20, Avg. feature len = 6


## Train a LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [15]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


320000 train sequences
80000 test sequences
Pad sequences (samples x time)
X_train shape: (320000, 20)
X_test shape: (80000, 20)
6 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (320000, 6)
y_test shape: (80000, 6)


In [16]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Build model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            20544     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 103,750
Trainable params: 103,750
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Instructions for updating:
Use tf.cast instead.
Train on 288000 samples, validate on 32000 samples
Epoch 1/15
 - 166s - loss: 0.5328 - acc: 0.8478 - val_loss: 0.5093 - val_acc: 0.8529
Epoch 2/15
 - 161s - loss: 0.5007 - acc: 0.8564 - val_loss: 0.4960 - val_acc: 0.8589
Epoch 3/15
 - 154s - loss: 0.4911 - acc: 0.8589 - val_loss: 0.4951 - val_acc: 0.8587
Epoch 4/15


KeyboardInterrupt: 

In [48]:
#predicted = pd.DataFrame(model.predict(X_test), columns = ['white', 'black', 'api', 'aian', 'prace','hispanic'])

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

## Confusion Matrix

In [43]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

             precision    recall  f1-score   support

       aian       0.00      0.00      0.00        11
        api       0.81      0.16      0.26       500
      black       0.38      0.01      0.02       329
   hispanic       0.70      0.46      0.56       528
      prace       0.00      0.00      0.00        13
      white       0.87      0.99      0.92      6619

avg / total       0.83      0.86      0.82      8000

[[   0    1    0    0    0   10]
 [   0   79    1   29    0  391]
 [   0    1    3    8    0  317]
 [   0    4    0  243    0  281]
 [   0    0    0    0    0   13]
 [   0   12    4   66    0 6537]]


  'precision', 'predicted', average, warn_for)
