# Neural networks for predicting ethnicity from surnames

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the memorial records; later cells read the `surn` and `ethnicity_col` columns.
memorial_csv_path = './MemorialData/memorial_data_final.csv'
memorial_data = pd.read_csv(memorial_csv_path)

In [9]:
# Turn the ethnicity labels into a pandas categorical and use the integer
# category codes as the classification target.
# NOTE(review): pandas assigns code -1 to missing labels -- confirm the column
# contains no NaNs before these codes are one-hot encoded.
memorial_data['ethnicity_col'] = memorial_data['ethnicity_col'].astype('category')
y = memorial_data['ethnicity_col'].cat.codes

### Hyperparameters

In [16]:
# --- n-gram features ---
NGRAMS = 2         # n-gram size given to the vectorizer and to find_ngrams
feature_len = 18   # sequence length used for padding and for the Embedding input_length

# --- training ---
batch_size = 32    # passed to model.fit and model.evaluate

# --- not referenced by the cells visible in this part of the notebook ---
# NOTE(review): the model cell hard-codes an embedding size of 32 and the
# training cell hard-codes epochs=5; confirm whether these constants are
# still meant to drive those values.
embedding_dim = 100
EPOCHS = 15
maxlen = 50
training_samples = 750000
validation_samples = 250000
#SAMPLE = 1000000

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count character n-grams of size NGRAMS in every surname (case is preserved).
# n-grams occurring in more than 30% of the surnames, or in fewer than 3 of
# them, are excluded from the vocabulary.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    lowercase=False,
    max_df=0.3,
    min_df=3,
)
a = vect.fit_transform(memorial_data.surn)  # sparse (surname x n-gram) count matrix
vocab = vect.vocabulary_                    # n-gram -> column index in `a`

In [7]:
# Rank the vocabulary by total frequency.
#
# The per-n-gram totals are obtained with one column-wise sum over the sparse
# count matrix instead of slicing a column out of it for every vocabulary
# entry, which repeated the same work once per n-gram.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# (total count, n-gram) pairs, most frequent first; ties are broken by the
# n-gram string in descending order, exactly as tuple comparison dictates.
words = sorted(
    ((ngram_totals[column], ngram) for ngram, column in vocab.items()),
    reverse=True,
)

# The position of an n-gram in this list is the integer index fed to the model.
words_list = [ngram for _, ngram in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n, vocabulary=None):
    """Encode ``text`` as the list of vocabulary indices of its character n-grams.

    Every run of ``n`` consecutive characters in ``text`` is looked up in
    ``vocabulary``; its position there becomes the n-gram's index.

    Parameters
    ----------
    text : str
        The string to encode (a surname in this notebook).
    n : int
        Size of the character n-grams.
    vocabulary : sequence of str, optional
        Known n-grams; an n-gram's position is its index. Defaults to the
        module-level ``words_list``.

    Returns
    -------
    list of int
        One index per n-gram, in order of appearance. Empty when ``text`` has
        fewer than ``n`` characters.

    Notes
    -----
    n-grams missing from the vocabulary are encoded as 0.
    NOTE(review): 0 is also the index of the first vocabulary entry, so an
    unknown n-gram cannot be told apart from it (nor from zero padding if the
    sequences are zero-padded afterwards). Changing this would alter the
    vocabulary size expected by the model, so it is only flagged here.
    """
    if vocabulary is None:
        vocabulary = words_list

    indices = []
    # zip over n progressively shifted copies of the text yields each window
    # of n consecutive characters.
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            idx = vocabulary.index(ngram)
        except ValueError:
            # Only "not in vocabulary" is treated as unknown; any other error
            # (or a KeyboardInterrupt during this long-running step) propagates.
            idx = 0
        indices.append(idx)
    return indices

# Encode every surname as its list of n-gram indices and collect the
# per-surname lists into a NumPy array.
X = np.array(
    memorial_data.surn.apply(lambda surname: find_ngrams(surname, NGRAMS))
)

# Length statistics of the encoded surnames; lengths are counted in n-grams,
# since each element of X is the index list produced by find_ngrams.
X_len = [len(sequence) for sequence in X]

max_name_len = max(X_len)
avg_name_len = int(np.mean(X_len))  # truncated to a whole number

print("Max (sur)name len = %d, Avg. (sur)name len = %d" % (max_name_len, avg_name_len))
#y = np.array(sdf.race.astype('category').cat.codes)

num_words = 1522
Max (sur)name len = 44, Avg. (sur)name len = 6


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [19]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Bring every index sequence to exactly feature_len entries.
print('Pad sequences (samples x time)')
X_train, X_test = (
    pad_sequences(sequences, maxlen=feature_len)
    for sequences in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels are integer codes, so the class count is the largest training code + 1.
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

# One-hot encode the labels.
# NOTE(review): this cell rebinds X_train/X_test/y_train/y_test, so running it
# a second time operates on already-converted data; re-run the split cell first.
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train, y_test = (
    to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

Pad sequences (samples x time)
X_train shape: (971636, 18)
X_test shape: (242909, 18)
11 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (971636, 11)
y_test shape: (242909, 11)


In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM

# Embedding -> LSTM -> softmax classifier over the n-gram index sequences.
model = Sequential([
    # One 32-dimensional vector per n-gram index, for sequences of feature_len steps.
    Embedding(num_words, 32, input_length=feature_len),
    # Single recurrent layer with dropout on both inputs and recurrent state.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One output probability per class.
    Dense(num_classes, activation='softmax'),
])

# The optimizer and its configuration are candidates for experimentation.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 18, 32)            48704     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_2 (Dense)              (None, 11)                1419      
Total params: 132,555
Trainable params: 132,555
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train, holding back 10% of the training data for validation.
# NOTE(review): epochs is the literal 5 here although EPOCHS = 15 is defined
# with the other hyperparameters -- confirm which value is intended.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=5,
    validation_split=0.1,
)

# Loss and accuracy on the held-out test split.
score, acc = model.evaluate(
    X_test,
    y_test,
    batch_size=batch_size,
)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 874472 samples, validate on 97164 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5