In [21]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Length of the character n-grams used throughout the notebook (2 = bigrams)
NGRAMS = 2

# Wikilabels
df = pd.read_csv('data/wiki_name_race.txt')

# A usable record needs both a first and a last name; rows missing either
# are removed in place.
required_name_columns = ['name_first', 'name_last']
df.dropna(subset=required_name_columns, inplace=True)
df

Unnamed: 0,name_last,name_suffix,name_first,name_middle,race
0,heynis,,aafje,,"GreaterEuropean,WestEuropean,Germanic"
1,noordewier-reddingius,,aaltje,,"GreaterEuropean,WestEuropean,Germanic"
2,de quant,,abbie,,"GreaterEuropean,WestEuropean,Germanic"
4,ahanfouf,,abdelaziz,,"GreaterEuropean,WestEuropean,Germanic"
5,falaturi,,abdoldjavad,,"GreaterEuropean,WestEuropean,Germanic"
...,...,...,...,...,...
148267,dediÄ?,,zlatko,,"GreaterEuropean,WestEuropean,Italian"
148268,gattai,,zélia,,"GreaterEuropean,WestEuropean,Italian"
148269,bonaparte,,zénaïde,laetitia julie,"GreaterEuropean,WestEuropean,Italian"
148271,karbonopsina,,zoe,,"GreaterEuropean,WestEuropean,Italian"


In [22]:
# Fold the middle name into the first name. A missing middle name becomes an
# empty string, so such rows end up as "<first> " (with a trailing space).
middle_names = df['name_middle'].fillna('')
df['name_first'] = df['name_first'] + ' ' + middle_names

# The middle name now lives inside name_first, so the separate column is
# dropped. NOTE: this makes the cell non-re-runnable without reloading df.
df = df.drop(columns='name_middle')

In [23]:
# Full name in "<last> <first>" order; this is the text the n-grams come from
full_names = df['name_last'] + ' ' + df['name_first']
df['name_last_name_first'] = full_names

# Count character n-grams of exactly NGRAMS characters, keeping the original
# casing. `a` is the resulting name-by-n-gram count matrix and `vocab` maps
# each n-gram to its column in `a`.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    lowercase=False,
)
a = vect.fit_transform(df.name_last_name_first)
vocab = vect.vocabulary_

In [24]:
# Inspect the fitted vocabulary: each character n-gram mapped to its column index in `a`
vocab

{'he': 611,
 'ey': 496,
 'yn': 1413,
 'ni': 915,
 'is': 677,
 's ': 1129,
 ' a': 13,
 'aa': 276,
 'af': 281,
 'fj': 525,
 'je': 712,
 'e ': 464,
 'no': 921,
 'oo': 973,
 'or': 976,
 'rd': 1081,
 'de': 423,
 'ew': 494,
 'wi': 1339,
 'ie': 663,
 'er': 489,
 'r-': 1076,
 '-r': 115,
 're': 1082,
 'ed': 475,
 'dd': 422,
 'di': 427,
 'in': 672,
 'ng': 913,
 'gi': 568,
 'iu': 679,
 'us': 1261,
 'al': 287,
 'lt': 818,
 'tj': 1198,
 ' q': 29,
 'qu': 1066,
 'ua': 1243,
 'an': 289,
 'nt': 926,
 't ': 1184,
 'ab': 277,
 'bb': 327,
 'bi': 333,
 'ah': 283,
 'ha': 607,
 'nf': 912,
 'fo': 530,
 'ou': 979,
 'uf': 1248,
 'f ': 512,
 'bd': 329,
 'el': 483,
 'la': 799,
 'az': 301,
 'zi': 1451,
 'iz': 684,
 'z ': 1439,
 'fa': 516,
 'at': 295,
 'tu': 1208,
 'ur': 1260,
 'ri': 1086,
 'i ': 654,
 'do': 433,
 'ol': 970,
 'ld': 802,
 'dj': 428,
 'ja': 708,
 'av': 297,
 'va': 1287,
 'ad': 279,
 'd ': 412,
 'ho': 621,
 'lz': 824,
 'zm': 1455,
 'ma': 855,
 'nn': 920,
 'n ': 901,
 'be': 330,
 'wa': 1331,
 'ar': 293

In [26]:
# sort n-gram by freq (highest -> lowest)
# `vocab` maps each n-gram to its column index in the count matrix `a` (the
# value is NOT a frequency), so a single column-wise sum yields the
# corpus-wide count of every n-gram at once. Slicing `a[:, col]` separately
# for each vocabulary entry re-scans the sparse matrix every time and is
# needlessly slow.
ngram_counts = np.asarray(a.sum(axis=0)).ravel()

# Sorting (count, n-gram) pairs in reverse order puts the most frequent
# n-grams first; equal counts are ordered by the n-gram itself, descending.
words = sorted(
    ((ngram_counts[col], ngram) for ngram, col in vocab.items()),
    reverse=True,
)

# Position 0 is taken by the '<UNK>' placeholder; real n-grams start at 1.
words_list = ['<UNK>']
words_list.extend(ngram for _, ngram in words)
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 2323


In [27]:
# Peek at the first ten entries: the '<UNK>' placeholder followed by the most frequent n-grams
words_list[:10]

['<UNK>', 'an', 'n ', 'ar', 'er', 'e ', 'a ', 's ', 'ma', 'in']

In [30]:
# Two-way lookup between an n-gram and its rank in words_list
idx2ngram = dict(enumerate(words_list))
ngram2idx = {ngram: idx for idx, ngram in idx2ngram.items()}


def name_to_indices(name):
    """Return the indices of every NGRAMS-character window of `name`.

    Windows slide one character at a time; an n-gram that is not in
    ngram2idx is encoded as 0.
    """
    window_count = len(name) - NGRAMS + 1
    return [ngram2idx.get(name[start:start + NGRAMS], 0) for start in range(window_count)]


# build X from index of n-gram sequence
X = np.array(df.name_last_name_first.apply(name_to_indices))


In [48]:
# Distinct race labels in sorted order; a label's position is its class index
races = np.unique(df.race.astype('category'))
idx2race = dict(enumerate(races))
race2idx = {label: idx for idx, label in idx2race.items()}

In [49]:
# Show the distinct race labels found in the data
races

array(['Asian,GreaterEastAsian,EastAsian',
       'Asian,GreaterEastAsian,Japanese', 'Asian,IndianSubContinent',
       'GreaterAfrican,Africans', 'GreaterAfrican,Muslim',
       'GreaterEuropean,British', 'GreaterEuropean,EastEuropean',
       'GreaterEuropean,Jewish', 'GreaterEuropean,WestEuropean,French',
       'GreaterEuropean,WestEuropean,Germanic',
       'GreaterEuropean,WestEuropean,Hispanic',
       'GreaterEuropean,WestEuropean,Italian',
       'GreaterEuropean,WestEuropean,Nordic'], dtype=object)

In [53]:
# Encode every race label as its integer class index
encoded_races = df.race.apply(lambda label: race2idx[label])
y = np.array(encoded_races)

# Split train and test dataset (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

In [54]:
# Sanity check: report the shape of each split
print("X_train.shape = %s" % (X_train.shape,))
print("y_train.shape = %s" % (y_train.shape,))
print("X_test.shape = %s" % (X_test.shape,))
print("y_test.shape = %s" % (y_test.shape,))


X_train.shape = (107097,)
y_train.shape = (107097,)
X_test.shape = (26775,)
y_test.shape = (26775,)


In [63]:
# padd sequences to the 80th percentile
maxlen = np.percentile([len(x) for x in X_train], 80)
maxlen = int(maxlen)
print("maxlen = %d" % maxlen)

# pad with 0s (i.e. UNK token)
from keras.utils import pad_sequences
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post', truncating='post')


maxlen = 17


In [66]:
# save dataset 
np.savetxt('data/X_train.txt', X_train, fmt='%d')
np.savetxt('data/X_test.txt', X_test, fmt='%d')
np.savetxt('data/y_train.txt', y_train, fmt='%d')
np.savetxt('data/y_test.txt', y_test, fmt='%d')

In [None]:
# save vocab
with open('data/vocab.txt', 'w') as f:
    for v, fr in vocab.items():
        f.write(f"{v}\t{fr}\n")

# save ngram2idx
with open('data/ngram2idx.txt', 'w') as f:
    for v, fr in ngram2idx.items():
        f.write(f"{v}\t{fr}\n")

# save idx2ngram
with open('data/idx2ngram.txt', 'w') as f:
    for v, fr in idx2ngram.items():
        f.write(f"{v}\t{fr}\n")

# save race2idx
with open('data/race2idx.txt', 'w') as f:
    for v, fr in race2idx.items():
        f.write(f"{v}\t{fr}\n")

# save idx2race
with open('data/idx2race.txt', 'w') as f:
    for v, fr in idx2race.items():
        f.write(f"{v}\t{fr}\n")