In [1]:
from __future__ import division, print_function, absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
ethnea_df = pd.read_csv('names_ethnea_genni_country_sample.csv')
ethnea_df

Unnamed: 0,AUID,Last,First,Ethnea,Genni,PubCountry
0,12872768_1,_Filho_,_Elias_Abdalla_,HISPANIC,M,Brazil
1,12565879_3,_Bou_Abdallah_,_Jad_,ARAB,M,France
2,17569133_1,_Abdel_Aziz_,_Ayman_,ARAB,M,USA
3,11213157_2,_Abdelmoula_,_Salma_,ARAB,F,Tunisia
4,11439058_1,_Abdou_,_Ibrahim_,ARAB,M,Egypt
5,10919608_3,_Abou_El_Fettouh_,_Hazem_,ARAB,M,USA
6,17314740_3,_Aboutaam_,_Rola_,ARAB,F,France
7,16564217_2,_Alvim_de_Abreu_Silva_Rodrigues_,_Aida_Alexandra_,HISPANIC,F,Brazil
8,6339395_2,_Cristina_Affonso_Scaletsky_,_Isabel_,HISPANIC,F,Brazil
9,7796089_2,_Agbenyega_,_Tsiri_,AFRICAN,-,Ghana


In [3]:
# First, build character n-gram features from the names for the DNN model
def extract_structure(word,n_char=2):
    """Split ``word`` into overlapping character n-grams.

    One window is sliced out of ``word`` for every end position in
    ``0 .. len(word) + n_char - 1``; each window reaches back at most
    ``n_char`` characters. Windows that touch either edge of the word are
    clipped by the slice and can therefore be shorter than ``n_char``.
    Windows equal to ``'_'`` or ``''`` are dropped.

    Parameters
    ----------
    word : str
        Text to split.
    n_char : int, default 2
        Maximum window width.

    Returns
    -------
    list of str
        The retained windows, in left-to-right order.
    """
    lookback = n_char - 1
    n_positions = len(word) + n_char
    windows = (word[max(pos - lookback, 0):pos + 1] for pos in range(n_positions))
    return [gram for gram in windows if gram != '_' and gram != '']

# Character 2-grams of the lower-cased first and last names: one list of n-grams per row.
first_name_struct = ethnea_df.First.apply(lambda x: extract_structure(x.lower(),2))
last_name_struct = ethnea_df.Last.apply(lambda x: extract_structure(x.lower(),2))
                                                                      

In [4]:
first_name_struct

0        [_e, el, li, ia, as, s_, _a, ab, bd, da, al, l...
1                                         [_j, ja, ad, d_]
2                                 [_a, ay, ym, ma, an, n_]
3                                 [_s, sa, al, lm, ma, a_]
4                         [_i, ib, br, ra, ah, hi, im, m_]
5                                 [_h, ha, az, ze, em, m_]
6                                     [_r, ro, ol, la, a_]
7        [_a, ai, id, da, a_, _a, al, le, ex, xa, an, n...
8                             [_i, is, sa, ab, be, el, l_]
9                                 [_t, ts, si, ir, ri, i_]
10                                    [_j, jo, os, se, e_]
11                        [_m, ma, an, nu, ue, el, la, a_]
12                            [_a, ar, rt, tu, ur, ro, o_]
13                                        [_a, al, li, i_]
14                                [_a, ah, hm, me, ed, d_]
15                                [_a, ah, hm, me, ed, d_]
16                    [_m, mo, oh, ha, am, mm, me, ed, d

In [5]:
# make struct dictionary
# Occurrence count of every n-gram. First names are scanned before last names,
# so n-grams from first names are inserted into the dict first.
struct_dict = {}
for name_structs in (first_name_struct, last_name_struct):
    for name_struct_i in name_structs:
        for struct_j in name_struct_i:
            struct_dict[struct_j] = struct_dict.get(struct_j, 0) + 1

In [6]:
# Vocabulary: every distinct n-gram collected in struct_dict.
struct_dict_keys = list(struct_dict.keys())
# Lower-cased ethnicity label of every row, and the sorted distinct labels (np.unique sorts).
ethnic_series = ethnea_df['Ethnea'].str.lower()
ethnic_keys = list(np.unique(ethnic_series.values))

In [7]:
# load test train data
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it comes from a trusted source.
# The unpacking rebinds ethnic_keys (computed from the dataframe above) with the value stored in the pickle.
# NOTE(review): the last element is bound to `tmp`; what it contains is not visible here.
with open('train_test_fix_index.pickle', 'rb') as f:
    trainIndex,testIndex,trainX,trainY,testX,testY,ethnic_keys,tmp = pickle.load(f)

In [8]:
# transform the dataset into structure
def transform_structure(name_struct):
    """Map each n-gram in ``name_struct`` to its 1-based vocabulary index.

    The vocabulary is the module-level list ``struct_dict_keys``. An n-gram
    that is not in the vocabulary is encoded as 0.

    Parameters
    ----------
    name_struct : iterable of str
        N-grams, e.g. the output of ``extract_structure``.

    Returns
    -------
    list of int
        One index per n-gram, in input order. No padding is added.
    """
    list_structure = []
    for x in name_struct:
        try:
            list_structure.append(struct_dict_keys.index(x)+1)
        except ValueError:
            # Only "not in the vocabulary" maps to 0. Catching BaseException
            # here would also hide an undefined ``struct_dict_keys``
            # (NameError) or a KeyboardInterrupt and silently return zeros.
            list_structure.append(0)
    return list_structure

In [9]:
# Encode every name's n-gram list as a list of vocabulary indices.
first_name_ds = first_name_struct.apply(transform_structure)
last_name_ds = last_name_struct.apply(transform_structure)

In [10]:
len(struct_dict_keys)

666

In [11]:
def transform_labels(x):
    """One-hot encode label ``x`` against the module-level ``ethnic_keys`` list.

    Returns a float array of length ``len(ethnic_keys)`` with a 1 at the
    position of ``x``. Raises ``ValueError`` (from ``list.index``) when ``x``
    is not in ``ethnic_keys``.
    """
    one_hot = np.zeros(len(ethnic_keys))
    position = ethnic_keys.index(x)
    one_hot[position] = 1
    return one_hot

# One one-hot row per sample, stacked into a 2-D array.
labels = np.array([transform_labels(label) for label in ethnic_series])

In [12]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from tflearn.data_utils import to_categorical, pad_sequences



Using TensorFlow backend.


In [13]:
first_name_ds

0          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
1                                         [15, 16, 17, 18]
2                                  [7, 19, 20, 21, 22, 23]
3                                 [24, 25, 11, 26, 21, 14]
4                         [27, 28, 29, 30, 31, 32, 33, 34]
5                                 [35, 36, 37, 38, 39, 34]
6                                     [40, 41, 42, 13, 14]
7        [7, 43, 44, 10, 14, 7, 11, 45, 46, 47, 22, 48,...
8                               [27, 50, 25, 8, 51, 2, 52]
9                                 [53, 54, 55, 56, 57, 58]
10                                    [15, 59, 60, 61, 62]
11                         [63, 21, 22, 64, 65, 2, 13, 14]
12                             [7, 66, 67, 68, 69, 41, 70]
13                                          [7, 11, 3, 58]
14                                 [7, 31, 71, 72, 73, 18]
15                                 [7, 31, 71, 72, 73, 18]
16                    [63, 74, 75, 36, 76, 77, 72, 73, 1

In [14]:
from keras.layers import Input
from keras.models import Model

# max sequence length
# NOTE(review): seq_length is not referenced by any other line of this cell.
seq_length = 100
# multi input with single output

# first name input
# Both inputs have the width of the n-gram vocabulary (len(struct_dict_keys)).
first_name_input = Input(shape=(len(struct_dict_keys),),name='first_name_input')
last_name_input = Input(shape=(len(struct_dict_keys),),name='last_name_input')

# first tensor for first name
# No activation argument is passed to these two layers, so they use the Dense default.
first_name_l = Dense(units=1000)(first_name_input)
last_name_l = Dense(units=1000)(last_name_input)

# merge the two layers together
x = keras.layers.concatenate([first_name_l,last_name_l])

# stack dense layers on top of the merged representation
x = Dense(1000, activation='relu')(x)
x = Dense(500, activation='relu')(x)
# One softmax unit per entry of ethnic_keys.
output_l = Dense(len(ethnic_keys),activation='softmax')(x)

model = Model(inputs=[first_name_input, last_name_input], outputs=[output_l])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [15]:
def _ngram_count_matrix(encoded_names, vocab_size):
    """Build an int32 count matrix with one row per name and one column per n-gram.

    ``encoded_names`` is an iterable of lists of 1-based vocabulary indices
    (as produced by ``transform_structure``). Index ``k`` is counted in
    column ``k - 1``. Index 0 marks an n-gram that is not in the vocabulary
    and is skipped: ``0 - 1`` would otherwise wrap around to the last column
    and inflate the count of an unrelated n-gram.
    """
    counts = np.zeros((len(encoded_names), vocab_size), dtype=np.int32)
    for row, indices in enumerate(encoded_names):
        for index in indices:
            if index > 0:
                counts[row, index - 1] += 1
    return counts

first_name_ds_mat = _ngram_count_matrix(first_name_ds, len(struct_dict_keys))
last_name_ds_mat = _ngram_count_matrix(last_name_ds, len(struct_dict_keys))

In [16]:
model.summary()
# Select the train / test rows of both count matrices and of the one-hot labels
# using the index arrays loaded from the pickle. This rebinds trainY / testY.
first_trainX = first_name_ds_mat[trainIndex]
first_testX = first_name_ds_mat[testIndex]
last_trainX = last_name_ds_mat[trainIndex]
last_testX = last_name_ds_mat[testIndex]
trainY = labels[trainIndex]
testY = labels[testIndex]

batch_size = 1000
# NOTE(review): input_dim is not used in this cell; kept because other cells may read it.
input_dim = 50

# Train one epoch at a time so the test accuracy can be printed after every epoch.
for x in range(10):
    model.fit([first_trainX, last_trainX],trainY,epochs=1,batch_size=batch_size)
    scores = model.evaluate([first_testX, last_testX],testY,verbose=0)
    print("Accuracy: %.2f%%" %(scores[1]*100))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
first_name_input (InputLayer)    (None, 666)           0                                            
____________________________________________________________________________________________________
last_name_input (InputLayer)     (None, 666)           0                                            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1000)          667000      first_name_input[0][0]           
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1000)          667000      last_name_input[0][0]            
___________________________________________________________________________________________

In [71]:
# Same per-epoch fit / evaluate loop as the cell above, called again on the existing `model` object.
# NOTE(review): the execution count of this cell (In [71]) is out of sequence with its neighbours;
# presumably it continues training for 10 more epochs — confirm under Restart & Run All.
for x in range(10):
    model.fit([first_trainX, last_trainX],trainY,epochs=1,batch_size=1000)
    scores = model.evaluate([first_testX, last_testX],testY,verbose=0)
    print("Accuracy: %.2f%%" %(scores[1]*100))

Epoch 1/1
Accuracy: 85.43%
Epoch 1/1
Accuracy: 85.30%
Epoch 1/1
Accuracy: 85.37%
Epoch 1/1
Accuracy: 85.66%
Epoch 1/1
Accuracy: 85.66%
Epoch 1/1
Accuracy: 85.70%
Epoch 1/1
Accuracy: 85.69%
Epoch 1/1
Accuracy: 85.75%
Epoch 1/1
Accuracy: 85.74%
Epoch 1/1
Accuracy: 85.76%


In [17]:
def trans_name(name):
    """Encode one name as a single-row n-gram count matrix.

    The name is lower-cased, spaces are replaced by underscores and one
    underscore is added at each end. The result is split with
    ``extract_structure``, mapped to vocabulary indices with
    ``transform_structure`` and counted per vocabulary column.

    Parameters
    ----------
    name : str

    Returns
    -------
    numpy.ndarray
        int32 array of shape ``(1, len(struct_dict_keys))``.
    """
    name = name.lower()
    # transform space into underscore
    name = '_'+name.replace(' ','_')+'_'
    # transform the name into sequence structure
    encoded = transform_structure(extract_structure(name))
    name_ds_mat = np.zeros((1,len(struct_dict_keys)),dtype=np.int32)
    for index in encoded:
        # 0 marks an n-gram that is not in the vocabulary. ``index - 1`` would
        # be -1 and wrap around to the last column, so such n-grams are skipped.
        if index > 0:
            name_ds_mat[0,index-1]+=1
    return name_ds_mat

def predict_ethnicity(fname,lname):
    """Rank every ethnicity label for a first name / last name pair.

    Both names are encoded with ``trans_name`` and passed to the module-level
    two-input ``model``.

    Parameters
    ----------
    fname : str
    lname : str

    Returns
    -------
    list of (label, score)
        One tuple per entry of ``ethnic_keys``, sorted by descending score.
    """
    fnamex = trans_name(fname)
    lnamex = trans_name(lname)
    pred = model.predict([np.array(fnamex),np.array(lnamex)])
    # The batch holds a single sample, so only its score row is needed.
    scores = pred[0]
    return [(ethnic_keys[x], scores[x]) for x in np.argsort(scores)[::-1]]

In [22]:
predict_ethnicity('Shinji','Kagawa')
#trans_name('Nikolaus')

[('japanese', 1.0),
 ('indian', 5.0325966e-08),
 ('african', 1.4800873e-08),
 ('indonesian', 8.8743637e-11),
 ('arab', 8.7172121e-13),
 ('korean', 3.7096197e-13),
 ('slav', 1.1345596e-13),
 ('german', 5.7383182e-14),
 ('thai', 5.4225884e-14),
 ('chinese', 2.72746e-14),
 ('english', 3.5170739e-15),
 ('israeli', 1.5735749e-15),
 ('french', 2.2933817e-17),
 ('turkish', 1.2016868e-17),
 ('greek', 1.1056446e-17),
 ('nordic', 7.038771e-18),
 ('hispanic', 6.3994195e-18),
 ('hungarian', 5.473129e-18),
 ('vietnamese', 5.199001e-18),
 ('dutch', 4.4446176e-18),
 ('baltic', 4.1480262e-18),
 ('romanian', 1.9028031e-18),
 ('italian', 1.5801921e-22)]

In [8]:
# embed the structure vocabulary using text embedding and reduce the dimensionality

# convert the names into word structure vector
struct_dict_keys = list(struct_dict.keys())

def transform_structure(name_struct):
    """Map each n-gram in ``name_struct`` to its 1-based vocabulary index.

    The vocabulary is the module-level list ``struct_dict_keys``. An n-gram
    that is not in the vocabulary is encoded as 0.

    Parameters
    ----------
    name_struct : iterable of str
        N-grams, e.g. the output of ``extract_structure``.

    Returns
    -------
    list of int
        One index per n-gram, in input order. No padding is added.
    """
    list_structure = []
    for x in name_struct:
        try:
            list_structure.append(struct_dict_keys.index(x)+1)
        except ValueError:
            # Only "not in the vocabulary" maps to 0. Catching BaseException
            # here would also hide an undefined ``struct_dict_keys``
            # (NameError) or a KeyboardInterrupt and silently return zeros.
            list_structure.append(0)
    return list_structure

#data_source = full_name_struct.apply(lambda x: transform_structure(x))

In [9]:
def transform_labels(x):
    """One-hot encode label ``x`` against the module-level ``ethnic_keys`` list.

    Returns a float array of length ``len(ethnic_keys)`` with a 1 at the
    position of ``x``. Raises ``ValueError`` (from ``list.index``) when ``x``
    is not in ``ethnic_keys``.
    """
    one_hot = np.zeros(len(ethnic_keys))
    position = ethnic_keys.index(x)
    one_hot[position] = 1
    return one_hot

# One one-hot row per sample, stacked into a 2-D array.
labels = np.array([transform_labels(label) for label in ethnic_series])

In [26]:
# using tflearn make the graph creation simple
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.model_selection import train_test_split

# separate train and test set
# NOTE(review): `data_source` is only assigned in a commented-out line of an earlier cell,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
# NOTE(review): no random_state is passed, so the split differs between runs.
trainX, testX, trainY, testY = train_test_split(data_source,[ethnic_keys.index(x) for x in ethnic_series],test_size = 0.2)

# Pad every index sequence to length 50 with 0.
trainX = pad_sequences(trainX, maxlen=50,value=0.)
testX = pad_sequences(testX,maxlen=50,value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY,nb_classes=len(ethnic_keys))
testY = to_categorical(testY,nb_classes=len(ethnic_keys))    

In [16]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load these files if they come from a trusted source.
# This rebinds trainX / trainY / testX / testY, ethnic_keys and struct_dict_keys with the pickled values.
with open('train_test_fix.pickle', 'rb') as f:
    trainX,trainY,testX,testY,ethnic_keys,struct_dict_keys = pickle.load(f)
    #aha = pickle.load(f)

#with open('traintest-smote.pickle','rb') as f:
#    train_res,test_res = pickle.load(f)

# ethnic_keys is rebound a second time here; the value from this file is the one that remains.
with open('ethnic_keys.pickle','rb') as f:
    name_struct_keys,ethnic_keys = pickle.load(f)

# Hyper-parameters of the sequence model.
embedding_vector_length = 1000
# NOTE(review): lstm_layer is only referenced by a commented-out layer in the visible notebook.
lstm_layer = 1000
max_sequence = 50

In [None]:
# convert categorical to binary crossentropy
#trainY = np.array([np.where(x>0)[0][0] for x in trainY])
#testY = np.array([np.where(x>0)[0][0] for x in testY])

#test_res

In [2]:
# Sequence classifier over n-gram index sequences of length max_sequence:
# Embedding -> Conv1D -> bidirectional LSTM -> softmax over ethnic_keys.
model = Sequential()
# The embedding table has len(name_struct_keys)+1 rows: vocabulary indices are 1-based, plus index 0.
model.add(Embedding(len(name_struct_keys)+1,embedding_vector_length,input_length=max_sequence))
model.add(Conv1D(filters=embedding_vector_length,kernel_size=3,padding='same',activation='relu'))
#model.add(MaxPooling1D(pool_size=2))
#model.add(LSTM(lstm_layer,dropout=0.8))
model.add(Bidirectional(LSTM(max_sequence*2,return_sequences=False),input_shape=(max_sequence,1)))
model.add(keras.layers.Dense(len(ethnic_keys),activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# adds a stray "None" line to the cell output.
model.summary()
# Train one epoch at a time so the test accuracy can be printed after every epoch.
for x in range(10):
    model.fit(trainX,trainY,epochs=1,batch_size=1000)
    scores = model.evaluate(testX,testY,verbose=0)
    print("Accuracy: %.2f%%" %(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 1000)          62696000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 1000)          3001000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               880800    
_________________________________________________________________
dense_1 (Dense)              (None, 23)                4623      
Total params: 66,582,423
Trainable params: 66,582,423
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Accuracy: 75.88%
Epoch 1/1
Accuracy: 84.37%
Epoch 1/1
Accuracy: 85.12%
Epoch 1/1
Accuracy: 85.57%
Epoch 1/1
Accuracy: 85.63%
Epoch 1/1
Accuracy: 85.55%
Epoch 1/1
Accuracy: 85.79%
Epoch 1/1
Accuracy: 85.69%
Epoch 1/1
Accuracy: 85.82%
Epoch 1/1
Accu

In [4]:
testX

array([[  112,  1331,  1332, ...,     0,     0,     0],
       [  251,   252,   232, ...,     0,     0,     0],
       [ 4633, 11731, 23123, ...,     0,     0,     0],
       ..., 
       [  696,   697,   698, ...,     0,     0,     0],
       [   25,    26,  5195, ...,     0,     0,     0],
       [ 8671,  8672,  6568, ...,     0,     0,     0]], dtype=int32)

In [12]:
from keras.models import model_from_json
from keras import backend as K

# custom metrics: F1 score, precision and recall
def f1_score(y_true, y_pred):
    """F1 score computed with Keras backend ops over all entries of the batch.

    Values are clipped to [0, 1] and rounded, so an entry counts as positive
    when it rounds to 1.

    Parameters
    ----------
    y_true, y_pred : backend tensors of the same shape

    Returns
    -------
    Scalar backend tensor. The result is 0 when there are no true positives,
    including when there are no true or no predicted positives.
    """
    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives

    # A Python ``if c3 == 0`` is evaluated when the metric is built, not per
    # batch, so it cannot guard a symbolic tensor against a zero denominator.
    # Adding epsilon keeps every division finite instead.
    eps = K.epsilon()

    # How many selected items are relevant?
    precision = c1 / (c2 + eps)

    # How many relevant items are selected?
    recall = c1 / (c3 + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

def precision(y_true, y_pred):
    """Precision computed with Keras backend ops over all entries of the batch.

    Values are clipped to [0, 1] and rounded, so an entry counts as positive
    when it rounds to 1.

    Parameters
    ----------
    y_true, y_pred : backend tensors of the same shape

    Returns
    -------
    Scalar backend tensor: true positives / predicted positives, or 0 when
    nothing is predicted positive.
    """
    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives

    # A Python-level ``== 0`` check cannot guard a symbolic tensor per batch;
    # epsilon in the denominator keeps the division finite instead.
    return c1 / (c2 + K.epsilon())


def recall(y_true, y_pred):
    """Recall computed with Keras backend ops over all entries of the batch.

    Values are clipped to [0, 1] and rounded, so an entry counts as positive
    when it rounds to 1.

    Parameters
    ----------
    y_true, y_pred : backend tensors of the same shape

    Returns
    -------
    Scalar backend tensor: true positives / actual positives, or 0 when there
    are no actual positives.
    """
    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives

    # A Python-level ``== 0`` check cannot guard a symbolic tensor per batch;
    # epsilon in the denominator keeps the division finite instead.
    return c1 / (c3 + K.epsilon())

# load model
# Rebuild the architecture from its JSON description, then load the trained weights.
# The `with` block closes the file even if reading fails.
with open('model-keras-embed-bilstm-womaxpool.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
#                               ,custom_objects= {'f1_score': f1_score})
loaded_model.load_weights("model-keras-embed-bilstm-womaxpool-10.h5")

# Compile with the custom metrics so evaluate() returns
# [loss, accuracy, f1_score, precision, recall].
loaded_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy',f1_score,precision,recall])

In [18]:
scores = loaded_model.evaluate(testX,testY,verbose=0)

In [19]:
print('Accuracy: {}, F1: {}, Precision: {}, Recal: {}'.format(scores[1],scores[2],scores[3],scores[4]))

Accuracy: 0.8561812878216707, F1: 0.8601400889915978, Precision: 0.8757469038779302, Recal: 0.845523895023381


In [181]:
from tflearn.data_utils import to_categorical, pad_sequences

# transform prediction
# given name compute the prediction
def predict_ethnicity(name):
    """Rank every ethnicity label for a full name using ``loaded_model``.

    The name is lower-cased, spaces are replaced by underscores and one
    underscore is added at each end. It is then split with
    ``extract_structure``, mapped to vocabulary indices with
    ``transform_structure`` and padded to length 50 with 0.

    Parameters
    ----------
    name : str

    Returns
    -------
    list of (label, score)
        One tuple per entry of ``ethnic_keys``, sorted by descending score.
    """
    # lower case the name
    name = name.lower()
    # transform space into underscore
    name = '_'+name.replace(' ','_')+'_'
    # transform the name into sequence structure
    encoded = transform_structure(extract_structure(name))
    padded = pad_sequences([encoded], maxlen=50,value=0.)
    # The batch holds a single sample, so only its score row is needed.
    scores = loaded_model.predict(padded)[0]
    return [(ethnic_keys[x], scores[x]) for x in np.argsort(scores)[::-1]]

# Scratch check of the encoding steps on a raw name (no underscore wrapping is applied here).
name='helen lamothe'
ext_name = extract_structure(name)
#print(ext_name)
# NOTE(review): this rebinds the global name `trans_name` to a list; if a `trans_name`
# function was defined earlier in the session it is no longer callable after this cell.
trans_name = transform_structure(ext_name)
#trans_name
# The padded array is not the last expression of the cell and is not assigned, so it is discarded.
pad_sequences([trans_name], maxlen=50,value=0.)
#extract_structure('Nikolaus Nova')
#transform_structure('Robert Nova')
ethnic_prob = predict_ethnicity('Filho  Elias Abdalla')
#ethnic_prob

In [132]:
test = loaded_model.predict(trainX[10].reshape(1,50))
test

array([[  1.22544670e-03,   2.45175033e-05,   5.54050894e-06,
          3.56604069e-05,   1.31730601e-04,   1.58663862e-03,
          9.95586514e-01,   3.65224201e-04,   4.18016425e-05,
          1.64734403e-04,   3.29397808e-05,   1.42851750e-05,
          5.77516516e-07,   3.13429664e-05,   1.98912196e-04,
          5.65968139e-06,   4.63458673e-06,   4.45792568e-04,
          5.28864875e-05,   2.15678101e-05,   8.53615438e-06,
          4.57901763e-07,   1.44505730e-05]], dtype=float32)

In [133]:
np.argsort(test)[0][::-1]

array([ 6,  5,  0, 17,  7, 14,  9,  4, 18,  8,  3, 10, 13,  1, 19, 22, 11,
       20, 15,  2, 16, 12, 21])

In [134]:
np.where(trainY[10]==1)

(array([6]),)

In [151]:
# Scratch check: encode a name that already carries the underscore markers and show the padded index row.
name='_helen__lamothe_'
ext_name = extract_structure(name)
#print(ext_name)
# NOTE(review): rebinds the global name `trans_name` to a list of indices.
trans_name = transform_structure(ext_name)
#trans_name
pad_sequences([trans_name], maxlen=50,value=0.)

array([[    0,     0,     0,  2304,  2305,     0,     0,     0,     0,
         4088, 15559,  2523, 15102,     0,     0,     0,    24,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]], dtype=int32)

In [191]:
predict_ethnicity('harry potter')

[('ITALIAN', 0.52735686),
 ('INDIAN', 0.30269179),
 ('KOREAN', 0.097788125),
 ('JAPANESE', 0.017470013),
 ('ROMANIAN', 0.015210837),
 ('TURKISH', 0.014220745),
 ('HISPANIC', 0.0092350421),
 ('GERMAN', 0.0046770978),
 ('BALTIC', 0.0032188322),
 ('ARAB', 0.0028918688),
 ('ISRAELI', 0.0022467086),
 ('GREEK', 0.0014537659),
 ('SLAV', 0.00046977124),
 ('NORDIC', 0.00024971511),
 ('HUNGARIAN', 0.0002173665),
 ('DUTCH', 0.00021363674),
 ('INDONESIAN', 0.00019762212),
 ('VIETNAMESE', 9.9613972e-05),
 ('AFRICAN', 3.4369114e-05),
 ('CHINESE', 3.3702971e-05),
 ('ENGLISH', 1.7328795e-05),
 ('FRENCH', 4.1550811e-06),
 ('THAI', 9.4588233e-07)]

In [175]:
ethnic_keys[15]

'ISRAELI'

In [71]:
ethnea_df['First']+ethnea_df['Last']+ethnea_df['Ethnea']

0                           _Elias_Abdalla__Filho_HISPANIC
1                                  _Jad__Bou_Abdallah_ARAB
2                                  _Ayman__Abdel_Aziz_ARAB
3                                  _Salma__Abdelmoula_ARAB
4                                     _Ibrahim__Abdou_ARAB
5                             _Hazem__Abou_El_Fettouh_ARAB
6                                     _Rola__Aboutaam_ARAB
7        _Aida_Alexandra__Alvim_de_Abreu_Silva_Rodrigue...
8             _Isabel__Cristina_Affonso_Scaletsky_HISPANIC
9                                _Tsiri__Agbenyega_AFRICAN
10                     _Jose__Maria_Aguado_Garcia_HISPANIC
11                      _Manuela__Aguilar_Guisado_HISPANIC
12                         _Arturo__Aguillon_Luna_HISPANIC
13                                   _Ali__Ahmadzadeh_ARAB
14                                    _Ahmed__Ibrahim_ARAB
15                                    _Ahmed__Letaief_ARAB
16                           _Mohammed__Shakeel_Ahmed_AR