In [1045]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping

In [983]:
# load the data
file_name = 'names.csv'
data = pd.read_csv(file_name, header=0, sep=',')

# remove English for easy prediction
data = data[data['nation'] != 'ENGLISH']
data

Unnamed: 0,first_name,last_name,nation
0,PONGSIRI,SUASAT,THAI
1,NOPADON,MISANG,THAI
2,KRISADA,SOMSUK,THAI
3,WORAWUT,SANGANGAM,THAI
4,KULPRIYA,KITCHAROEN,THAI
5,PATTHARAPONG,PIMONSAKONWONG,THAI
6,KANITTA,SUWANNARAT,THAI
7,JEDSADAKORN,PANICH,THAI
8,THITAPORN,KONKONG,THAI
9,DARUNWAN,THONGKHOENKHUN,THAI


In [985]:
# convert name to lower case
data['first_name'] = data['first_name'].str.lower()
data['last_name'] = data['last_name'].str.lower()
# data['full_name'] = data['first_name'] + ' ' + data['last_name']
data.head()

Unnamed: 0,first_name,last_name,nation
0,pongsiri,suasat,THAI
1,nopadon,misang,THAI
2,krisada,somsuk,THAI
3,worawut,sangangam,THAI
4,kulpriya,kitcharoen,THAI


# Analyze name length

In [986]:
data['first_name_len'] = data['first_name'].str.len()
data['last_name_len'] = data['last_name'].str.len()
# data['full_name_len'] = data['full_name'].str.len()
data.head()

Unnamed: 0,first_name,last_name,nation,first_name_len,last_name_len
0,pongsiri,suasat,THAI,8,6
1,nopadon,misang,THAI,8,6
2,krisada,somsuk,THAI,7,6
3,worawut,sangangam,THAI,7,9
4,kulpriya,kitcharoen,THAI,8,10


In [987]:
data.describe()

Unnamed: 0,first_name_len,last_name_len
count,300.0,300.0
mean,5.94,6.58
std,2.418518,2.788582
min,2.0,2.0
25%,4.0,5.0
50%,6.0,6.0
75%,8.0,8.0
max,12.0,16.0


# Preprocessing name for machine learning model
We need to convert name alphabets into a fixed length matrix, we also need to specify the maximum name
length. E.g. if the maximum name length is 10, and the English alphabets are 26. Then the final vector length
is 10*26 which is 260 for each name.

In [988]:
max_name_len = data['first_name_len'].max()
max_name_len = max(data['last_name_len'].max(), max_name_len)
# max_name_len = max(data['full_name_len'].max(), max_name_len)
max_name_len

16

In [989]:
import string
allowed_chars = string.ascii_lowercase + '- '
allowed_chars, len(allowed_chars)

('abcdefghijklmnopqrstuvwxyz- ', 28)

In [990]:
# convert name string into (name_len, len(allowed_chars)) shaped matrix containing one-hot values
# if the character is present
def name_to_matrix(name, max_name_len=None):
    vectors = []
    vec_shape = [1, len(allowed_chars)]
    for c in name:
        idx = None # contains index a=0, b=1, c=2, ..., z=25
        vec = np.zeros(vec_shape, dtype=np.float32)
        if c in allowed_chars:
            idx = allowed_chars.index(c)
            vec[0,idx] = 1.0
        vectors.append(vec)
    if max_name_len is not None:
        n_blank_chars = max_name_len - len(name)
        for i in range(n_blank_chars):
            blank_vec = np.zeros(vec_shape, dtype=np.float32)
            vectors.append(blank_vec)
    return np.concatenate(vectors, axis=0)

name_matrix = name_to_matrix('abz', max_name_len)
name_matrix[:5, :], name_matrix.shape

(array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]], dtype=float32), (16, 28))

# Aggregating the dataset
Use the function defined to aggregate the dataset

In [991]:
available_nations = np.array(list(set(data['nation'])))
available_nations

array(['THAI', 'CHINESE', 'JAPANESE'], 
      dtype='<U8')

In [992]:
def one_hot(idx, size):
    vec = np.zeros([size], dtype=np.float32)
    vec[idx] = 1.0
    return vec

one_hot(1, 5)

array([ 0.,  1.,  0.,  0.,  0.], dtype=float32)

In [993]:
# apply name_to_matrix() function to all the names and aggregate the results
X, y, X_str, y_nation, y_pos = [], [], [], [], []
name_cols = ['first_name', 'last_name']
# name_cols = ['full_name']
for name_col_idx, name_col in enumerate(name_cols):
    for idx, row in data.iterrows():
        name_matrix = name_to_matrix(row[name_col], max_name_len)
        X.append(name_matrix[np.newaxis])
        y.append([row['nation'], name_col])
        X_str.append(row[name_col])
        y_nation.append(one_hot(available_nations.tolist().index(row['nation']), len(available_nations))[np.newaxis])
        y_pos.append(one_hot(name_col_idx, len(name_cols))[np.newaxis])
X = np.concatenate(X, axis=0)
y = np.array(y)
X_str = np.array(X_str)
y_nation = np.concatenate(y_nation, axis=0)
y_pos = np.concatenate(y_pos, axis=0)
X.shape, y.shape, y_nation.shape, y_pos.shape

((600, 16, 28), (600, 2), (600, 3), (600, 2))

In [994]:
X_train, X_test, y_train, y_test, X_str_train, X_str_test,\
    y_nation_train, y_nation_test, y_pos_train, y_pos_test = \
    train_test_split(X, y, X_str, y_nation, y_pos, test_size=0.2)
X_train.shape, X_test.shape, y_nation_train.shape, y_nation_test.shape

((480, 16, 28), (120, 16, 28), (480, 3), (120, 3))

In [995]:
def flatten_X(X):
    return X.reshape([X.shape[0], -1])
X_train_flat = flatten_X(X_train)
X_test_flat = flatten_X(X_test)
X_train_flat.shape, X_test_flat.shape

((480, 448), (120, 448))

# Train a machine learning model
Use a simple model, or a deep one.

In [996]:
# # train a simple model and show the score
# model_names = ['nation', 'first/last']
# models = dict()
# for i in range(y.shape[1]):
#     simple_model = DecisionTreeClassifier()
#     simple_model.fit(X_train_flat, y_train[:,i])
#     print(model_names[i], simple_model.score(X_train_flat, y_train[:,i]), simple_model.score(X_test_flat, y_test[:,i]))
#     models[model_names[i]] = simple_model

In [1028]:
# define a deep model
name_input = Input(shape=(X_train_flat.shape[1],), name='name_input', dtype='float32')
hidden = Dense(64, activation='relu')(name_input)
hidden = Dropout(0.5)(hidden)
hidden = Dense(64, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(64, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
out_nation = Dense(y_nation.shape[1], activation='softmax', name='out_nation')(hidden)
out_pos = Dense(y_pos.shape[1], activation='softmax', name='out_pos')(hidden)

model = Model(inputs=[name_input], outputs=[out_nation, out_pos])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
name_input (InputLayer)          (None, 448)           0                                            
____________________________________________________________________________________________________
dense_84 (Dense)                 (None, 64)            28736                                        
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 64)            0                                            
____________________________________________________________________________________________________
dense_85 (Dense)                 (None, 64)            4160                                         
___________________________________________________________________________________________

In [1029]:
# train it
callbacks = [EarlyStopping(patience=10)]
model.fit([X_train_flat], [y_nation_train, y_pos_train], epochs=100, validation_split=0.1, callbacks=callbacks, batch_size=32)

Train on 432 samples, validate on 48 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


<keras.callbacks.History at 0x2215be109b0>

In [1030]:
# performance evaluation
df = pd.DataFrame(columns=['loss','out_nation_loss','out_pos_loss','out_nation_acc', 'out_pos_acc'], index=['train', 'test'])
df.loc['train'] = model.evaluate([X_train_flat], [y_nation_train, y_pos_train], verbose=0)
df.loc['test'] = model.evaluate([X_test_flat], [y_nation_test, y_pos_test], verbose=0)
df

Unnamed: 0,loss,out_nation_loss,out_pos_loss,out_nation_acc,out_pos_acc
train,0.431149,0.0465565,0.384592,0.983333,0.8375
test,0.98343,0.233655,0.749775,0.941667,0.608333


# Save/Load model

In [1031]:
model.save('model.h5')

In [1046]:
model = load_model('model.h5')

# Inspection
Identify names that the model predicts incorrectly

In [1047]:
incorrect_mask = model.predict(X_test_flat)[0].argmax(axis=1) != y_nation_test.argmax(axis=1)
names = X_str_test[incorrect_mask]
pred = model.predict(X_test_flat[incorrect_mask,:])[0]
pred_nations = available_nations[pred.argmax(axis=1)]
probs = pred.max(axis=1)
nations = available_nations[y_nation_test[incorrect_mask].argmax(axis=1)]
df = pd.DataFrame(columns=['name', 'predicted nation', 'true nation', 'prediction confidence'])
for i in range(len(names)):
    df.loc[i] = [names[i], pred_nations[i], nations[i], probs[i]*100]
df.sort_values('prediction confidence', ascending=False)

Unnamed: 0,name,predicted nation,true nation,prediction confidence
1,kanbayashi,THAI,JAPANESE,99.778581
4,tadam,JAPANESE,THAI,99.623883
6,niti,JAPANESE,THAI,92.916507
5,munmee,JAPANESE,THAI,82.577282
2,shigetaka,CHINESE,JAPANESE,63.077497
3,suriya,JAPANESE,THAI,60.202628
0,kin,CHINESE,JAPANESE,53.336006


In [1048]:
# count nations that are incorrectly predicted by the model
from collections import Counter
Counter(df['true nation'])

Counter({'JAPANESE': 3, 'THAI': 4})

In [1092]:
# check if a certain name is inside a dataset
X_that_we_care = X_str_train
X_that_we_care[np.array(['kr' in x for x in X_that_we_care])]

array(['kraingam', 'krittanan'], 
      dtype='<U16')

# Inference
Given a name, predict the nation.

In [1091]:
def predict(names, model, max_name_len):
    assert(type(names) is list)
    X = []
    for name in names:
        matrix = name_to_matrix(name.lower(), max_name_len)[np.newaxis]
        X.append(matrix)
    X = np.concatenate(X, axis=0)
    X_flat = flatten_X(X)
    return model.predict(X_flat)

names = ["Zemin", "Sunisa", "Phakphum", "Krittitam", "Chanchana", "Sornsoontorn", "Xi",
        "Jinping", "Aiyaruck", "Shinzo", "Abe", "Prayut", "Chan-o-char", "Watcharaphong",
        "Pusit", "Sora"]
names = [name.lower() for name in names]
pred = predict(names, model, max_name_len)
pred_nation, pred_pos = pred
df = pd.DataFrame(columns=['name', 'nation'] + available_nations.tolist() + ['position'] + name_cols + ['In Training Set?'])
for i in range(len(names)):
    best_nation = available_nations[pred_nation[i,:].argmax()]
    best_pos = name_cols[pred_pos[i,:].argmax()]
    in_training = names[i] in X_str_train
    df.loc[i] = [names[i], best_nation] + (pred_nation[i,:]*100).tolist() + [best_pos] + (pred_pos[i,:]*100).tolist() + [in_training]
df

Unnamed: 0,name,nation,THAI,CHINESE,JAPANESE,position,first_name,last_name,In Training Set?
0,zemin,CHINESE,0.166673,99.1307,0.702627,last_name,44.082741,55.917263,True
1,sunisa,THAI,66.423477,1.73025,31.846275,first_name,90.923813,9.076188,False
2,phakphum,THAI,99.844353,0.0007999329,0.154849,last_name,5.060532,94.939461,False
3,krittitam,THAI,99.378082,0.001183855,0.620737,first_name,80.492424,19.507582,False
4,chanchana,THAI,99.7659,0.001876087,0.232227,last_name,45.037655,54.962349,True
5,sornsoontorn,THAI,99.999939,1.270955e-09,6e-05,first_name,51.07423,48.925774,False
6,xi,CHINESE,0.049547,99.64437,0.306082,first_name,56.511669,43.488331,False
7,jinping,CHINESE,0.72263,98.12111,1.156264,last_name,36.072132,63.927876,False
8,aiyaruck,JAPANESE,0.030111,0.001518273,99.968376,first_name,81.42556,18.57444,False
9,shinzo,JAPANESE,1.401626,21.5366,77.061768,last_name,47.213024,52.786976,False
