In [266]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [450]:
# load the data
file_name = 'names.csv'
data = pd.read_csv(file_name, header=0, sep=',')
data

Unnamed: 0,first_name,last_name,nation
0,PONGSIRI,SUASAT,THAI
1,NOPADON,MISANG,THAI
2,PASSARAPORN,JIROTRUNGROENGKUL,THAI
3,WORAWUT,SANGANGAM,THAI
4,KULPRIYA,KITCHAROEN,THAI
5,PATTHARAPONG,PIMONSAKONWONG,THAI
6,KANITTA,SUWANNARAT,THAI
7,JEDSADAKORN,PANICH,THAI
8,THITAPORN,KONKONG,THAI
9,DARUNWAN,THONGKHOENKHUN,THAI


In [451]:
# convert name to lower case
data['first_name'] = data['first_name'].str.lower()
data['last_name'] = data['last_name'].str.lower()
# data['full_name'] = data['first_name'] + ' ' + data['last_name']
data.head()

Unnamed: 0,first_name,last_name,nation
0,pongsiri,suasat,THAI
1,nopadon,misang,THAI
2,passaraporn,jirotrungroengkul,THAI
3,worawut,sangangam,THAI
4,kulpriya,kitcharoen,THAI


# Analyze name length

In [452]:
data['first_name_len'] = data['first_name'].str.len()
data['last_name_len'] = data['last_name'].str.len()
# data['full_name_len'] = data['full_name'].str.len()
data.head()

Unnamed: 0,first_name,last_name,nation,first_name_len,last_name_len
0,pongsiri,suasat,THAI,8,6
1,nopadon,misang,THAI,8,6
2,passaraporn,jirotrungroengkul,THAI,11,17
3,worawut,sangangam,THAI,7,9
4,kulpriya,kitcharoen,THAI,8,10


In [453]:
data.describe()

Unnamed: 0,first_name_len,last_name_len
count,199.0,199.0
mean,5.743719,6.708543
std,2.700477,3.245117
min,2.0,2.0
25%,3.0,4.0
50%,5.0,6.0
75%,8.0,9.0
max,12.0,17.0


# Preprocessing name for machine learning model
We need to convert name alphabets into a fixed length matrix, we also need to specify the maximum name
length. E.g. if the maximum name length is 10, and the English alphabets are 26. Then the final vector length
is 10*26 which is 260 for each name.

In [632]:
max_name_len = data['first_name_len'].max()
max_name_len = max(data['last_name_len'].max(), max_name_len)
max_name_len

17

In [633]:
import string
allowed_chars = string.ascii_lowercase + '- '
allowed_chars, len(allowed_chars)

('abcdefghijklmnopqrstuvwxyz- ', 28)

In [634]:
# convert name string into (name_len, len(allowed_chars)) shaped matrix containing one-hot values
# if the character is present
def name_to_matrix(name, max_name_len=None):
    vectors = []
    vec_shape = [1, len(allowed_chars)]
    for c in name:
        idx = None # contains index a=0, b=1, c=2, ..., z=25
        vec = np.zeros(vec_shape, dtype=np.float32)
        if c in allowed_chars:
            idx = allowed_chars.index(c)
            vec[0,idx] = 1.0
        vectors.append(vec)
    if max_name_len is not None:
        n_blank_chars = max_name_len - len(name)
        for i in range(n_blank_chars):
            blank_vec = np.zeros(vec_shape, dtype=np.float32)
            vectors.append(blank_vec)
    return np.concatenate(vectors, axis=0)

name_matrix = name_to_matrix('abz', max_name_len)
name_matrix[:5, :], name_matrix.shape

(array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]], dtype=float32), (17, 28))

# Aggregating the dataset
Use the function defined to aggregate the dataset

In [635]:
# apply name_to_matrix() function to all the names and aggregate the results
X, y, X_str = [], [], []
name_cols = ['first_name', 'last_name']
for name_col_idx, name_col in enumerate(name_cols):
    for idx, row in data.iterrows():
        name_matrix = name_to_matrix(row[name_col], max_name_len)
        X.append(name_matrix[np.newaxis])
        y.append([row['nation'], name_col])
        X_str.append(row[name_col])
X = np.concatenate(X, axis=0)
y = np.array(y)
X_str = np.array(X_str)
X.shape, y.shape

((398, 17, 28), (398, 2))

In [636]:
X_train, X_test, y_train, y_test, X_str_train, X_str_test = train_test_split(X, y, X_str, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((318, 17, 28), (80, 17, 28), (318, 2), (80, 2))

In [637]:
def flatten_X(X):
    return X.reshape([X.shape[0], -1])
X_train_flat = flatten_X(X_train)
X_test_flat = flatten_X(X_test)
X_train_flat.shape, X_test_flat.shape

((318, 476), (80, 476))

# Train a machine learning model
Use a simple model, or a deep one.

In [638]:
# train a simple model and show the score
model_names = ['nation', 'first/last']
models = dict()
for i in range(y.shape[1]):
    simple_model = DecisionTreeClassifier()
    simple_model.fit(X_train_flat, y_train[:,i])
    print(model_names[i], simple_model.score(X_train_flat, y_train[:,i]), simple_model.score(X_test_flat, y_test[:,i]))
    models[model_names[i]] = simple_model

nation 1.0 0.925
first/last 0.959119496855 0.675


# Inference
Given a name, predict the nation.

In [647]:
def predict(names, model, max_name_len):
    assert(type(names) is list)
    X = []
    for name in names:
        matrix = name_to_matrix(name, max_name_len)[np.newaxis]
        X.append(matrix)
    X = np.concatenate(X, axis=0)
    X_flat = flatten_X(X)
    return model.predict(X_flat), model.predict_proba(X_flat)

names = ["Zhang", "Sunisa", "Monthon", "Krittitam"]
for name, model in models.items():
    print('Model:', name)
    pred = predict(names, model, max_name_len)
    print(pred)

Model: nation
(array(['CHINESE', 'THAI', 'THAI', 'THAI'], 
      dtype='<U10'), array([[ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]]))
Model: first/last
(array(['last_name', 'first_name', 'first_name', 'first_name'], 
      dtype='<U10'), array([[ 0.4,  0.6],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ]]))


# Inspection
Identify names that the model predicts incorrectly

In [651]:
incorrect_mask = models['nation'].predict(X_test_flat) != y_test[:,0]
X_str_test[incorrect_mask], y_test[incorrect_mask][:,0]

(array(['boonin', 'jianhong', 'kietpermsak', 'phurk', 'zhenkang', 'ruogang'], 
       dtype='<U17'),
 array(['THAI', 'CHINESE', 'THAI', 'THAI', 'CHINESE', 'CHINESE'], 
       dtype='<U10'))