In [24]:
import pandas as pd
import numpy as np

In [3]:
# Load the names dataset (the columns used below are `name` and `nation`).
file_name = 'names.csv'
# keep_default_na=False: by default pandas parses strings such as "nan", "NA",
# "null" or "None" as missing values, which would silently turn legitimate
# names into float NaN and break the string processing in later cells.
data = pd.read_csv(file_name, header=0, keep_default_na=False)
# Preview the first rows instead of dumping the whole frame.
data.head(10)

Unnamed: 0,name,nation
0,ploy,Thai
1,May,Thai
2,Nan,Thai
3,Fern,Thai
4,mild,Thai
5,Aom,Thai
6,Kanokwan,Thai
7,jane,Thai
8,Beam,Thai
9,Natcha,Thai


In [9]:
# Convert names to lower case so they match the a-z alphabet used for encoding.
# The vectorised `.str` accessor replaces a per-element Python lambda; note that
# non-string entries become NaN here instead of raising.
data['name'] = data['name'].str.lower()
data.head()

Unnamed: 0,name,nation
0,ploy,Thai
1,may,Thai
2,,Thai
3,fern,Thai
4,mild,Thai


# Analyze name length

In [10]:
# Number of characters in each name, stored in a new `name_len` column.
data['name_len'] = data['name'].map(len)
data.head()

Unnamed: 0,name,nation,name_len
0,ploy,Thai,4
1,may,Thai,3
2,,Thai,3
3,fern,Thai,4
4,mild,Thai,4


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,name_len
count,40.0
mean,4.45
std,1.449138
min,2.0
25%,3.0
50%,4.0
75%,5.0
max,8.0


# Preprocessing name for machine learning model
We need to convert each name into a fixed-size matrix of one-hot encoded characters, so we must also
specify the maximum name length. For example, if the maximum name length is 5 and the English alphabet
has 26 letters, each name becomes a 5 x 26 matrix, i.e. 5*26 = 130 values per name.

In [13]:
# Length of the longest name; used below as the fixed number of rows that
# every encoded name matrix is padded to.
max_name_len = data['name_len'].max()
max_name_len

8

In [25]:
import string
# The encoding alphabet: the lowercase ASCII letters and how many there are.
string.ascii_lowercase, len(string.ascii_lowercase)

('abcdefghijklmnopqrstuvwxyz', 26)

In [45]:
# convert name string into a (n_rows, 26) shaped matrix containing one-hot values
# for every character that is a lowercase ASCII letter
def name_to_matrix(name, max_name_len=None):
    """One-hot encode the characters of ``name`` over the alphabet a-z.

    Parameters
    ----------
    name : str
        Name to encode. Characters that are not in
        ``string.ascii_lowercase`` (upper case, digits, punctuation, ...)
        produce an all-zero row.
    max_name_len : int, optional
        Fixed number of rows of the result. Shorter names are padded with
        all-zero rows and longer names are truncated, so the output shape
        is the same for every name. When ``None`` (default) the matrix has
        exactly ``len(name)`` rows.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_rows, 26)`` where row ``i`` is the
        one-hot vector of ``name[i]`` (a=0, b=1, ..., z=25).

    Raises
    ------
    ValueError
        If ``max_name_len`` is negative.
    """
    alphabet = string.ascii_lowercase
    n_rows = len(name) if max_name_len is None else max_name_len
    if n_rows < 0:
        raise ValueError("max_name_len must be non-negative, got %r" % (max_name_len,))

    # Allocate the whole matrix once; rows that are never written to double as
    # padding and as the encoding of characters outside the alphabet. This also
    # makes the empty-name case return a (0, 26) array instead of failing.
    matrix = np.zeros((n_rows, len(alphabet)), dtype=np.float32)

    # Slicing truncates names longer than the requested fixed length.
    for row, char in enumerate(name[:n_rows]):
        col = alphabet.find(char)  # -1 when char is not a lowercase letter
        if col >= 0:
            matrix[row, col] = 1.0
    return matrix

# Sanity check: '-' is not in the alphabet so its row stays all zeros, while
# 'a', 'b' and 'z' set columns 0, 1 and 25; the remaining rows are zero padding.
name_matrix = name_to_matrix('-abz', max_name_len)
name_matrix, name_matrix.shape

(array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.

# Aggregating the dataset
Use the function defined above to encode every name and aggregate the results into a single dataset.

In [51]:
# Encode every name with name_to_matrix() and stack the per-name matrices along
# a new leading axis, giving an array of shape (n_names, max_name_len, 26).
# np.stack adds the leading axis itself, and the comprehension avoids rebinding
# the notebook-level `name_matrix` variable on every iteration.
X = np.stack(
    [name_to_matrix(name, max_name_len) for name in data['name']],
    axis=0,
)
# Every name must have been padded to the same fixed length.
assert X.shape[:2] == (len(data), max_name_len)
X.shape

(40, 8, 26)

In [53]:
# Target labels: the `nation` column, one entry per name (same row order as X).
y = data['nation']
y.shape

(40,)

# Train a machine learning model
Use a simple model, or a deep one.