# Classify last names by nationality with an RNN

In [1]:
# Fetch and unpack the tutorial data set.
# -nc: skip the download when data.zip already exists (avoids data.zip.1 on re-run).
# -n : never overwrite already-extracted files, so unzip does not stop to prompt
#      for confirmation when the cell is run a second time.
!wget -nc https://download.pytorch.org/tutorial/data.zip
!unzip -n data.zip
!ls data

--2017-03-24 14:41:06--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org... 52.222.236.22, 52.222.236.252, 52.222.236.154, ...
Connecting to download.pytorch.org|52.222.236.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: 'data.zip'


2017-03-24 14:41:07 (2.06 MB/s) - 'data.zip' saved [2882130/2882130]

Archive:  data.zip
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: dat

In [2]:
import glob

def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` returns matches in an arbitrary, filesystem-dependent order;
    sorting makes everything derived from the file order reproducible across
    machines and re-runs. An empty list is returned when nothing matches.
    """
    return sorted(glob.glob(path))

print(findFiles('data/names/*.txt'))

import unicodedata
import string

# The alphabet kept after normalisation: ASCII letters plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters found in ``all_letters``.

    The string is decomposed (NFD) so that accented letters split into a base
    letter followed by combining marks; the combining marks (category 'Mn')
    are dropped, and so is every remaining character outside ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # combining accent left over from the decomposition
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

print(unicodeToAscii('Ślusàrski'))

['data/names/Arabic.txt', 'data/names/Chinese.txt', 'data/names/Czech.txt', 'data/names/Dutch.txt', 'data/names/English.txt', 'data/names/French.txt', 'data/names/German.txt', 'data/names/Greek.txt', 'data/names/Irish.txt', 'data/names/Italian.txt', 'data/names/Japanese.txt', 'data/names/Korean.txt', 'data/names/Polish.txt', 'data/names/Portuguese.txt', 'data/names/Russian.txt', 'data/names/Scottish.txt', 'data/names/Spanish.txt', 'data/names/Vietnamese.txt']
Slusarski


In [4]:
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` and return its lines converted to plain ASCII.

    The whole file content is stripped of leading/trailing whitespace before
    being split on newlines, so a trailing newline does not produce an empty
    final entry. Each line is then passed through ``unicodeToAscii``.

    The file is opened in a ``with`` block so the handle is closed promptly,
    and decoded explicitly as UTF-8 instead of the platform default encoding.
    NOTE(review): assumes the name files are UTF-8 encoded — confirm.
    """
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

import os

# One category per file, named after the file's base name up to the first dot
# (e.g. 'data/names/Arabic.txt' -> 'Arabic'). os.path.basename is used instead
# of splitting on '/' so the name is extracted correctly whichever path
# separator the platform's glob returns.
for filename in findFiles('data/names/*.txt'):
    category = os.path.basename(filename).split('.')[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
n_categories

18

In [57]:
import pandas as pd

# Flatten {nation: [names]} into one (nation, name) row per name, then make
# `nation` categorical and store its integer category code in column `y`.
name_nations = (
    pd.DataFrame(
        [
            (nation, name)
            for nation, list_of_names in category_lines.items()
            for name in list_of_names
        ],
        columns=['nation', 'name'],
    )
    .assign(nation=lambda df: df.nation.astype('category'))
    .assign(y=lambda df: df.nation.cat.codes)
)
name_nations.nation.cat.categories

Index(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish',
       'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'],
      dtype='object')

In [58]:
# Length of the longest name; used as the fixed sequence length for encoding.
max_name_length = name_nations['name'].map(len).max()
max_name_length

19

In [59]:
import numpy as np

# Map a letter to its 1-based position in all_letters, e.g. "a" = 1.
# Index 0 is reserved for the padding symbol.
def letterToIndex(letter):
    """Return the 1-based index of ``letter`` within ``all_letters``.

    ``str.find`` yields -1 when ``letter`` does not occur in ``all_letters``,
    so such input maps to 0 — the same index that is reserved for padding.
    """
    position = all_letters.find(letter)
    return position + 1

# Just for demonstration, turn a letter into a <1 x (n_letters + 1)> one-hot array
def letterToTensor(letter):
    """Return a ``(1, n_letters + 1)`` NumPy array one-hot encoding ``letter``.

    The extra column accounts for index 0, which ``letterToIndex`` never
    assigns to a known letter.
    """
    one_hot = np.zeros(shape=(1, n_letters + 1))
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Turn a line into a <max_length x (n_letters + 1)> array of one-hot letter
# vectors; rows past the end of the line stay all-zero (padding).
def lineToTensor(line, max_length=max_name_length):
    """One-hot encode ``line`` into a fixed-size ``(max_length, n_letters + 1)`` array.

    Args:
        line: the string to encode, one row per character.
        max_length: number of rows in the result. The default is the value of
            ``max_name_length`` at the time this function is defined.

    Returns:
        A NumPy array of zeros with a single 1 per encoded character, at the
        column given by ``letterToIndex``.

    Characters beyond ``max_length`` are ignored: writing them would index
    past the end of the array and raise ``IndexError``, which made any input
    longer than the longest training name unusable.
    """
    tensor = np.zeros(shape=(max_length, n_letters + 1))
    for li, letter in enumerate(line[:max_length]):
        tensor[li][letterToIndex(letter)] = 1
    return tensor

print(letterToTensor('J'))

print(lineToTensor('Jones').shape)

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.]]
(19, 58)


In [60]:
# Encode every name and stack the per-name arrays along a new leading axis.
training_seqs = np.stack(list(map(lineToTensor, name_nations.name)))
training_seqs.shape

(20074, 19, 58)

In [61]:
import tensorflow as tf

# Open a TensorFlow session.
# NOTE(review): `sess` is not referenced again in the visible cells — confirm
# it is still needed.
sess = tf.InteractiveSession()

# Not the last expression of the cell, so the value is evaluated but not displayed.
tf.__version__

# NOTE(review): seq_length is not used by any visible cell; the model cell
# reads max_name_length directly.
seq_length = max_name_length
# One-hot encode the integer class codes: one row per name, one column per
# distinct code.
targets = pd.get_dummies(name_nations.y).values

# Sanity check on the first three names: feature channel 0 (the index reserved
# for padding) at every time step, followed by their one-hot targets.
print(training_seqs[:3, :, 0])
print(targets[:3])

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.]]
[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]


In [None]:
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam

# Sequence classifier: a single LSTM layer followed by a softmax over the
# nationality categories.
# NOTE(review): this cell's execution count is `In [None]`, yet the following
# cells use `model` — re-run the notebook top to bottom to confirm it still
# builds the model that produced the recorded outputs.

# Number of units passed to the LSTM layer.
num_hidden = 128

model = Sequential()
# Each sample is max_name_length time steps of (n_letters + 1) features,
# matching the arrays produced by lineToTensor.
# NOTE(review): `inner_activation` looks like a Keras 1.x argument name —
# confirm it is accepted by the installed Keras version.
model.add(LSTM(num_hidden, input_shape=(max_name_length, n_letters + 1), activation='sigmoid', inner_activation='hard_sigmoid'))
# One output per category, normalised to probabilities by the softmax.
model.add(Dense(n_categories))
model.add(Activation('softmax'))
optimizer = Adam(lr=0.005)
# categorical_crossentropy expects one-hot targets, which is what
# pd.get_dummies produces for `targets`.
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [88]:
# Train on the full data set for 10 passes in batches of 500. No validation
# data is passed, so the accuracy reported per epoch is measured on the
# training data itself.
model.fit(training_seqs, targets, batch_size=500, nb_epoch=10, verbose=2)

Epoch 1/10
10s - loss: 1.2833 - acc: 0.6331
Epoch 2/10
11s - loss: 1.2393 - acc: 0.6496
Epoch 3/10
11s - loss: 1.2187 - acc: 0.6540
Epoch 4/10
11s - loss: 1.1846 - acc: 0.6652
Epoch 5/10
10s - loss: 1.1588 - acc: 0.6669
Epoch 6/10
11s - loss: 1.1278 - acc: 0.6765
Epoch 7/10
10s - loss: 1.0729 - acc: 0.6946
Epoch 8/10
10s - loss: 1.0492 - acc: 0.6995
Epoch 9/10
10s - loss: 1.0077 - acc: 0.7099
Epoch 10/10
10s - loss: 0.9767 - acc: 0.7170


<keras.callbacks.History at 0x138ac5908>

In [89]:
def predict_nationality(name):
    """Return the nationality category the model rates most likely for ``name``.

    The input is preprocessed the same way as the training names: it is passed
    through ``unicodeToAscii`` (training lines go through it in ``readLines``),
    so accented letters are reduced to their base letter instead of being
    encoded as unknown characters. The result is then cut to
    ``max_name_length`` characters, the sequence length the model was built
    with, so that longer input does not overflow the encoded array.

    Args:
        name: the last name to classify.

    Returns:
        The entry of ``name_nations.nation.cat.categories`` at the position of
        the highest model output.
    """
    cleaned = unicodeToAscii(name)[:max_name_length]
    # model.predict expects a batch, hence the single-element outer array.
    result = model.predict(np.array([lineToTensor(cleaned)]), verbose=0)
    return name_nations.nation.cat.categories[np.argmax(result)]

In [90]:
predict_nationality("Rochefort")

'English'

In [91]:
predict_nationality("Kon")

'English'

In [100]:
predict_nationality("Allain")

'English'

In [96]:
predict_nationality("Soldatov")

'Russian'

In [99]:
predict_nationality("Lu")

'English'