# LSTM for lastname classifcation redo in Keras

In [220]:
import pandas as pd
import numpy as np

import tensorflow_addons as tfa
from keras.datasets import mnist
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

from sklearn.model_selection import train_test_split

from typing import Sequence

## Load

Let's download [training](https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz) and [testing](https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz) data for last names.   This data set is a bunch of last names and the nationality or language. 

In [221]:
df_train = pd.read_csv("data/names_train.csv", header=None)
df_train.columns = ['name','language']
df_test = pd.read_csv("data/names_train.csv", header=None)
df_test.columns = ['name','language']

In [222]:
df_train.shape, df_test.shape

((13374, 2), (13374, 2))

In [223]:
df_train.head(2)

Unnamed: 0,name,language
0,Adsit,Czech
1,Ajdrna,Czech


## Clean

In [224]:
badname = df_train['name']=='To The First Page'
df_train[badname]

Unnamed: 0,name,language
8340,To The First Page,Russian
8341,To The First Page,Russian
8342,To The First Page,Russian
8343,To The First Page,Russian
8344,To The First Page,Russian
8345,To The First Page,Russian
8346,To The First Page,Russian
8347,To The First Page,Russian
8348,To The First Page,Russian
8349,To The First Page,Russian


In [225]:
comma = df_train['name'].str.contains(',') # might as well keep
df_train[comma]

Unnamed: 0,name,language
5976,"Jevolojnov,",Russian
6549,"Lytkin,",Russian


In [226]:
df_train[df_train['name'].str.contains("'")][:3] # there are ok so keep quote

Unnamed: 0,name,language
3609,Awak'Yan,Russian
4454,Dan'Ko,Russian
4471,Dar'Kin,Russian


In [227]:
badname = df_train['name']=='To The First Page'
df_train = df_train[~badname]

badname = df_test['name']=='To The First Page'
df_test = df_test[~badname]

In [228]:
df_train['name'] = df_train['name'].str.lower()
df_test['name'] = df_test['name'].str.lower()

In [229]:
def maxlen(strings:Sequence[str]) -> int:
    return max([len(l) for l in strings])

max_len = max(maxlen(df_train['name']), maxlen(df_test['name']))
max_len

19

## Split out validation set

In [230]:
X, y = df_train[['name']], df_train['language']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)
X_test, y_test = df_test[['name']], df_test['language']

## Get vocab

In [231]:
def vocab(strings):
    letters = [list(l) for l in strings]
    V = set([c for cl in letters for c in cl])
    V = sorted(list(V))
    ctoi = {c:i for i, c in enumerate(V)}
    return V, ctoi

In [232]:
V, ctoi = vocab(X['name'])
ctoi

{' ': 0,
 "'": 1,
 ',': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

## Encode names into array of char

In [233]:
def encode_chars(names, max_len):
    encoded = np.zeros(shape=(len(names),max_len))
    
    for i,name in enumerate(names):
        a = [ctoi[c] for c in name]
        encoded[i,:len(a)] = a # effectively pads with 0 to max_len
    return encoded

In [234]:
sample = ['cat','a','at'] # always debug with a small representative example
encode_chars(sample, max_len)

array([[ 5.,  3., 22.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 3., 22.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.]])

In [235]:
encode_chars(X_train['name'][0:3], max_len)

array([[27.,  3.,  9., 23., 16., 17., 24.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [11., 21.,  3.,  9.,  3., 14., 11.,  7., 24.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [22., 23.,  6., 17., 20.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.]])

In [236]:
X_train = encode_chars(X_train['name'], max_len)
X_valid = encode_chars(X_valid['name'], max_len)

In [237]:
X_train.shape

(10686, 19)

In [238]:
#X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
#X_train.shape

## One-hot encode target language (class)

Get categories from training only, not valid/test sets. Then apply cats to those set y's.

In [239]:
y_train = y_train.astype('category').cat.as_ordered()
y_cats = y_train.cat.categories
y_cats

Index(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish',
       'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'],
      dtype='object')

In [240]:
y_train = pd.get_dummies(y_train).values

In [241]:
y_valid = pd.Categorical(y_valid, categories=y_cats, ordered=True)
y_test = pd.Categorical(y_test, categories=y_cats, ordered=True)

In [242]:
y_valid = pd.get_dummies(y_valid).values
y_test = pd.get_dummies(y_test).values

In [243]:
y_valid[:5], y_test[:5]

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]],
       dtype=uint8),
 array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       dtype=uint8))

In [254]:
embedding_len = 32
model = Sequential()
model.add(layers.Embedding(input_dim=len(V), output_dim=embedding_len, input_length=max_len))
model.add(layers.LSTM(embedding_len, dropout=0.1))
model.add(layers.Dense(len(y_cats), activation='softmax'))

opt = optimizers.Adam(learning_rate=0.01)

model.compile(loss=losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
#model.summary()

batch_size = 64
history = model.fit(X_train, y_train,
                    shuffle=True,
                    epochs=20,
                    validation_data=(X_valid, y_valid),
                    batch_size=batch_size,
                    verbose=0
                  , callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=True)]
                    )

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=20.0, style=Progr…

Epoch 1/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 2/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 3/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 4/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 5/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 6/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 7/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 8/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 9/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 10/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 11/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 12/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 13/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 14/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 15/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 16/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 17/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 18/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 19/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…


Epoch 20/20


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=167.0), HTML(value='')), layout=Layout(di…



