In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

import matplotlib.pyplot as plt

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.layers import SimpleRNN
from keras import optimizers
from keras.utils.vis_utils import plot_model

import time

In [25]:
def bigrams(words):
    bigrams = []
    for b in words:
        bigrams.append([b[i:i+2] for i in range(len(b)-1)])
    return bigrams

In [26]:
df = pd.read_csv('./data/dataset.csv')
# print('data shape: {}'.format(df.shape))

X = df['NAME']
y = df['NATIONALITY']

classes = y.unique()
# print(classes)
num_classes = len(y.unique())

# print('number of classes: {}'.format(num_classes))

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.2, random_state=69)

# print('train data shape X, y: {},{}'.format(X_train_df.shape, y_train_df.shape))
# print('test data shape X, y: {},{}'.format(X_test_df.shape, y_test_df.shape))

In [27]:
X_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
          lower=False, char_level=True, oov_token=None)

y_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
          lower=True, char_level=False, oov_token=None)

In [28]:
X_train = X_train_df.values.astype(str) # Otherwise, there's an error when calling 'fit_on_texts' >> AttributeError: 'int' object has no attribute 'lower'
X_test = X_test_df.values.astype(str) # Otherwise, there's an error when calling 'fit_on_texts' >> AttributeError: 'int' object has no attribute 'lower'

# X_train = bigrams(X_train)

X_tokenizer.fit_on_texts(X_train)
X_train = X_tokenizer.texts_to_sequences(X_train)
X_test = X_tokenizer.texts_to_sequences(X_test)
# print(len(X_tokenizer.index_word))

X_train = X_tokenizer.sequences_to_matrix(X_train, mode='tfidf')
X_test = X_tokenizer.sequences_to_matrix(X_test, mode='tfidf')


In [29]:
# encode from string labels to numerical labels 
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_df.values.astype(str)) # error without astype(str)
y_test = label_encoder.transform(y_test_df.values.astype(str))
# print(encoder.classes_.shape)
# encoder.inverse_transform(y_train)

y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# X_tokenizer.word_counts
# print(len(X_tokenizer.word_counts))

In [40]:
max_features = len(X_tokenizer.word_counts)
# print('max_features = {}'.format(max_features))
batch_size = 23
maxlen = 30
embedding_dims = 50
epochs=20

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, padding="post", maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, padding="post", maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Hyperparameters')
print('max_features = {}'.format(max_features))
print('batch_size = {}'.format(batch_size))
print('maxlen = {}'.format(maxlen))
print('embedding_dims = {}'.format(embedding_dims))
print('epochs = {}'.format(epochs))

def train_model():
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
    model.add(SimpleRNN(embedding_dims))

#     model.add(LSTM(maxlen))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print(model.summary())
    
    print('Train...')

    history = model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_test, y_test),
                verbose=0
             )
    score, acc = model.evaluate(X_test, y_test,
                                batch_size=batch_size,
                               verbose=1)


    print('Test score:', score)
    print('Test accuracy:', acc)
#     return model

5700 train sequences
1425 test sequences
Pad sequences (samples x time)
X_train shape: (5700, 30)
X_test shape: (1425, 30)
Hyperparameters
max_features = 54
batch_size = 23
maxlen = 30
embedding_dims = 50
epochs = 20


In [47]:
%time _ = train_model()

Build model...
Model: "sequential_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 30, 50)            2700      
_________________________________________________________________
simple_rnn_28 (SimpleRNN)    (None, 50)                5050      
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 153       
Total params: 7,903
Trainable params: 7,903
Non-trainable params: 0
_________________________________________________________________
None
Train...
Test score: 0.97146679050044
Test accuracy: 0.480701744556427
CPU times: user 1min 1s, sys: 16.5 s, total: 1min 17s
Wall time: 31.5 s


In [20]:
results = model.predict(X_test)

def plot_model(filename='model_plot_0.png'):
    plot_model(model, to_file=filename, show_shapes=True, show_layer_names=True)
    
def plot_hist():
    colors = ['#E69F00', '#56B4E9', '#F0E442']    
    plt.hist(results.transpose(), normed=False)
    
def plot_loss():
    plt.plot(history.history['loss'])
    