In [27]:
import pandas as pd
import numpy as np
import keras
import time
from keras.models import Sequential
from keras.layers import Dense ,Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.layers.normalization import BatchNormalization
import numpy as np
import random
import os

In [2]:
# Reference: https://github.com/simon-larsson/pokemon-name-generator/blob/master/name_generator.ipynb

In [44]:
# --- Data windowing ---
step_length = 1       # stride between consecutive training windows

# --- Training ---
epochs = 50           # number of epochs passed to model.fit
batch_size = 32       # batch size passed to model.fit
dropout_rate = 0.2    # used as the LSTM layer's recurrent_dropout
verbosity = 1         # `verbose` level forwarded to model.fit

# --- Model persistence ---
model_path = os.path.realpath('./name_gen_model.h5')  # absolute path of the saved model
store_model = True    # save the model to model_path after the training cell
load_model = False    # restore from model_path instead of training

# --- Generation ---
gen_amount = 10       # how many new names to generate

In [6]:
# Read the raw names (one per line), dropping trailing whitespace/newlines.
with open('ThaiNameInEnglish.txt', encoding='utf8') as names_file:
    input_names = [raw_line.rstrip() for raw_line in names_file]

# Preview the first ten names, followed by an ellipsis marker.
for preview_name in input_names[:10]:
    print(preview_name)
print('...')

KAMONNUCH
KAMOLLAK
GASARAPORN
JARUWAN
CHIRAPORN
JUTARAT
CHONTHICHA
NARUEMON
WIMONSIRI
SASITHON
...


In [7]:
# Read the whole names file as one lowercase string. The context manager
# closes the handle as soon as the read finishes; the original opened the
# file and never closed it.
with open('ThaiNameInEnglish.txt', encoding = 'utf8') as file:
    processed_inputs = file.read().lower()

In [8]:
# Vocabulary before cleaning, printed for inspection.
chars = sorted(set(processed_inputs))
print(chars)

# Strip the characters that should not be part of the vocabulary
# (hyphen and zero-width space).
bad_chars = ['-', '\u200b']
for unwanted in bad_chars:
    processed_inputs = processed_inputs.replace(unwanted, "")

# Length of the longest raw name.
max_sequence_length = max(len(name) for name in input_names)

# Vocabulary after cleaning.
chars = sorted(set(processed_inputs))
print(chars)

['\n', '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '\u200b']
['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [10]:
# Summary statistics of the cleaned corpus.
concat_names = processed_inputs                   # the cleaned text as a single string
input_len = len(concat_names)
num_chars = len(chars)                            # vocabulary size
max_sequence_length = max(map(len, input_names))  # longest raw name

print("Total number of characters:", input_len)
print("Total vocab:", num_chars)
print('Number of names: ', len(input_names))
print('Longest name: ', max_sequence_length)

Total number of characters: 22042
Total vocab: 26
Number of names:  2399
Longest name:  21


In [11]:
# Lookup tables between vocabulary characters and their integer ids.
idx2char = dict(enumerate(chars))
char2idx = {char: idx for idx, char in idx2char.items()}

In [45]:
# Slide a window of max_sequence_length characters over the corpus, moving
# step_length characters at a time; the character right after each window
# is the prediction target for that window.
window_starts = range(0, len(concat_names) - max_sequence_length, step_length)
sequences = [concat_names[start: start + max_sequence_length] for start in window_starts]
next_chars = [concat_names[start + max_sequence_length] for start in window_starts]

num_sequences = len(sequences)

print('Number of sequences:', num_sequences)
print('First 10 sequences and next chars:')
# Zipping the sliced lists stops early when fewer than 10 windows exist (the
# original range(10) index loop raised IndexError in that case). Newlines are
# rendered as spaces so each sample stays on one output line.
for window, target in zip(sequences[:10], next_chars[:10]):
    print('X=[{}]   y=[{}]'.format(window, target).replace('\n', ' '))

Number of sequences: 22021
First 10 sequences and next chars:
X=[kamonnuch kamollak ga]   y=[s]
X=[amonnuch kamollak gas]   y=[a]
X=[monnuch kamollak gasa]   y=[r]
X=[onnuch kamollak gasar]   y=[a]
X=[nnuch kamollak gasara]   y=[p]
X=[nuch kamollak gasarap]   y=[o]
X=[uch kamollak gasarapo]   y=[r]
X=[ch kamollak gasarapor]   y=[n]
X=[h kamollak gasaraporn]   y=[ ]
X=[ kamollak gasaraporn ]   y=[j]


In [46]:
# One-hot encode the windows and their targets:
#   X[i, j, k] is True when character j of window i has vocabulary id k
#   Y[i, k]    is True when the character following window i has id k
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
X = np.zeros((num_sequences, max_sequence_length, num_chars), dtype=bool)
Y = np.zeros((num_sequences, num_chars), dtype=bool)

for i, sequence in enumerate(sequences):
    for j, char in enumerate(sequence):
        X[i, j, char2idx[char]] = 1
    Y[i, char2idx[next_chars[i]]] = 1

print('X shape: {}'.format(X.shape))
print('Y shape: {}'.format(Y.shape))

X shape: (22021, 21, 26)
Y shape: (22021, 26)


In [47]:
# Character-level model: one 64-unit LSTM reads a one-hot window of shape
# (max_sequence_length, num_chars); a softmax Dense layer then outputs a
# distribution over the vocabulary for the next character.
model = Sequential([
    LSTM(
        64,
        input_shape=(max_sequence_length, num_chars),
        recurrent_dropout=dropout_rate,
    ),
    Dense(units=num_chars, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_32 (LSTM)               (None, 64)                23296     
_________________________________________________________________
dense_10 (Dense)             (None, 26)                1690      
Total params: 24,986
Trainable params: 24,986
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Either restore previously saved weights or train from scratch, then
# optionally persist the model to model_path.
if load_model:
    # Keras models expose load_weights(), not load(); the original
    # `model.load(model_path)` raised AttributeError whenever load_model
    # was True.
    model.load_weights(model_path)
else:
    start = time.time()
    print('Start training for {} epochs'.format(epochs))
    history = model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=verbosity)
    end = time.time()
    print('Finished training - time elapsed:', (end - start)/60, 'min')
if store_model:
    print('Storing model at:', model_path)
    model.save(model_path)

Start training for 50 epochs
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Finished training - time elapsed: 18.752102224032082 min
Storing model at: C:\Users\peravit2\Desktop\BADS7604 DL\Aj.thitirat\name_gen_model.h5


In [51]:
# Reload the weights written by the training cell. Reusing model_path keeps
# this cell pointed at the same file that model.save() wrote, instead of a
# second hard-coded relative filename that only matches while the working
# directory is unchanged.
filename = model_path
model.load_weights(filename)
# Compile again with the same loss and optimizer used when the model was built.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [57]:
# Seed window: the tail of the corpus plus a newline, so the first sampled
# character starts a fresh name.
sequence = concat_names[-(max_sequence_length - 1):] + '\n'

new_names = []

# Names that must not be emitted again, expressed in the model's alphabet
# (lowercase, restricted to the trained vocabulary). The original compared
# the lowercase candidate against the raw input_names (upper case in the
# data file) and the capitalized new_names, so the check could never match.
seen_names = {
    ''.join(ch for ch in name.lower() if ch in char2idx)
    for name in input_names
}

# Characters sampled since the last newline, i.e. the name being built.
# Tracking them directly replaces `sequence.split('\n')[1]`, which returned
# an older name whenever the window contained more than two newlines.
current_chars = []

# Report progress roughly every tenth of the requested amount.
progress_step = max(1, gen_amount // 10)

print('{} new names are being generated'.format(gen_amount))

while len(new_names) < gen_amount:

    # One-hot encode the current window.
    x = np.zeros((1, max_sequence_length, num_chars))
    for i, char in enumerate(sequence):
        x[0, i, char2idx[char]] = 1

    # Sample the next character from the predicted distribution. The vector
    # is renormalised in float64 because float32 rounding can make
    # np.random.choice reject it ("probabilities do not sum to 1").
    probs = np.asarray(model.predict(x, verbose=0)[0], dtype=np.float64)
    probs /= probs.sum()
    next_idx = np.random.choice(len(probs), p=probs)
    next_char = idx2char[next_idx]
    sequence = sequence[1:] + next_char

    if next_char != '\n':
        current_chars.append(next_char)
        continue

    # A newline terminates the name that was being built.
    gen_name = ''.join(current_chars)
    current_chars = []

    # Drop a doubled leading letter.
    if len(gen_name) > 2 and gen_name[0] == gen_name[1]:
        gen_name = gen_name[1:]

    # Keep names longer than two characters that are neither training names
    # nor already generated.
    if len(gen_name) > 2 and gen_name not in seen_names:
        seen_names.add(gen_name)
        new_names.append(gen_name.capitalize())

        # Only report when a name was actually added; the original printed
        # on every newline and so repeated the same count.
        if len(new_names) % progress_step == 0:
            print('Generated {}'.format(len(new_names)))

10 new names are being generated
Generated 1
Generated 1
Generated 2
Generated 3
Generated 4
Generated 5
Generated 6
Generated 7
Generated 8
Generated 9
Generated 10


In [58]:
# Show at most ten of the generated names.
print_first_n = min(10, gen_amount)
header = 'First {} generated names:'.format(print_first_n)
print(header)

for generated in new_names[:print_first_n]:
    print(generated)

First 10 generated names:
Tarathip
Phongphet
Pheerapat
Piyakan
Piyanut
Preedapa
Phanida
Menthima
Phimchai
Monthit
