In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Embedding, LSTM
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import remove_accents

Using TensorFlow backend.


In [3]:
def loadData(path='dataset.csv'):
    """Read the CSV dataset and return its non-blank rows.

    Args:
        path: Location of the comma-separated, UTF-8 encoded file.
            Defaults to 'dataset.csv' in the working directory.

    Returns:
        list[list[str]]: One list of column strings per non-blank record,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise '\r\n' inside quoted fields is
    # silently rewritten by the text layer.
    with open(path, encoding='utf-8', newline='') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        # csv.reader yields [] for blank lines; drop them so that callers
        # indexing row[0] / row[1] do not fail on a stray empty line.
        return [row for row in readCSV if row]
    
# Load every row once; later cells read the name from column 0 and the
# gender flag from column 1.
dataset = loadData()
# Length (in characters) of the longest first-column string.
max_length = max(len(row[0]) for row in dataset)
print('data length =', len(dataset))
print('max str length =', max_length)

data length = 63774
max str length = 28


In [4]:
# Largest Unicode code point encountered so far; the model cell uses
# highest_unicode + 1 as the Embedding vocabulary size.
highest_unicode = 0


def updateHighestUnicode(ch):
    """Raise the module-level ``highest_unicode`` to ``ord(ch)`` if larger.

    Args:
        ch: A single character.
    """
    global highest_unicode
    highest_unicode = max(highest_unicode, ord(ch))

def word2charArr(word):
    """Convert a string into the list of its Unicode code points.

    Side effect: every character is also passed to ``updateHighestUnicode``
    so the global maximum code point stays current.

    Args:
        word: The string to encode.

    Returns:
        list[int]: ``ord`` of each character, in order.
    """
    chars = list(word)
    for ch in chars:
        updateHighestUnicode(ch)
    return [ord(ch) for ch in chars]

def name2vec():
    """Encode every dataset name twice: as written and after ``remove_accents``.

    For each row of the global ``dataset`` the first column is encoded with
    ``word2charArr`` and appended, immediately followed by the encoding of
    ``remove_accents`` applied to the same string. The result therefore has
    two entries per row, in row order.

    Returns:
        numpy.ndarray: Object array with ``2 * len(dataset)`` code-point
        sequences, intended to be fed to ``pad_sequences``.
    """
    sequences = []
    for row in dataset:
        name = row[0]
        sequences.append(word2charArr(name))
        sequences.append(word2charArr(remove_accents(name)))
    # The sequences have different lengths. Without an explicit object dtype
    # NumPy >= 1.24 raises ValueError for such ragged input (older versions
    # only emitted a deprecation warning).
    return np.array(sequences, dtype=object)

# Encode all names and pad them to a common length in one step
# (the printed array shows the zeros are added on the left).
X_train = pad_sequences(name2vec())
print(X_train, X_train.shape, highest_unicode)

[[  0   0   0 ...  97 110 104]
 [  0   0   0 ...  97 110 104]
 [  0   0   0 ... 109  97 105]
 ...
 [  0   0   0 ... 121 101 110]
 [  0   0   0 ...  97 110 104]
 [  0   0   0 ...  97 110 104]] (127548, 28) 8217


In [5]:
def gender2vec():
    """Build the label vector that lines up with the output of ``name2vec``.

    Each row of the global ``dataset`` contributes two identical labels
    (one per encoded variant of the name): 0 when the second column is
    exactly 'm', 1 for anything else.

    Returns:
        numpy.ndarray: 1-D array of 0/1 labels, two per dataset row.
    """
    labels = [
        label
        for row in dataset
        for label in (int(row[1] != 'm'),) * 2
    ]
    return np.array(labels)

# Labels for X_train: two entries per dataset row, 0 for 'm' and 1 otherwise.
Y_train = gender2vec()

In [6]:
# Character-level classifier: each code point is embedded into 60 dimensions,
# the sequence is summarised by a 128-unit LSTM, and a 2-way softmax gives
# the class probabilities. The vocabulary covers code points
# 0..highest_unicode, and the input length is the padded width of X_train.
model = Sequential([
    Embedding(highest_unicode + 1, 60, input_length=X_train.shape[1]),
    Dropout(0.25),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])

# Labels are plain integers (0/1), hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 60)            493080    
_________________________________________________________________
dropout (Dropout)            (None, 28, 60)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               96768     
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 590,106
Trainable params: 590,106
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train on the full encoded dataset; H keeps the History object returned by fit.
H = model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
def gender(name):
    """Predict the gender label of a single name with the trained ``model``.

    The name is encoded as Unicode code points, padded/truncated to the
    model's input length, and classified. The encoded input is printed
    before prediction.

    Args:
        name: The name to classify.

    Returns:
        str: "nữ" (female, class 1) when the second softmax output is
        strictly larger than the first, otherwise "nam" (male, class 0).
    """
    # Take both limits from the model itself rather than hard-coding 28, so
    # the function stays correct if the training data (and thus the padded
    # width or vocabulary size) changes.
    seq_len = model.input_shape[1]
    vocab_size = model.layers[0].input_dim
    # The Embedding layer only has rows for indices < vocab_size; a character
    # never seen in training with a larger code point would be an invalid
    # index. Map such characters to 0, the same value used for padding.
    data = [ord(ch) if ord(ch) < vocab_size else 0 for ch in name]
    inp = pad_sequences([data], maxlen=seq_len)
    print(inp)
    pred = model.predict(inp)[0]
    if pred[0] < pred[1]:
        return "nữ"
    else:
        return "nam"

# Try the predictor on a sample string; the returned label is the cell output.
gender("death click")

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 100
  101  97 116 104  32  99 108 105  99 107]]


'nam'

In [14]:
# Write the trained weights to ./model_v2.h5 so they can be reloaded later
# with model.load_weights (the model must be rebuilt first).
model.save_weights('./model_v2.h5')

Vì không thể cài `tensorflowjs` trên env hiện tại nên mình phải lưu weights của model ra file `model_v2.h5`, sau đó chuyển sang một env khác, dựng lại model (chạy lại các cell phía trên) rồi load weights để export model ra JSON.

In [7]:
# Runs in a separate environment where tensorflowjs is installed; `model`
# must already have been rebuilt in this session by the cells above.
import tensorflowjs as tfjs
# Restore the weights written by the save_weights cell.
model.load_weights('./model_v2.h5')
# Export the Keras model with the TensorFlow.js converter into ./model_v2.
tfjs.converters.save_keras_model(model, './model_v2')