In this notebook I train an LSTM network

on the [nettalk corpus](https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+%28Nettalk+Corpus%29)

to perform English to Russian transliteration.

Accuracy achieved: 93%

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Bidirectional

# Load data

In [2]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/nettalk/nettalk.data

--2020-12-03 15:05:23--  https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/nettalk/nettalk.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 541269 (529K) [application/x-httpd-php]
Saving to: ‘nettalk.data’


2020-12-03 15:05:27 (548 KB/s) - ‘nettalk.data’ saved [541269/541269]



In [2]:
with open('nettalk.data') as f:
    data = f.readlines()[10:]
X = []
Y = []
for line in data:
    X.append(list(line.split()[0]))
    Y.append(list(line.split()[1]))
eng_alphabet = list({l for word in X for l in word})
print("Letter alphabet", eng_alphabet)
ph_alphabet = np.array(list({l for word in Y for l in word})) #phoneme alphabet
print("Phoneme alphabet", ph_alphabet)
# see
# https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/nettalk/nettalk.names
# for what phoneme symbols mean

char_to_num = dict((c, i) for i, c in enumerate(eng_alphabet))
num_to_char = dict((i, c) for i, c in enumerate(eng_alphabet))

ph_to_num = dict((c, i) for i, c in enumerate(ph_alphabet))
num_to_ph = dict((i, c) for i, c in enumerate(ph_alphabet))

lengths = [len(word) for word in X]
maxlen = max(lengths)

#add zero padding at the end of each word
x = np.zeros((len(X), maxlen, len(eng_alphabet)), dtype=np.bool)
y = np.zeros((len(X), maxlen, len(ph_alphabet)), dtype=np.bool) 
y[:,:,ph_to_num['-']] = 1 #absense of letter sounds like silence
for i, word in enumerate(X):
    for t, char in enumerate(word):
        x[i, t, char_to_num[char]] = 1
        y[i, t, ph_to_num['-']] = 0
        y[i, t, ph_to_num[Y[i][t]]] = 1
    
print("x.shape", x.shape)
print("y.shape", y.shape)
assert (np.sum(y, axis = 2) == 1).all()

Letter alphabet ['b', 'j', 'i', 'z', 't', 'u', 'l', 'g', 'k', 's', 'h', 'd', 'f', 'm', 'v', 'p', 'x', 'o', 'w', 'e', 'c', 'n', 'y', 'a', 'q', 'r']
Phoneme alphabet ['R' 'b' 'i' 'U' 'T' 'z' 't' 'I' 'u' 'l' 'g' 'Y' 'k' '!' 's' 'G' 'C' 'h'
 'E' 'd' 'f' 'S' 'm' 'Z' '^' 'v' '*' 'p' 'x' '@' 'O' 'M' '+' 'o' '#' 'w'
 'e' 'N' 'K' 'c' 'L' 'A' 'n' 'y' 'X' 'D' 'a' 'W' '-' 'J' 'r']
x.shape (20008, 19, 26)
y.shape (20008, 19, 51)


In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=0, test_size=0.01)

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, random_state=0, test_size=0.02)
print(len(X_train), "Train sequences")
print(len(X_val), "Validation sequences")
print(len(X_test), "Test sequences")
print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)

19410 Train sequences
397 Validation sequences
201 Test sequences
X_train shape:  (19410, 19, 26)
Y_train shape:  (19410, 19, 51)


In [4]:
model = Sequential()
model.add(Masking(mask_value=0, input_shape=(maxlen, len(eng_alphabet))))
model.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(maxlen, len(eng_alphabet))))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(TimeDistributed(Dense(51, activation='softmax')))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=60, batch_size=32, validation_data=(X_val, Y_val),
          callbacks = [keras.callbacks.EarlyStopping(patience=7)])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 19410 samples, validate on 397 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60


<tensorflow.python.keras.callbacks.History at 0x7f3caa718610>

In [6]:
results = model.evaluate(X_test, Y_test, batch_size=128)
print("test loss, test acc:", results)
model.save('translit.h5')  

test loss, test acc: [0.08416331552006119, 0.93391776]


In [7]:
import numpy as np
maxlen = 19
def predict(word, model):
    x = np.zeros((1, maxlen, len(eng_alphabet)), dtype = np.bool)
    for t, char in enumerate(word):
        x[0, t, char_to_num[char]] = 1
    y = np.argmax(model.predict(x), axis = 2)[0][:len(word)]
    return([num_to_ph[num] for num in y])

In [8]:
#transcribe phonemes into russian
ph_to_rus = {   
  "a":"о",
  "b":"б",
  "c":"o",
  "d":"д",
  "e":"эй", # ей
  "f":"ф",
  "g":"г",
  "h":"х",
  "i":"и",
  "k":"к",
  "l":"л",
  "m":"м",
  "n":"н",
  "o":"оу",
  "p":"п",
  "r":"р",
  "s":"с",
  "t":"т",
  "u":"у",
  "v":"в",
  "w":"в",
  "x":"э",
  "y":"й",
  "z":"з",
  "A":"ай",
  "C":"ч",
  "D":"з",
  "E":"э",
  "G":"нг", 
  "I":"и",
  "J":"дж",
  "K":"кш",
  "L":"л", # или "эл"
  "M":"м",
  "N":"н",
  "O":"ой",
  "Q":"кв",
  "R":"ёр",
  "S":"ш",
  "T":"с",
  "U":"у",
  "W":"ау",
  "X":"кс",
  "Y":"ью", #после гласной или в начале слова = ю, после согласной = ью
  "Z":"ж",
  "@":"э",
  "!":"ц",
  "#":"гз",
  "*":"в",
  "^":"а",
  "+":"уа",
  "-":""
}
def translit(word, model):
    phonemes = predict(word, model)
    return ''.join([ph_to_rus[ph] for ph in phonemes])

In [9]:
#some random english words not from the training set:
wordlist = """telling educated colorful decisive reuse hissing dazzling idolatry development reproduction
rebellious fervor gleeful doleful harmful bitterness despite regardless copyright security""".split()
for word in wordlist:
    print(translit(word, model))

тэлинг
эджэкэйтэд
калёрфал
дисайсив
руз
хизинг
дэзлинг
айдлэтри
дэвэлопмэнт
рэпрэдакшэн
рибэлйэс
фёрвёр
глифал
доулфал
хормфал
битёрнэс
диспайт
ригёрдлэс
копэрайт
сикьюрэти
