<a href="https://colab.research.google.com/github/morrisalp/taatik/blob/master/Learn_transliteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
from tqdm import tqdm
tqdm.pandas()
import tensorflow as tf
import pandas as pd
from google.colab import files
import io
import re
import unicodedata
import numpy as np

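Reference: [A ten-minute introduction to sequence-to-sequence learning in Keras](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html) — the encoder–decoder model built below follows this tutorial.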

# Load and clean transliteration data from Wiktionary

In [0]:
# Colab-only: upload hebrew_wiki_translit.csv here. The loading cell below
# reads the file's bytes from uploaded['hebrew_wiki_translit.csv'].
uploaded = files.upload()

Saving hebrew_wiki_translit.csv to hebrew_wiki_translit.csv


In [0]:
# cleaning functions
def normalize(text):
  """Return `text` in Unicode NFC form with a few Hebrew marks canonicalised.

  NFC puts multiple nikkud on one letter into a canonical order. After that,
  each character below is mapped one-for-one (or dropped).
  """
  replacements = str.maketrans({
      '\u05ba': '\u05b9',  # normalize holam
      '\u05bd': None,      # remove meteg
      '\u05be': '-',       # replace maqaf with hyphen
      '״': '"',            # Hebrew gershayim -> ASCII double quote
      '׳': "'",            # Hebrew geresh -> ASCII apostrophe
  })
  composed = unicodedata.normalize('NFC', text)
  return composed.translate(replacements)
# Regex fragments for a piped wiki link of the form [[target|label]].
open_brackets = r'\[\['
close_brackets = r'\]\]'
vertical_bar = r'\|'
not_bracket = r'[^\]]'
# Group 1 captures the label, i.e. the text after the last '|' inside the link.
link_regex = f'{open_brackets}(?:{not_bracket}*{vertical_bar})({not_bracket}*){close_brackets}'
def remove_links(text):
  """Strip wiki markup from `text`.

  Piped links are replaced by their label, e.g.
     [[אדריכלות|אַדְרִיכָלוּת]] [[נוף|נוֹף]]
  becomes
     אַדְרִיכָלוּת נוֹף
  Then {{...}} templates and (...) parenthesised spans are deleted.
  Links without a '|' are left untouched.
  """
  labels_only = re.sub(link_regex, r'\1', text)
  without_templates = re.sub(r'\{\{[^\}]*\}\}', '', labels_only)
  without_parens = re.sub(r'\([^\)]*\)', '', without_templates)
  return without_parens
# Load the scraped transliterations and clean each column.
# keep_default_na = False keeps empty cells as '' rather than NaN, so the
# .str operations below always see strings.
df = pd.read_csv(io.BytesIO(uploaded['hebrew_wiki_translit.csv']), keep_default_na = False)
# Hebrew gershayim / geresh -> ASCII quote marks (same mapping as normalize()).
df.word = df.word.str.replace('״', '"').str.replace("׳", "'").str.strip()
df.nikkud = df.nikkud.apply(normalize).apply(remove_links).str.strip()
# The pattern is a character class, so it has to be applied as a regex.
# pandas >= 2.0 defaults Series.str.replace to literal matching, which would
# silently turn this step into a no-op; request regex matching explicitly.
# NOTE(review): U+200F (right-to-left mark) is mapped to an apostrophe here,
# not removed -- confirm that is intended.
df.transliteration = df.transliteration.str.lower().str.replace("[׳\u200f]", "'", regex = True).str.strip()
# Split multiword entries into one row per word. Only rows whose three columns
# have the same number of whitespace-separated tokens can be aligned.
n_word, n_nikkud, n_translit = [x.str.split().apply(len) for x in [df.word, df.nikkud, df.transliteration]]
df = df[(n_word == n_nikkud) & (n_nikkud == n_translit)]
# Each source row becomes a small frame with its own 0..k-1 index, so index
# values repeat in the concatenated result.
df = pd.concat([
    pd.DataFrame({
        'word': t.word.split(),
        'nikkud': t.nikkud.split(),
        'transliteration': t.transliteration.split()
    })
    for t in tqdm(df.itertuples(), desc = 'Splitting multiwords', total = df.shape[0])
])
# Remove rows containing bad characters.
df = df[
    # word: starts with a Hebrew letter and holds only letters, spaces, quotes
    df.word.str.match('[א-ת]') &
    ~df.word.str.contains("[^א-ת '\"]") &
    # nikkud: non-empty, only characters from U+0590-U+05FF plus spaces/quotes,
    # and not made up solely of bare letters
    (df.nikkud != '') &
    ~df.nikkud.str.contains(r'[^\u0590-\u05ff \'"]') &
    ~df.nikkud.str.match('^[א-ת \'"]*$') &
    # transliteration: has at least one a-z letter and only a-z, spaces, quotes
    df.transliteration.str.contains('[a-z]') &
    ~df.transliteration.str.contains('[^a-z \'"]')
]


Splitting multiwords: 100%|██████████| 11922/11922 [00:08<00:00, 1478.94it/s]


In [0]:
# (rows, columns) after cleaning; each row is a single word.
df.shape

(15490, 3)

# Make data sequences

In [0]:
# Reserved symbols, always placed first in each charset:
# '^' start marker, '$' stop marker, ' ' padding (see pad_word).
symbols = '^$ '
# Longest unpadded string on each side.
nikkud_maxlen = df.nikkud.map(len).max()
translit_maxlen = df.transliteration.map(len).max()
# Every character seen in the data, excluding the reserved symbols.
nikkud_unique_chars = {ch for text in df.nikkud for ch in text} - set(symbols)
translit_unique_chars = {ch for text in df.transliteration for ch in text} - set(symbols)
# A character's position in the charset string is its one-hot index.
nikkud_charset = symbols + ''.join(sorted(nikkud_unique_chars))
translit_charset = symbols + ''.join(sorted(translit_unique_chars))

In [0]:
# For each side: longest word length, charset size, and the charset itself.
print(nikkud_maxlen, len(nikkud_charset), nikkud_charset)
print(translit_maxlen, len(translit_charset), translit_charset)

31 46 ^$ "'ְֱֲֳִֵֶַָֹֻּׁׂאבגדהוזחטיךכלםמןנסעףפץצקרשת
25 31 ^$ "'abcdefghijklmnopqrstuvwxyz


In [0]:
def pad_word(word, pad_length):
  """Wrap `word` as '^' + word + spaces + '$'.

  Spaces are added so the word part is `pad_length` characters long; a word
  already longer than `pad_length` gets no padding and is not truncated.
  """
  padding = ' ' * (pad_length - len(word))
  return ''.join(('^', word, padding, '$'))
def word2onehot(word, charset, pad_length):
  """One-hot encode `word` after padding it with pad_word().

  Each character of the padded word is mapped to its position in `charset`
  and passed to to_categorical with num_classes = len(charset).

  Raises:
    ValueError: if the padded word contains a character that is not in
      `charset`. The message names the offending character.
  """
  indices = []
  for c in pad_word(word, pad_length):
    try:
      indices.append(charset.index(c))
    except ValueError:
      # Same exception type as before, but say which character was the problem
      # instead of the opaque "substring not found".
      raise ValueError(
          f'character {c!r} (U+{ord(c):04X}) in {word!r} is not in the charset'
      ) from None
  return tf.keras.utils.to_categorical(indices, num_classes = len(charset))
def nikkud2onehot(nikkud):
  """One-hot encode a nikkud string using `nikkud_charset`, padded to `nikkud_maxlen`."""
  return word2onehot(nikkud, nikkud_charset, nikkud_maxlen)
def translit2onehot(translit):
  """One-hot encode a transliteration using `translit_charset`, padded to `translit_maxlen`."""
  return word2onehot(translit, translit_charset, translit_maxlen)

In [0]:
# Sanity check: encode a single word.
nikkud2onehot('צִיתָר')

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [0]:
# X: one-hot nikkud sequences (encoder input).
# Y: one-hot transliteration sequences (decoder input).
# Both have one entry per row of df, in the same order.
X = np.array([nikkud2onehot(nikkud) for nikkud in df.nikkud])
Y = np.array([translit2onehot(translit) for translit in df.transliteration])

In [0]:
# teacher forcing - Z[:, t, :] = Y[:, t + 1, :]
# The final timestep has no successor. np.roll would wrap the start token '^'
# around into that slot and train the decoder to emit '^' after '$'. Leave the
# last step all-zero instead, so it contributes nothing to the cross-entropy.
Z = np.zeros_like(Y)
Z[:, :-1, :] = Y[:, 1:, :]

In [0]:
# Shapes are (words, timesteps, charset size); timesteps is the max word
# length plus the '^' and '$' markers added by pad_word.
X.shape, Y.shape, Z.shape

((15490, 33, 46), (15490, 27, 31), (15490, 27, 31))

In [0]:
# Spot-check the one-step shift: targets at step 10 equal inputs at step 11.
(Z[:, 10, :] == Y[:, 11, :]).all()

True

In [0]:
def onehot2word(onehot, charset):
  """Decode a sequence of one-hot (or probability) rows back into a string.

  Each row contributes the `charset` character at the row's argmax.
  """
  chars = []
  for row in onehot:
    chars.append(charset[np.argmax(row)])
  return ''.join(chars)
def onehot2nikkud(onehot):
  """Decode one-hot rows to a string using `nikkud_charset`."""
  return onehot2word(onehot, nikkud_charset)
def onehot2translit(onehot):
  """Decode one-hot rows to a string using `translit_charset`."""
  return onehot2word(onehot, translit_charset)

In [0]:
# Round-trip check: decode one decoder-input row back to padded text.
onehot2translit(Y[1, :, :])

'^eugenika                 $'

In [0]:
# The matching target row: the same text shifted left by one step.
onehot2translit(Z[1, :, :])

'eugenika                 $^'

# Build seq2seq nikkud=>translit (N2T) model

In [0]:
# Training hyperparameters.
latent_dim = 256  # units in the encoder and decoder LSTMs
batch_size = 256  # previously tried: 128, 64
epochs = 100
validation_split = 0.2  # passed to model.fit as validation_split

In [0]:
# Define an input sequence and process it.
# Input is (timesteps, nikkud charset size); timesteps is left as None.
encoder_inputs = tf.keras.layers.Input(shape = (None, len(nikkud_charset)))
encoder = tf.keras.layers.LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [0]:
# Set up the decoder, using `encoder_states` as initial state.
# Input is (timesteps, transliteration charset size); timesteps is left as None.
decoder_inputs = tf.keras.layers.Input(shape = (None, len(translit_charset)))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state = encoder_states)
# Softmax over the transliteration charset at every timestep.
decoder_dense = tf.keras.layers.Dense(len(translit_charset), activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [0]:
# Define the training model: it takes the encoder input and the decoder input
# and outputs the decoder's per-timestep character distribution.
# It is fitted below with inputs [X, Y] and target Z.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [0]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           (None, None, 46)     0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           (None, None, 31)     0                                            
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, 256), (None, 310272      input_31[0][0]                   
__________________________________________________________________________________________________
lstm_15 (LSTM)                  [(None, None, 256),  294912      input_32[0][0]                   
                                                                 lstm_14[0][1]                    
          

# Train N2T model

In [0]:
# Run training.
# Inputs are [X, Y] (nikkud sequence, transliteration sequence) and the target
# is Z, i.e. Y shifted one step ahead (teacher forcing).
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')
model.fit([X, Y], Z,
          batch_size = batch_size,
          epochs = epochs,
          validation_split = validation_split)

Train on 12392 samples, validate on 3098 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

<tensorflow.python.keras.callbacks.History at 0x7f7e3efe47f0>

# Build N2T inference model

In [0]:
# Encoder for inference: nikkud one-hot sequence -> final LSTM states [h, c].
encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

# Decoder for inference: takes the decoder input plus the current [h, c]
# states and returns the character distribution plus the updated states.
# It reuses the trained `decoder_lstm` and `decoder_dense` layers.
decoder_state_input_h = tf.keras.layers.Input(shape = (latent_dim,))
decoder_state_input_c = tf.keras.layers.Input(shape = (latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Use inference-specific names here. Rebinding `decoder_outputs`, `state_h`
# and `state_c` would clobber the training-graph tensors, and re-running the
# training-model cell afterwards would then build a disconnected graph.
inference_decoder_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state = decoder_states_inputs)
decoder_states = [inference_state_h, inference_state_c]
inference_decoder_outputs = decoder_dense(inference_decoder_outputs)
decoder_model = tf.keras.models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [inference_decoder_outputs] + decoder_states)

In [0]:
def decode_sequence(input_text, input_seq):
    """Greedily decode one encoded input into a transliteration string.

    Args:
        input_text: the text that `input_seq` encodes. Not used by the
            decoding itself; kept so existing callers keep working.
        input_seq: one-hot encoder input for a batch of exactly one word.

    Returns:
        The decoded characters, including any padding spaces and the
        terminating '$' if one was produced. Decoding stops at the first '$'
        or once the output is longer than `translit_maxlen` characters.
    """
    charset_size = len(translit_charset)

    # Encode the input as state vectors [h, c].
    states_value = list(encoder_model.predict(input_seq, verbose = 0))

    # Target sequence of length 1, holding only the start character.
    target_seq = np.zeros((1, 1, charset_size))
    target_seq[0, 0, translit_charset.index('^')] = 1.

    # Sampling loop (batch of size 1): feed back the most likely character
    # and the updated states until a stop condition is met.
    decoded_sentence = ''
    while True:
        # verbose = 0 keeps predict() from printing a progress bar per step.
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq] + states_value, verbose = 0)

        # Greedy choice: the most probable character at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = translit_charset[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either find the stop character or hit max length.
        if sampled_char == '$' or len(decoded_sentence) > translit_maxlen:
            break

        # The next decoder input is the character just produced.
        target_seq = np.zeros((1, 1, charset_size))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the LSTM states forward.
        states_value = [state_h, state_c]

    return decoded_sentence

In [0]:
def n2t(nikkud):
  """Transliterate one nikkud word with the trained model.

  The input is normalized and one-hot encoded, given a leading batch axis,
  then decoded; '$' markers and surrounding whitespace are removed from
  the result.
  """
  cleaned = normalize(nikkud)
  batch = nikkud2onehot(cleaned)[None]  # leading batch axis of size 1
  decoded = decode_sequence(cleaned, batch)
  without_stop = decoded.replace('$', '')
  return without_stop.strip()

# Test N2T predictions

In [0]:
# Smoke test on a single word.
n2t('שַׁחֶפֶת')

'shakhefet'

In [0]:
# Fixed-seed sample of 20 words for a quick spot check.
# NOTE(review): the sample is drawn from all of df, which is also the data the
# model was fitted on, so this is not a held-out evaluation.
df2 = df.sample(n = 20, random_state = 0)

In [0]:
# Predict a transliteration for every sampled word (with a progress bar).
df2['n2t'] = df2.nikkud.progress_apply(n2t)

100%|██████████| 20/20 [00:02<00:00,  9.04it/s]


In [0]:
# Exact string match between the reference and predicted transliterations.
df2['match'] = df2.transliteration == df2.n2t

In [0]:
# Fraction of sampled words whose prediction matches exactly.
df2['match'].mean()

0.7

In [0]:
# Side-by-side view of input, reference, prediction and match flag.
df2[['nikkud', 'transliteration', 'n2t', 'match']]

Unnamed: 0,nikkud,transliteration,n2t,match
0,סֶקְסְטַנְט,sekstant,sextaneya,False
0,מֻגְלָה,mugla,mugla,True
0,מוֹדַעַת,moda'at,moda'at,True
0,בָּרִיא,bari,bari,True
0,שְׁמַרְחֹם,shmarkhom,shmarmon,False
0,מִסְפָּן,mispan,mispan,True
0,רִיבָה,riva,riva,True
0,אֳנִיַּת,oniyat,oniyat,True
1,מִסְחָרִית,miskharit,miskharit,True
0,מֵמֵס,memes,memes,True


In [0]:
# Probe the model with bare single letters. The cleaning step dropped nikkud
# strings made up only of unpointed letters, so these inputs are unlike the
# training data.
[(x, n2t(x)) for x in 'אבגדהוזחטיכךלמםנןסעפףצץקרשת']

[('א', 'os'),
 ('ב', 've'),
 ('ג', 'go'),
 ('ד', 'du'),
 ('ה', 'ho'),
 ('ו', 'wa'),
 ('ז', 'zu'),
 ('ח', 'lek'),
 ('ט', 'to'),
 ('י', 'yu'),
 ('כ', 'ch'),
 ('ך', 'cho'),
 ('ל', 'lo'),
 ('מ', 'mo'),
 ('ם', 'mo'),
 ('נ', 'no'),
 ('ן', 'no'),
 ('ס', 'su'),
 ('ע', "'o"),
 ('פ', 'fo'),
 ('ף', 'fe'),
 ('צ', 'tsa'),
 ('ץ', 'tush'),
 ('ק', 'chu'),
 ('ר', 'rush'),
 ('ש', 'sh'),
 ('ת', 'to')]

In [0]:
# Unpointed input: the model was only fitted on words carrying nikkud.
n2t('שלום')

"'olsm"

In [0]:
# Another unpointed word, for comparison.
n2t('מוריס')

'mursi'