<a href="https://colab.research.google.com/github/narendra974/AIMLOPS_IISC/blob/main/NMT_TRANSFORMERS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [293]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [294]:
%cd /content/drive/MyDrive/DATASETS/ENG_DEU_DATA/

/content/drive/MyDrive/DATASETS/ENG_DEU_DATA


In [295]:
!pip install pycld2



In [296]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import pycld2 as cld2
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import regex as re

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [297]:
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
!python -m spacy download de_core_news_sm
gernlp = spacy.load('de_core_news_sm')
engnlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting de-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.6.0/de_core_news_sm-3.6.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [298]:
def removestop(text,stopwords):
  raw = text.split()
  words = [word for word in raw if not word in stopwords]
  cleanwords = ' '.join(words)
  return cleanwords

def tolower(text):
  return text.lower()

def removespecial(text):
  te1 = re.sub("\s+"," ",text)
  te2 = re.sub('\n', '', te1)
  te3 = re.sub('\r', '', te2)
  te4 = re.sub("[0-9]","",te3)
  te5 = re.sub("()@%^&*-_,/\{}[?|$|.|!]","",te4)
  te6 = re.sub(r"[\p{Cc}\p{Cs}]+","",te5)
  te7 = re.sub(r'[^\w\s]','', te6)
  te8 = re.sub("[^a-zA-Z ]","",te7)
  return te7

def removeurl(text):
  return re.sub('https?://\S+|www\.\S+', '', text)

def clean_text(text):
    text = text.lower()
    pattern = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
    text = re.sub(pattern,' ',text).strip()
    return text

In [299]:
basefolder =  "/content/drive/MyDrive/DATASETS/ENG_DEU_DATA/"
# germanfiles = ["commoncrawl_de_en.txt","europarl-v7_de_en.txt","news-commentary-v9_de_en.txt"]
# engfiles = ["commoncrawl_en_de.txt","europarl-v7_en_de.txt","news-commentary-v9_en_de.txt"]
germanfiles = ["news-commentary-v9_de_en.txt"]
engfiles = ["news-commentary-v9_en_de.txt"]

In [300]:
def read_files(fileloc, language):
  with open(fileloc,"rb") as f:
    f_lines = f.readlines()
  df = pd.DataFrame(f_lines)
  dfc = df.set_axis([language],axis=1)
  dfc[language] = dfc[language].str.decode("utf-8")
  return dfc

In [301]:
dfappend=pd.DataFrame()
for efile in range(len(germanfiles)):
  germanfilepath = basefolder+germanfiles[efile]
  print(germanfilepath)
  germandff = read_files(germanfilepath,"german")
  engfilepath = basefolder+engfiles[efile]
  engdff = read_files(engfilepath,"english")
  print(germandff.shape)
  print(engdff.shape)
  dfconcat = pd.concat([germandff, engdff],axis="columns")
  dfappend=pd.concat([dfappend, dfconcat])

/content/drive/MyDrive/DATASETS/ENG_DEU_DATA/news-commentary-v9_de_en.txt
(201288, 1)
(201288, 1)


In [302]:
dfappend.isna().sum()

german     0
english    0
dtype: int64

In [303]:
dfappend.duplicated().sum()

430

In [304]:
dfappend.drop_duplicates(subset=None, keep='first', inplace=True)
dfappend.shape

(200858, 2)

In [305]:
german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')

dfappend['english_clean'] = dfappend['english'].apply(lambda x: removestop(x,english_stop_words))
dfappend['german_clean'] = dfappend['german'].apply(lambda x: removestop(x,german_stop_words))
dfappend['german_clean'] = dfappend['german_clean'].apply(lambda x: removespecial(x))
dfappend['english_clean'] = dfappend['english_clean'].apply(lambda x: removespecial(x))

In [306]:
def langdet(x):
  isReliable, textBytesFound, details = cld2.detect(x)
  return(details[0][1])

In [307]:
dfappend['is_eng'] = dfappend['english_clean'].apply(lambda x: langdet(x))
dfappend['is_ger'] = dfappend['german_clean'].apply(lambda x: langdet(x))

In [308]:
display(dfappend["is_ger"].value_counts())

de     195236
un       4267
en        861
nl         83
lb         83
ru         83
nn         37
da         35
no         16
na         13
fy         12
af         11
la          7
sv          7
id          7
pt          6
ms          6
fr          5
tk          5
war         5
ie          5
ro          5
rm          5
sk          4
fi          4
tr          3
it          3
pl          3
eu          3
es          3
vo          3
gv          2
cs          2
et          2
tn          2
ha          2
lt          2
gn          2
hr          2
rw          2
ia          1
mfe         1
mt          1
bi          1
tl          1
lv          1
zzp         1
ca          1
az          1
sco         1
fo          1
jw          1
ts          1
aa          1
Name: is_ger, dtype: int64

In [309]:
display(dfappend["is_eng"].value_counts())

en     196526
un       4140
sco        26
ru         24
da         20
zh         17
ca         10
ms          7
la          7
id          5
tr          5
oc          5
rw          5
rm          4
gn          3
pt          3
gv          3
zzp         3
es          3
de          2
co          2
fr          2
pl          2
ro          2
bs          2
mfe         2
nn          2
uz          2
ie          2
ha          2
hmn         1
az          1
lt          1
mg          1
af          1
vo          1
gl          1
no          1
fj          1
sq          1
sn          1
ts          1
sr          1
ceb         1
kha         1
hu          1
ga          1
bi          1
war         1
tk          1
Name: is_eng, dtype: int64

In [310]:
dfappendclean = dfappend[(dfappend.is_eng == 'en') & (dfappend.is_ger =='de')]

In [311]:
dfappendclean.shape

(192441, 6)

In [312]:
display(dfappendclean)

Unnamed: 0,german,english,english_clean,german_clean,is_eng,is_ger
1,"SAN FRANCISCO – Es war noch nie leicht, ein ra...",SAN FRANCISCO – It has never been easy to have...,SAN FRANCISCO It never easy rational conversa...,SAN FRANCISCO Es nie leicht rationales Gesprä...,en,de
2,In letzter Zeit allerdings ist dies schwierige...,"Lately, with gold prices up more than 300% ove...",Lately gold prices last decade harder ever,In letzter Zeit allerdings schwieriger je Gold...,en,de
3,Erst letzten Dezember verfassten meine Kollege...,"Just last December, fellow economists Martin F...",Just last December fellow economists Martin Fe...,Erst letzten Dezember verfassten Kollegen Mart...,en,de
4,"Und es kam, wie es kommen musste.\n",Wouldn’t you know it?\n,Wouldnt know it,Und kam kommen musste,en,de
5,Seit der Veröffentlichung ihrer Artikel ist de...,"Since their articles appeared, the price of go...",Since articles appeared price gold moved still...,Seit Veröffentlichung Artikel Goldpreis gestiegen,en,de
...,...,...,...,...,...,...
201283,Das bleibt eine der größten Errungenschaften i...,Their achievement remains one of the greatest ...,Their achievement remains one greatest recent ...,Das bleibt größten Errungenschaften jüngeren G...,en,de
201284,Gleichzeitig scheint sich Zumas revolutionäre ...,"At the same time, Zuma’s revolutionary generat...",At time Zumas revolutionary generation still s...,Gleichzeitig scheint Zumas revolutionäre Gener...,en,de
201285,"In einer Region, wo die älteren Menschen sehr ...","In a region that reveres the elderly, Zuma’s a...",In region reveres elderly Zumas attachment rur...,In Region älteren Menschen verehrt werden Zuma...,en,de
201286,Drei von zehn Südafrikanern sind jünger als 15...,Three in ten South Africans are younger than 1...,Three ten South Africans younger meaning live...,Drei zehn Südafrikanern jünger bedeutet Tag A...,en,de


In [313]:
dfappendclean["engcount"]=dfappendclean['english_clean'].str.split().str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfappendclean["engcount"]=dfappendclean['english_clean'].str.split().str.len()


In [314]:
train_n=640
total = int(1.20 * train_n)
dfappendclean = dfappendclean.sample(total)
dfappendclean_val = dfappendclean[:train_n]
dfappendclean = dfappendclean[train_n:total]

In [315]:
bos_string = 'bos '
eos_string = ' eos'
dfappendclean['english_clean'] = bos_string + dfappendclean['english_clean'].astype(str) + eos_string
dfappendclean['german_clean'] = bos_string + dfappendclean['german_clean'].astype(str) + eos_string

en_list =  dfappendclean['english_clean'].astype(str).tolist()
ge_list =  dfappendclean['german_clean'].astype(str).tolist()
en_list_val =  dfappendclean_val['english_clean'].astype(str).tolist()
ge_list_val =  dfappendclean_val['german_clean'].astype(str).tolist()


In [316]:
display(en_list[0:5])

['bos America did indeed turn inward following fall Saigon neglect Afghanistan following Soviet withdrawal  led chaos Al Qaedas neartakeover country eos',
 'bos It also calls government adopt within year legislation prohibit potentially harmful experiments great apes interests eos',
 'bos Russias adherence WTOs legal framework begin make economic relations much stable predictable eos',
 'bos Despite announced reforms unlikely significant exchangerate flexibility Chinas currency eos',
 'bos The group brought together Gwyn Prins wellregarded expert security policy international relations heads LSEs Mackinder Programme Study Long Wave Events eos']

In [317]:
display(ge_list[0:5])

['bos Tatsächlich zog Amerika Fall Saigons zurück Vernachlässigung Afghanistans sowjetischen Rückzug Jahr  führte Chaos beinahen Machtübernahme AlKaida Land eos',
 'bos Sie verlangt außerdem Regierung innerhalb Jahres Gesetz erlassen potenziell schädliche Experimente Menschenaffen deren Interesse liegen untersagt eos',
 'bos Wenn Russland rechtliche Regelwerks WHO hält dürften Wirtschaftsbeziehungen Land künftig stabiler vorhersehbarer gestalten eos',
 'bos Trotz angekündigter Reformen unwahrscheinlich signifikanten Wechselkursflexibilität chinesischen Währung kommen wird eos',
 'bos Die Expertengruppe traf Initiative Gwyn Prins renommierten Experten Sicherheitspolitik internationale Beziehungen Mackinder Programme LSE Erforschung langfristigen Ereignissen leitet eos']

In [318]:
MAX_TOKENS=128
en_tokenizer = Tokenizer();
en_tokenizer.fit_on_texts(dfappendclean["english_clean"])

en_bos_index = en_tokenizer.word_index['bos']
print(en_bos_index)
en_eos_index = en_tokenizer.word_index['eos']
print(en_eos_index)

1
2


In [319]:
ge_tokenizer = Tokenizer();
ge_tokenizer.fit_on_texts(dfappendclean["german_clean"])

ge_bos_index = ge_tokenizer.word_index['bos']
print(ge_bos_index)
ge_eos_index = ge_tokenizer.word_index['eos']
print(ge_eos_index)

1
2


In [320]:
MAX_TOKENS=128
en = en_tokenizer.texts_to_sequences(en_list)     # Output is ragged.
en = tf.keras.utils.pad_sequences(en, maxlen=MAX_TOKENS, padding='post')

ge = ge_tokenizer.texts_to_sequences(ge_list)
ge = tf.keras.utils.pad_sequences(ge, maxlen=MAX_TOKENS+1, padding='post')

en_val = en_tokenizer.texts_to_sequences(en_list_val)     # Output is ragged.
en_val = tf.keras.utils.pad_sequences(en_val, maxlen=MAX_TOKENS, padding='post')

ge_val = ge_tokenizer.texts_to_sequences(ge_list_val)
ge_val = tf.keras.utils.pad_sequences(ge_val, maxlen=MAX_TOKENS+1, padding='post')


In [321]:
dataset = tf.data.Dataset.from_tensor_slices((en, ge))
dataset_val = tf.data.Dataset.from_tensor_slices((en_val, ge_val))

In [322]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def prepare_batch(en, ge):

    ge_inputs = ge[:, :-1]
    ge_labels = ge[:, 1:]

    return (en, ge_inputs), ge_labels

def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

# Create training and validation set batches.
dataset_batches = make_batches(dataset)
dataset_batches_val = make_batches(dataset_val)

In [323]:
for (pt, en), en_labels in dataset_batches.take(1):
  break

print(pt.shape)
print(en.shape)
print(en_labels.shape)

(64, 128)
(64, 128)
(64, 128)


In [324]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [325]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=128, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


In [326]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [327]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [328]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [329]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [330]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


In [331]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [332]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [333]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [334]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [335]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs
    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

In [336]:
num_layers = 4
d_model = 128
dff = 128
num_heads = 8
dropout_rate = 0.1

In [337]:
en_vocab_size = len(en_tokenizer.word_index)+1;
ge_vocab_size = len(ge_tokenizer.word_index)+1;

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size = en_vocab_size,
    target_vocab_size = ge_vocab_size,
    dropout_rate=dropout_rate)

In [338]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [339]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [340]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [341]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [342]:
transformer.fit(dataset_batches, epochs=1, validation_data=dataset_batches_val)





<keras.callbacks.History at 0x7c45a2027730>

In [343]:
class Translator(tf.Module):
  def __init__(self, en_tokenizer, ge_tokenizer, transformer):
    self.en_tokenizer = en_tokenizer
    self.ge_tokenizer = ge_tokenizer
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):

    # assert isinstance(sentence, tf.Tensor)
    # if len(sentence.shape) == 0:
    #  sentence = sentence[tf.newaxis]
    sentence = self.en_tokenizer.texts_to_sequences(sentence)
    sentence = tf.keras.utils.pad_sequences(sentence, maxlen=MAX_TOKENS, padding='post')
    encoder_input = sentence
    start = tf.constant(ge_bos_index, dtype=tf.int64)[tf.newaxis]
    end = ge_eos_index

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length-1):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])
      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    # print(output[0].numpy())
    text = ge_tokenizer.sequences_to_texts([output[0].numpy()])  # Shape: `()`.

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, attention_weights

In [344]:
translator = Translator(en_tokenizer, ge_tokenizer, transformer)

def print_translation(sentence, translated_text, ground_truth):
  print(f'{"Input":15s}: {sentence}')
  print(f'{"Prediction":15s}: {translated_text[0]}')
  print(f'{"Ground truth":15s}: {ground_truth}')

In [345]:
sentence = ['bos The Chinese symbol form carries reasoning due use ideograms eos']
ground_truth = 'bos Das chinesische Symbol Form verkörpert  aufgrund Verwendung Ideogrammen Chinesischen  Denkweise eos'
translated_text, attention_weights = translator(sentence)
print_translation(sentence, translated_text, ground_truth)

Input          : ['bos The Chinese symbol form carries reasoning due use ideograms eos']
Prediction     : bos gewaltvideospielen potenzielle gewaltvideospielen potenzielle vergleich unterstützung platz finanzentwicklung mittels notwendig mittels unterstützung vergiftet unterstützung vergiftet leistungen vergiftet unterstützung vergiftet unterstützung vergiftet unterstützung vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung betrifft arbor vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung vergiftet vergiftet jährlich betrifft arbor jährlich arbor arbor arbor arafats arbor arafats arbor vergiftet opposition vergiftet ebene vergiftet opposition vergiftet ebene vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet liegt vergiftet liegt vergiftet vergiftet vergiftet unterstützung vergiftet unterstützung iran politischen vergiftet darüber vergiftet unterstützung iran unterstützung iran arbor vergiftet unterstützung vergiftet unte

In [346]:
sentence = ['bos Energy carbon taxes produce less economic pain gain conventional taxes can eos']
ground_truth = 'bos Energie COSteuern wirtschaftlicher Hinsicht weniger schmerzhaft dabei einträglicher herkömmliche Steuern eos'
translated_text, attention_weights = translator(sentence)
print_translation(sentence, translated_text, ground_truth)

Input          : ['bos Energy carbon taxes produce less economic pain gain conventional taxes can eos']
Prediction     : bos gewaltvideospielen potenzielle gewaltvideospielen potenzielle vergleich unterstützung platz komponenten mittels notwendig mittels unterstützung vergiftet unterstützung erfüllen hinaus vergiftet unterstützung vergiftet unterstützung vergiftet unterstützung vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung betrifft arbor vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung vergiftet vergiftet jährlich betrifft arbor jährlich arbor arbor arbor jährlich arafats arbor staat hinaus vergiftet vergiftet vergiftet ebene vergiftet opposition vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet liegt vergiftet liegt vergiftet vergiftet unterstützung vergiftet unterstützung iran durchführt unterstützung vergiftet darüber vergiftet unterstützung iran unterstützung vergiftet unterstützung vergift

In [347]:
sentence = ['bos In India newborn health forms part national Reproductive Child Health Program eos']
ground_truth = 'bos In Indien Gesundheit Neugeborener Teil Programms Fortpflanzungsmedizin Kindergesundheit eos'
translated_text, attention_weights = translator(sentence)
print_translation(sentence, translated_text, ground_truth)

Input          : ['bos In India newborn health forms part national Reproductive Child Health Program eos']
Prediction     : bos gewaltvideospielen potenzielle gewaltvideospielen potenzielle vergleich unterstützung platz komponenten mittels notwendig mittels unterstützung vergiftet unterstützung vergiftet leistungen vergiftet unterstützung vergiftet unterstützung vergiftet unterstützung vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung betrifft arbor vergiftet vergiftet vergiftet vergiftet vergiftet unterstützung vergiftet vergiftet jährlich betrifft arbor jährlich arbor arbor arbor jährlich arafats arbor staat hinaus vergiftet vergiftet ebene vergiftet opposition vergiftet ebene vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet vergiftet liegt vergiftet liegt vergiftet vergiftet unterstützung vergiftet unterstützung iran durchführt unterstützung vergiftet unterstützung iran unterstützung iran unterstützung vergiftet unterstützung ve