<a href="https://colab.research.google.com/github/narendra974/AIMLOPS_IISC/blob/main/NMT_TRANSFORMERS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem so the dataset files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the dataset folder on the mounted Drive.
%cd /content/drive/MyDrive/DATASETS/ENG_DEU_DATA/

/content/drive/MyDrive/DATASETS/ENG_DEU_DATA


In [None]:
!pip install pycld2



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import pycld2 as cld2
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import regex as re

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [None]:
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
!python -m spacy download de_core_news_sm
gernlp = spacy.load('de_core_news_sm')
engnlp = spacy.load('en_core_web_sm')

In [None]:
def removestop(text,stopwords):
  """Remove stop words from a whitespace-separated string.

  Args:
    text: input string; it is split on whitespace.
    stopwords: iterable of words to drop. Matching is exact and
      case-sensitive.

  Returns:
    The remaining words joined by single spaces.
  """
  # A set gives constant-time membership tests instead of scanning the
  # stop-word list once per word.
  stopword_set = set(stopwords)
  return ' '.join(word for word in text.split() if word not in stopword_set)

def tolower(text):
  """Return `text` converted to lower case."""
  lowered = text.lower()
  return lowered

def removespecial(text):
  """Collapse whitespace and strip ASCII digits and non-word characters.

  Steps, in order:
    1. every run of whitespace (including newlines) becomes one space;
    2. the ASCII digits 0-9 are removed;
    3. every character that is neither a word character nor whitespace is
       removed (punctuation, symbols, control characters).

  Non-ASCII letters such as German umlauts are word characters and are kept.
  Spaces are not re-collapsed after steps 2 and 3, so removing a token that
  sat between two spaces leaves both spaces behind.

  Args:
    text: input string.

  Returns:
    The cleaned string.
  """
  # Raw strings avoid the invalid-escape warnings that "\s" triggers in a
  # normal string literal.
  collapsed = re.sub(r"\s+", " ", text)
  no_digits = re.sub(r"[0-9]", "", collapsed)
  # This single filter also covers newline/carriage-return and control
  # characters, so no separate substitutions are needed for them.
  return re.sub(r"[^\w\s]", "", no_digits)

def removeurl(text):
  """Remove URLs from `text`.

  A URL is anything starting with `http://`, `https://` or `www.` and
  running up to the next whitespace character. Surrounding spaces are left
  untouched.
  """
  # Raw string: "\S" and "\." are invalid escapes in a normal string literal.
  return re.sub(r'https?://\S+|www\.\S+', '', text)

def clean_text(text):
    """Lower-case `text` and blank out mentions, URLs and non-alphanumerics.

    Each of the following is replaced by a single space, after which the
    result is stripped of leading/trailing whitespace:
      * `@name` mentions (ASCII letters and digits after the `@`),
      * any character other than ASCII letters, digits, space and tab,
      * `scheme://...` URLs up to the next whitespace,
      * a leading `rt`,
      * `http` followed by one more character.

    Args:
        text: input string.

    Returns:
        The cleaned, stripped string.
    """
    text = text.lower()
    # The mention alternative must use an unescaped character class; with
    # `\[` it only matched a literal "@[A-Za-z0-9]" and mentions slipped
    # through with just the "@" removed.
    pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
    text = re.sub(pattern,' ',text).strip()
    return text

In [None]:
# Folder on the mounted Drive that holds the corpus files.
basefolder =  "/content/drive/MyDrive/DATASETS/ENG_DEU_DATA/"
# Additional corpus file pairs, currently disabled:
# germanfiles = ["commoncrawl_de_en.txt","europarl-v7_de_en.txt","news-commentary-v9_de_en.txt"]
# engfiles = ["commoncrawl_en_de.txt","europarl-v7_en_de.txt","news-commentary-v9_en_de.txt"]
# Active corpus. The two lists are paired by position: entry i of each list
# is loaded together by the loading loop further down.
germanfiles = ["news-commentary-v9_de_en.txt"]
engfiles = ["news-commentary-v9_en_de.txt"]

In [None]:
def read_files(fileloc, language):
  """Read a UTF-8 text file into a one-column DataFrame, one row per line.

  The file is read in binary mode, so lines are split on "\\n" only and each
  row keeps its trailing newline.

  Args:
    fileloc: path of the file to read.
    language: name given to the single column.

  Returns:
    A DataFrame with one column named `language` holding the decoded lines.
    An empty file yields an empty DataFrame that still has that column.

  Raises:
    UnicodeDecodeError: if a line is not valid UTF-8.
  """
  with open(fileloc,"rb") as f:
    raw_lines = f.readlines()
  # Building the frame from a dict keeps the column present even when the
  # file has no lines; relabelling a column-less frame would raise.
  return pd.DataFrame({language: [line.decode("utf-8") for line in raw_lines]})

In [None]:
# Load every (German, English) file pair and stack them into one frame with
# the columns "german" and "english".
if len(germanfiles) != len(engfiles):
  raise ValueError("germanfiles and engfiles must list the same number of files")

pair_frames = []
for germanfile, engfile in zip(germanfiles, engfiles):
  germanfilepath = basefolder+germanfile
  print(germanfilepath)
  germandff = read_files(germanfilepath,"german")
  engfilepath = basefolder+engfile
  engdff = read_files(engfilepath,"english")
  print(germandff.shape)
  print(engdff.shape)
  # Side-by-side: row i of the German file is paired with row i of the English file.
  pair_frames.append(pd.concat([germandff, engdff],axis="columns"))

# Concatenate once instead of growing the frame inside the loop, and renumber
# the rows so several files do not produce duplicate index labels.
dfappend = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

In [None]:
# Number of missing values in each column.
dfappend.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
dfappend.duplicated().sum()

In [None]:
# Keep the first occurrence of every fully duplicated row, then show the new size.
dfappend = dfappend.drop_duplicates(keep='first')
dfappend.shape

In [None]:
german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')

# For each language: drop stop words first, then run `removespecial` on the result.
dfappend['english_clean'] = (
    dfappend['english']
    .apply(removestop, args=(english_stop_words,))
    .apply(removespecial))
dfappend['german_clean'] = (
    dfappend['german']
    .apply(removestop, args=(german_stop_words,))
    .apply(removespecial))

In [None]:
def langdet(x):
  """Return the second field of the first cld2 detection entry for `x`.

  Later cells compare this value with 'en' and 'de', i.e. it is used as a
  language code.
  """
  _reliable, _bytes_found, ranked = cld2.detect(x)
  first_entry = ranked[0]
  return first_entry[1]

In [None]:
# Tag every cleaned sentence with the language reported by `langdet`.
dfappend['is_eng'] = dfappend['english_clean'].apply(langdet)
dfappend['is_ger'] = dfappend['german_clean'].apply(langdet)

In [None]:
# How often each `langdet` result occurs for the German column.
display(dfappend["is_ger"].value_counts())

In [None]:
# How often each `langdet` result occurs for the English column.
display(dfappend["is_eng"].value_counts())

In [None]:
# Keep only pairs whose English side was detected as 'en' and whose German
# side was detected as 'de'. `.copy()` makes the result an independent frame,
# so adding columns to it later does not trigger SettingWithCopyWarning.
dfappendclean = dfappend[(dfappend.is_eng == 'en') & (dfappend.is_ger =='de')].copy()

In [None]:
# Rows and columns remaining after the language filter.
dfappendclean.shape

In [None]:
# Visual check of the filtered sentence pairs.
display(dfappendclean)

In [None]:
# Word count of each cleaned English sentence. `assign` returns a new frame,
# which avoids writing a column into what may be a slice of `dfappend`.
dfappendclean = dfappendclean.assign(
    engcount=dfappendclean['english_clean'].str.split().str.len())

In [None]:
# Draw one random sample and cut it into disjoint training and validation
# parts. The training slice must stop at `n`: a larger upper bound would keep
# the validation rows in the training set as well.
n=10000          # number of training rows
n_val=4000       # number of validation rows
sampled_pairs = dfappendclean.sample(n=n + n_val, random_state=42)  # seeded for reproducibility
dfappendclean_val = sampled_pairs[n:].copy()
dfappendclean = sampled_pairs[:n].copy()

In [None]:
bos_string = 'bos '
eos_string = ' eos'

def _with_markers(frame):
  """Return a copy of `frame` whose cleaned sentences are wrapped in bos/eos."""
  return frame.assign(
      english_clean=bos_string + frame['english_clean'].astype(str) + eos_string,
      german_clean=bos_string + frame['german_clean'].astype(str) + eos_string)

# Training and validation sentences must be wrapped the same way; otherwise
# the validation decoder inputs would not start with `bos` as in training.
dfappendclean = _with_markers(dfappendclean)
dfappendclean_val = _with_markers(dfappendclean_val)

en_list =  dfappendclean['english_clean'].astype(str).tolist()
ge_list =  dfappendclean['german_clean'].astype(str).tolist()
en_list_val =  dfappendclean_val['english_clean'].astype(str).tolist()
ge_list_val =  dfappendclean_val['german_clean'].astype(str).tolist()


In [None]:
# First five English training sentences.
display(en_list[0:5])

In [None]:
# First five German training sentences.
display(ge_list[0:5])

In [None]:
MAX_TOKENS = 128

# Fit the English vocabulary on the training sentences only.
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(dfappendclean["english_clean"])

# Integer ids assigned to the sentence markers.
en_bos_index = en_tokenizer.word_index['bos']
print(en_bos_index)
en_eos_index = en_tokenizer.word_index['eos']
print(en_eos_index)

In [None]:
# Fit the German vocabulary on the training sentences only.
ge_tokenizer = Tokenizer()
ge_tokenizer.fit_on_texts(dfappendclean["german_clean"])

# Integer ids assigned to the sentence markers.
ge_bos_index = ge_tokenizer.word_index['bos']
print(ge_bos_index)
ge_eos_index = ge_tokenizer.word_index['eos']
print(ge_eos_index)

In [None]:
MAX_TOKENS=128

def _encode_and_pad(tokenizer, texts, maxlen):
  """Turn `texts` into id sequences and zero-pad them at the end to `maxlen`."""
  ragged_ids = tokenizer.texts_to_sequences(texts)     # Output is ragged.
  return tf.keras.utils.pad_sequences(ragged_ids, maxlen=maxlen, padding='post')

# German sequences get one extra position because they are later split into
# decoder inputs ([:-1]) and labels ([1:]).
en = _encode_and_pad(en_tokenizer, en_list, MAX_TOKENS)
ge = _encode_and_pad(ge_tokenizer, ge_list, MAX_TOKENS+1)

en_val = _encode_and_pad(en_tokenizer, en_list_val, MAX_TOKENS)
ge_val = _encode_and_pad(ge_tokenizer, ge_list_val, MAX_TOKENS+1)


In [None]:
# One dataset element per sentence pair: (English ids, German ids).
dataset = tf.data.Dataset.from_tensor_slices((en, ge))
dataset_val = tf.data.Dataset.from_tensor_slices((en_val, ge_val))

In [None]:
# Shuffle-buffer size and batch size used by `make_batches`.
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def prepare_batch(en, ge):
    """Split a batch of German ids into decoder inputs and shifted labels.

    Args:
        en: batch of English ids, passed through unchanged.
        ge: 2-D batch of German ids.

    Returns:
        `((en, ge_inputs), ge_labels)` where `ge_inputs` is `ge` without its
        last column and `ge_labels` is `ge` without its first column, so the
        label at position t is the token that follows input position t.
    """
    decoder_inputs, decoder_labels = ge[:, :-1], ge[:, 1:]
    return (en, decoder_inputs), decoder_labels

def make_batches(ds):
  """Shuffle, batch and split `ds` into model inputs and labels.

  Args:
    ds: dataset of (English ids, German ids) pairs.

  Returns:
    A prefetched dataset of `((en, ge_inputs), ge_labels)` batches.
  """
  shuffled = ds.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  prepared = batched.map(prepare_batch, tf.data.AUTOTUNE)
  return prepared.prefetch(buffer_size=tf.data.AUTOTUNE)

# Create training and validation set batches.
# Both datasets go through the same pipeline, including the shuffle step.
dataset_batches = make_batches(dataset)
dataset_batches_val = make_batches(dataset_val)

In [None]:
# Pull one training batch to inspect tensor shapes. The loop targets use
# dedicated names so they do not overwrite `en`, the padded English array
# that the dataset was built from.
for (en_batch, ge_input_batch), ge_label_batch in dataset_batches.take(1):
  break

print(en_batch.shape)
print(ge_input_batch.shape)
print(ge_label_batch.shape)

In [None]:
def positional_encoding(length, depth):
  """Build a sinusoidal position table.

  Args:
    length: number of positions (rows).
    depth: encoding width; half the columns hold sines, half cosines.

  Returns:
    A float32 tensor whose first `depth/2` columns are the sines and whose
    remaining columns are the cosines of `position / 10000**(i / (depth/2))`.
  """
  half_depth = depth/2

  position_column = np.arange(length)[:, np.newaxis]                # (seq, 1)
  depth_fractions = np.arange(half_depth)[np.newaxis, :]/half_depth  # (1, depth)

  angle_rads = position_column * (1 / (10000**depth_fractions))     # (pos, depth)

  sin_then_cos = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(sin_then_cos, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding with an added sinusoidal position signal.

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: embedding width.
  """
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    # mask_zero=True: id 0 is treated as padding and masked.
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    # Table precomputed for 2048 positions; `call` slices it to the input length.
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    # Expose the embedding's padding mask as this layer's mask.
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positional_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """Shared pieces of the attention blocks.

  All keyword arguments are forwarded to `tf.keras.layers.MultiHeadAttention`.
  Subclasses implement `call` using `self.mha`, `self.add` and `self.layernorm`.
  """
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  """Attention from `x` (queries) over `context` (keys and values)."""
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    # Residual connection followed by layer normalisation.
    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [None]:
class GlobalSelfAttention(BaseAttention):
  """Self-attention with no causal mask: query, key and value are all `x`."""
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    # Residual connection followed by layer normalisation.
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class CausalSelfAttention(BaseAttention):
  """Self-attention over `x` with `use_causal_mask=True`."""
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    # Residual connection followed by layer normalisation.
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """Two-layer dense block with dropout, residual connection and layer norm.

  Args:
    d_model: output width of the block.
    dff: width of the hidden ReLU layer.
    dropout_rate: dropout applied after the second dense layer.
  """
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    # Residual connection around the dense stack, then layer normalisation.
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward block.

  Args:
    d_model: model width; also used as `key_dim` of the attention heads.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout used in both the attention and the feed-forward block.
  """
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block does not silently fall
    # back to its own default when a different rate is requested.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Embeds source token ids and runs them through a stack of encoder layers.

  Args:
    num_layers: number of `EncoderLayer` blocks.
    d_model: model width.
    num_heads: attention heads per layer.
    dff: hidden width of each feed-forward block.
    vocab_size: size of the source vocabulary.
    dropout_rate: dropout applied after the embedding and inside each layer.
  """
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                        dff=dff, dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_kwargs) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` holds token ids of shape (batch, seq_len); the result has shape
    # (batch_size, seq_len, d_model).
    hidden = self.dropout(self.pos_embedding(x))

    for encoder_layer in self.enc_layers:
      hidden = encoder_layer(hidden)

    return hidden

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention, feed-forward.

  Args:
    d_model: model width; also used as `key_dim` of the attention heads.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout used in the attention and feed-forward blocks.
  """
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block does not silently fall
    # back to its own default when a different rate is requested.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Embeds target token ids and runs them through a stack of decoder layers.

  Args:
    num_layers: number of `DecoderLayer` blocks.
    d_model: model width.
    num_heads: attention heads per layer.
    dff: hidden width of each feed-forward block.
    vocab_size: size of the target vocabulary.
    dropout_rate: dropout applied after the embedding and inside each layer.

  After a call, `last_attn_scores` holds the cross-attention scores of the
  final decoder layer.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                        dff=dff, dropout_rate=dropout_rate)
    self.dec_layers = [DecoderLayer(**layer_kwargs) for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` holds token ids of shape (batch, target_seq_len); the result has
    # shape (batch_size, target_seq_len, d_model).
    hidden = self.dropout(self.pos_embedding(x))

    for decoder_layer in self.dec_layers:
      hidden = decoder_layer(hidden, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    return hidden

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model that maps source and target ids to target logits.

  Args:
    num_layers: number of layers in both the encoder and the decoder.
    d_model: model width.
    num_heads: attention heads per layer.
    dff: hidden width of the feed-forward blocks.
    input_vocab_size: vocabulary size given to the encoder.
    target_vocab_size: vocabulary size given to the decoder and output layer.
    dropout_rate: dropout rate passed to encoder and decoder.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    # Projects decoder outputs to one logit per target-vocabulary entry.
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    # `inputs` is a pair: (source ids, decoder input ids).
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      # No mask attached to the logits; nothing to remove.
      pass

    # Return the logits only; attention scores are cached on the decoder.
    return logits

In [None]:
# Model hyperparameters passed to `Transformer` below.
num_layers = 4       # layers in the encoder and in the decoder
d_model = 128        # embedding / model width
dff = 512            # hidden width of the feed-forward blocks
num_heads = 8        # attention heads per layer
dropout_rate = 0.1

In [None]:
# +1 because Tokenizer ids start at 1; id 0 is left for padding.
en_vocab_size = len(en_tokenizer.word_index) + 1
ge_vocab_size = len(ge_tokenizer.word_index) + 1

# English is the source (encoder) side, German the target (decoder) side.
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
    input_vocab_size=en_vocab_size, target_vocab_size=ge_vocab_size,
    dropout_rate=dropout_rate)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with a linear warm-up followed by rsqrt decay.

  lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)

  Args:
    d_model: model width used to scale the rate.
    warmup_steps: number of steps over which the rate rises linearly.
  """
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the value as passed for get_config(); __call__ uses the float tensor.
    self._d_model_value = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialised."""
    return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}

In [None]:
# Adam optimizer whose learning rate follows `CustomSchedule` for this d_model.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
def masked_loss(label, pred):
  """Mean sparse categorical cross-entropy over non-padding positions.

  Args:
    label: integer target ids; id 0 marks padding and is ignored.
    pred: logits for each position.

  Returns:
    Sum of the per-position losses at non-zero labels divided by the number
    of non-zero labels.
  """
  per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')(label, pred)

  keep = tf.cast(label != 0, dtype=per_token_loss.dtype)

  return tf.reduce_sum(per_token_loss * keep)/tf.reduce_sum(keep)


def masked_accuracy(label, pred):
  """Fraction of non-padding positions whose arg-max prediction is correct.

  Args:
    label: integer target ids; id 0 marks padding and is ignored.
    pred: logits with the vocabulary on axis 2.

  Returns:
    Number of correct non-padding predictions divided by the number of
    non-padding positions.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  label = tf.cast(label, predicted_ids.dtype)

  not_padding = label != 0
  correct = (label == predicted_ids) & not_padding

  n_correct = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  n_tokens = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return n_correct/n_tokens

In [None]:
# Train with the padding-aware loss and report the padding-aware accuracy.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
# Checkpoint written whenever the training loss improves.
# `transformer` is a subclassed Keras model, which cannot be saved as a whole
# model in HDF5 format, so only the weights are written.
filepath="NMT_ENG_GER-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath, monitor='loss', verbose=1, save_best_only=True, mode='min',
    save_weights_only=True)
callbacks_list = [checkpoint]

In [None]:
# Train for one epoch and evaluate on the validation batches after it.
# NOTE(review): no `callbacks=` argument is passed here, so the
# `callbacks_list` built in the previous cell is not used during training.
transformer.fit(dataset_batches, epochs=1, validation_data=dataset_batches_val)

In [None]:
class Translator(tf.Module):
  """Greedy English-to-German translation with the trained transformer.

  Decoding runs eagerly, because the Keras tokenizers are plain Python
  objects. The sentence should be preprocessed the same way as the training
  data; the `bos`/`eos` markers are added here.

  Args:
    en_tokenizer: fitted tokenizer for the English (source) side.
    ge_tokenizer: fitted tokenizer for the German (target) side.
    transformer: the trained `Transformer` model.
  """
  def __init__(self, en_tokenizer, ge_tokenizer, transformer):
    self.en_tokenizer = en_tokenizer
    self.ge_tokenizer = ge_tokenizer
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    """Translate one sentence.

    Args:
      sentence: English sentence as `str`, `bytes` or scalar string tensor.
      max_length: maximum number of tokens to generate.

    Returns:
      `(text, tokens, attention_weights)`: `text` is a scalar string tensor
      with the translation (markers removed), `tokens` is a string tensor
      with one entry per generated id including the markers, and
      `attention_weights` are the cross-attention scores cached by the
      decoder on the last decoding step.
    """
    if isinstance(sentence, tf.Tensor):
      sentence = sentence.numpy()
    if isinstance(sentence, bytes):
      sentence = sentence.decode("utf-8")

    # `texts_to_sequences` expects a list of texts; a bare string would be
    # tokenised character by character. Wrap it in the same markers that the
    # training sentences carry.
    source_ids = self.en_tokenizer.texts_to_sequences([f"bos {sentence} eos"])
    encoder_input = tf.constant(source_ids, dtype=tf.int64)

    # Decoding starts from the target-side `bos` id and stops at `eos`.
    start_id = self.ge_tokenizer.word_index['bos']
    end_id = self.ge_tokenizer.word_index['eos']

    output_ids = [start_id]
    for _ in range(int(max_length)):
      decoder_input = tf.constant([output_ids], dtype=tf.int64)
      predictions = self.transformer([encoder_input, decoder_input], training=False)

      # Greedy choice: most likely id at the last position of the sequence.
      predicted_id = int(tf.argmax(predictions[0, -1, :], axis=-1).numpy())
      output_ids.append(predicted_id)

      if predicted_id == end_id:
        break

    # Map ids back to words; ids without a vocabulary entry (e.g. padding id
    # 0) become '' in `tokens` and are skipped in `text`.
    index_word = self.ge_tokenizer.index_word
    tokens = tf.constant([index_word.get(token_id, '') for token_id in output_ids])
    content_ids = [token_id for token_id in output_ids
                   if token_id not in (start_id, end_id)]
    text = tf.constant(self.ge_tokenizer.sequences_to_texts([content_ids])[0])

    # The last forward pass of the loop already cached the cross-attention
    # scores on the decoder.
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [None]:
# Bundle the two tokenizers and the trained model for inference.
translator = Translator(en_tokenizer, ge_tokenizer, transformer)

In [None]:
def print_translation(sentence, tokens, ground_truth):
  """Print the input, the predicted translation and the reference, one per line.

  Args:
    sentence: the source sentence.
    tokens: object whose `.numpy()` returns UTF-8 encoded bytes of the
      prediction.
    ground_truth: the reference translation.
  """
  prediction = tokens.numpy().decode("utf-8")
  rows = (
      f'{"Input:":15s}: {sentence}',
      f'{"Prediction":15s}: {prediction}',
      f'{"Ground truth":15s}: {ground_truth}',
  )
  for row in rows:
    print(row)

In [None]:
# Example pair; both strings are already free of stop words and punctuation.
sentence = "As result creation saints becoming important way retaining faithful"
ground_truth = 'Infolgedessen entwickelt Ernennung neuer Heiliger zunehmend wichtigen Methode Bindung Gläubigen'

# The translator result is unpacked into three values: text, tokens, attention weights.
translated_text, translated_tokens, attention_weights = translator(sentence)
print_translation(sentence, translated_text, ground_truth)