In [58]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os 
import re 

import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


# Load the dataset

In [2]:
# en_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en"
# de_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de"

# en_file = tf.keras.utils.get_file("train.en", en_url)
# de_file = tf.keras.utils.get_file("train.de", de_url)

en_file = '/Users/mansurnurmukhambetov/.keras/datasets/train.en'
de_file = '/Users/mansurnurmukhambetov/.keras/datasets/train.de'

In [3]:
file = open(de_file, 'r')
len(file.readlines())

4468840

In [4]:
file = open(en_file, 'r')
len(file.readlines())

4468840

## Preprocessing

In [94]:
def make_dataset(file_path):
    # read file
    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # create DataFrame
    dataset = pd.DataFrame({"sentence": lines})

    # remove non-alphanumeric, punctuation and german ulmauts
    dataset = dataset.applymap(
        lambda string: re.sub("\s+", " ", 
                              re.sub(r"[^A-Za-z0-9äöüÄÖÜß]", " ", string)
        )
    ) # (),!?\'\`

    # convert text to lowercase
    dataset =  dataset.applymap(lambda string: string.lower())

    return dataset

en_dataset = make_dataset(en_file)
de_dataset = make_dataset(de_file)

In [95]:
# visualize sentence lengths

def visualize_sentence_lengths(dataset):
    lengths = dataset.sentence.str.split().str.len()

    plt.title("Sentence Lengths")
    plt.hist(lengths, bins=50)
    plt.show()
    return lengths

# en_lengths = visualize_sentence_lengths(en_dataset)
# de_lengths = visualize_sentence_lengths(de_dataset);

In [96]:
en_length = 150
de_length = 150

## Tokenization

In [46]:
# function to build a tokenizer
def build_tokenizer(dataset, vocab_size=None):
    tokenizer = Tokenizer(num_words=vocab_size, filters="", oov_token="<unk>")
    tokenizer.fit_on_texts(dataset.sentence.values)
    return tokenizer

In [47]:
# prepare english tokenizer
eng_tokenizer = build_tokenizer(en_dataset, 10000)
eng_vocab_size = len(eng_tokenizer.word_index) + 1

print('English Vocabulary Size: %d' % eng_vocab_size)

English Vocabulary Size: 597910


In [52]:
# prepare german tokenizer
ger_tokenizer = build_tokenizer(de_dataset, 10000)
ger_vocab_size = len(ger_tokenizer.word_index) + 1

print('German Vocabulary Size: %d' % ger_vocab_size)

German Vocabulary Size: 1403690


In [60]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [57]:
data = en_dataset.join(de_dataset, lsuffix='_en', rsuffix='_de')

# split data into train and test set
train, test = train_test_split(data, test_size=0.2, random_state = 12)

2212193    for the cooperation with industry partners , a...
788373     parliament debates it in two successive readin...
915096     of particular interest are the performances of...
4018049    this is because , as mr imaz san miguel said ,...
1555806      it stretches over the length of 17 kilometers  
                                 ...                        
4061123    the use of such economic arguments shows once ...
2303235    when kim moke took over as sole artistic direc...
2133634    robert schad  apos s artistic style centres ar...
1461501    best practice is to scale the image offline   ...
3905179    i also think that the european union must quic...
Name: sentence_en, Length: 3575072, dtype: object

In [66]:
# prepare training data
trainX = encode_sequences(eng_tokenizer, en_length, train["sentence_en"])
trainY = encode_sequences(ger_tokenizer, de_length, train["sentence_de"])

# prepare validation data
testX = encode_sequences(eng_tokenizer, en_length, test["sentence_en"])
testY = encode_sequences(ger_tokenizer, de_length, test["sentence_de"])

## Model

In [82]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers

In [105]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    # model = Sequential()
    # # encoder
    # model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    # model.add(LSTM(units, return_sequences=True))
    # # decoder
    # model.add(LSTM(units))
    # model.add(Dense(units, activation='relu'))
    # model.add(Dense(out_vocab, activation='softmax'))

    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))

    opt = optimizers.Adam(lr=0.001)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')

    return model

model = define_model(eng_vocab_size, ger_vocab_size, en_length, de_length, 50)

  super().__init__(name, **kwargs)


In [106]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 150, 50)           29895500  
                                                                 
 lstm_10 (LSTM)              (None, 50)                20200     
                                                                 
 repeat_vector_3 (RepeatVect  (None, 150, 50)          0         
 or)                                                             
                                                                 
 lstm_11 (LSTM)              (None, 150, 50)           20200     
                                                                 
 dense_5 (Dense)             (None, 150, 1403690)      71588190  
                                                                 
Total params: 101,524,090
Trainable params: 101,524,090
Non-trainable params: 0
________________________________________

In [107]:
# train model
history = model.fit(trainX, trainY[:, :, None],
                    epochs=1, batch_size=512, validation_split = 0.2, verbose=1)

2023-05-28 19:46:47.617742: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-05-28 19:46:48.254918: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-05-28 19:46:48.769188: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


ResourceExhaustedError: Graph execution error:

Detected at node 'sequential_6/dense_5/Tensordot/MatMul' defined at (most recent call last):
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/traitlets/config/application.py", line 982, in launch_instance
      app.start()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
      self._run_once()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
      handle._run()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/s9/0x7864ms0sj16xzqw_gzzh_w0000gn/T/ipykernel_17542/122687941.py", line 2, in <module>
      history = model.fit(trainX, trainY[:, :, None],
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/sequential.py", line 410, in call
      return super().call(inputs, training=training, mask=mask)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/layers/core/dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'sequential_6/dense_5/Tensordot/MatMul'
OOM when allocating tensor with shape[76800,1403690] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator Simple allocator
	 [[{{node sequential_6/dense_5/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_11076416]