<a href="https://colab.research.google.com/github/nomomon/NLP-course-project/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os 
import re 

from tqdm import tqdm

import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


# Load the dataset

In [3]:
en_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en"
de_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de"

en_file = tf.keras.utils.get_file("train.en", en_url)
de_file = tf.keras.utils.get_file("train.de", de_url)

# en_file = '/Users/mansurnurmukhambetov/.keras/datasets/train.en'
# de_file = '/Users/mansurnurmukhambetov/.keras/datasets/train.de'

In [4]:
file = open(de_file, 'r')
len(file.readlines())

4468840

In [5]:
file = open(en_file, 'r')
len(file.readlines())

4468840

## Preprocessing

In [6]:
def make_dataset(file_path):
    # read file
    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # create DataFrame
    dataset = pd.DataFrame({"sentence": lines})

    # remove non-alphanumeric, punctuation and german ulmauts
    dataset = dataset.applymap(
        lambda string: re.sub("\s+", " ", 
                              re.sub(r"[^A-Za-z0-9äöüÄÖÜß]", " ", string)
        )
    ) # (),!?\'\`

    # convert text to lowercase
    dataset =  dataset.applymap(lambda string: string.lower())

    return dataset

en_dataset = make_dataset(en_file)
de_dataset = make_dataset(de_file)

In [7]:
# visualize sentence lengths

def visualize_sentence_lengths(dataset):
    lengths = dataset.sentence.str.split().str.len()

    plt.title("Sentence Lengths")
    plt.hist(lengths, bins=50)
    plt.show()
    return lengths

# en_lengths = visualize_sentence_lengths(en_dataset)
# de_lengths = visualize_sentence_lengths(de_dataset);

In [8]:
en_length = 150
de_length = 150

## Tokenization

In [63]:
class Tokenizer():
    def __init__(self, num_words, filters, oov_token):
        self.num_words = num_words
        self.filters = filters
        self.oov_token = oov_token
        self.vocab = {}

    # def split(self, text):
    #     return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

    def fit_on_texts(self, corpus):
        x = np.char.split(corpus)
        x = np.concatenate(x).ravel()
        u, i = np.unique(x, return_index=True)
        i += 1
        u = u[:self.num_words + 1]
        i = i[:self.num_words + 1]

        self.vocab = dict(zip(u, i))
        self.vocab[self.oov_token] = 0

    def t2i(self, token):
        if token in self.vocab:
            return self.vocab[token]
        else: 
            return 0

    def texts_to_sequences(self, text):
        x = np.char.split(text)
        x = np.concatenate(x).ravel()
        x = np.array([self.t2i(token) for token in x])
        return x

In [64]:
# function to build a tokenizer
def build_tokenizer(dataset, vocab_size=None):
    tokenizer = Tokenizer(num_words=vocab_size, filters="", oov_token="<unk>")
    tokenizer.fit_on_texts(dataset.sentence.values.flatten())

    return tokenizer

In [66]:
en_dataset.sentence.values.flatten()

array(['iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges corners of the steel ingot mould ',
       'iron cement protects the ingot against the hot abrasive steel casting process ',
       'a fire restant repair cement for fire places ovens open fireplaces etc ',
       ...,
       'in a region that reveres the elderly zuma s attachment to his rural traditions must be matched by an equal openness to the appetites of the country s youth ',
       'three in ten south africans are younger than 15 meaning that they did not live a day under apartheid ',
       'somehow zuma must find a way to honor his own generation s commitment to racial justice and national liberation while empowering the masses who daily suffer the sting of class differences and yearn for material gain '],
      dtype=object)

In [67]:
# prepare english tokenizer
eng_tokenizer = build_tokenizer(en_dataset, 1000)
eng_vocab_size = eng_tokenizer.num_words + 1

print('English Vocabulary Size: %d' % eng_vocab_size)

TypeError: ignored

In [49]:
eng_tokenizer.texts_to_sequences(np.array(["mama hoi hoi"]))

array([5, 0, 0])

In [43]:
# prepare german tokenizer
ger_tokenizer = build_tokenizer(de_dataset, 1000)
ger_vocab_size = ger_tokenizer.num_words + 1

print('German Vocabulary Size: %d' % ger_vocab_size)

German Vocabulary Size: 1001


In [37]:
# split data into train and test set
(
    en_dataset_train, 
    en_dataset_test, 
    de_dataset_train, 
    de_dataset_test
 ) = train_test_split(en_dataset, de_dataset, test_size=0.2, random_state = 12)

In [44]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    seq = lines.applymap(lambda string: tokenizer.texts_to_sequences(string))
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [45]:
# prepare training data
trainX = encode_sequences(eng_tokenizer, en_length, en_dataset_train)
trainY = encode_sequences(ger_tokenizer, de_length, de_dataset_train)

# prepare validation data
testX = encode_sequences(eng_tokenizer, en_length, en_dataset_test)
testY = encode_sequences(ger_tokenizer, de_length, de_dataset_test)

TypeError: ignored

## Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers

In [None]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    # model = Sequential()
    # # encoder
    # model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    # model.add(LSTM(units, return_sequences=True))
    # # decoder
    # model.add(LSTM(units))
    # model.add(Dense(units, activation='relu'))
    # model.add(Dense(out_vocab, activation='softmax'))

    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))

    opt = optimizers.Adam(lr=0.001)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')

    return model

model = define_model(eng_vocab_size, ger_vocab_size, en_length, de_length, 10)

In [None]:
model.summary()

In [None]:
# train model
history = model.fit(trainX, trainY[:, :, None],
                    epochs=1, batch_size=32, validation_split=0.2, verbose=1)

ValueError: ignored