<a href="https://colab.research.google.com/github/nomomon/NLP-course-project/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os 
import re 

import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


## Load the dataset

In [None]:
# WMT14 English-German training corpus, hosted by the Stanford NMT project.
en_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en"
de_url = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de"

# get_file returns the local path of each download; files land in the Keras
# cache directory (~/.keras/datasets by default).
en_file, de_file = (
    tf.keras.utils.get_file(fname, url)
    for fname, url in (("train.en", en_url), ("train.de", de_url))
)

In [None]:
# Count the lines of the German corpus. The context manager closes the file
# handle, and streaming avoids materialising every line as a Python list.
with open(de_file, "r", encoding="utf-8") as de_handle:
    de_line_count = sum(1 for _ in de_handle)

de_line_count

4468840

In [None]:
# Count the lines of the English corpus. The context manager closes the file
# handle, and streaming avoids materialising every line as a Python list.
with open(en_file, "r", encoding="utf-8") as en_handle:
    en_line_count = sum(1 for _ in en_handle)

en_line_count

4468840

## Preprocessing

In [None]:
def make_dataset(file_path):
    """Load a text corpus into a one-column DataFrame of cleaned sentences.

    Each line of the file becomes one row of the ``sentence`` column. Every
    character other than ASCII letters, digits and the German letters
    ä/ö/ü/Ä/Ö/Ü/ß is replaced by a space, runs of whitespace are collapsed
    into a single space, and the text is lower-cased.

    Parameters
    ----------
    file_path : str or path-like
        Path of a UTF-8 text file holding one sentence per line.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``sentence`` column, one row per line of the file.
    """
    disallowed = re.compile(r"[^A-Za-z0-9äöüÄÖÜß]")
    whitespace = re.compile(r"\s+")

    # Iterate over the file rather than calling str.splitlines(): splitlines()
    # also breaks on characters such as U+2028 or form feed, which would
    # produce more rows than the file has lines and shift rows out of step
    # with a second corpus that is aligned line by line.
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = [
            whitespace.sub(" ", disallowed.sub(" ", line.rstrip("\n"))).lower()
            for line in f
        ]

    return pd.DataFrame({"sentence": sentences})

en_dataset = make_dataset(en_file)
de_dataset = make_dataset(de_file)

# The two corpora are split jointly later on, so row i of one frame has to
# correspond to row i of the other; fail early if the row counts diverge.
assert len(en_dataset) == len(de_dataset), (
    "English and German datasets have different numbers of rows: "
    f"{len(en_dataset)} vs {len(de_dataset)}"
)

In [None]:
# visualize sentence lengths

def visualize_sentence_lengths(dataset, bins=50):
    """Plot a histogram of sentence lengths and return the lengths.

    Length is measured as the number of whitespace-separated tokens in each
    entry of the ``sentence`` column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``sentence`` column of strings.
    bins : int, optional
        Number of histogram bins (default 50).

    Returns
    -------
    pandas.Series
        Token count of every sentence, indexed like ``dataset``.
    """
    lengths = dataset["sentence"].str.split().str.len()

    # Draw on an explicit Axes so the plot does not depend on whichever
    # figure happens to be current, and label the axes so it stands alone.
    _, ax = plt.subplots()
    ax.hist(lengths, bins=bins)
    ax.set_title("Sentence Lengths")
    ax.set_xlabel("Tokens per sentence")
    ax.set_ylabel("Number of sentences")
    plt.show()
    return lengths

# en_lengths = visualize_sentence_lengths(en_dataset)
# de_lengths = visualize_sentence_lengths(de_dataset);

In [None]:
# Fixed sequence length, in tokens, used for both the English and the German
# side when sequences are padded and when the model is defined.
en_length = de_length = 150

## Tokenization

In [None]:
# function to build a tokenizer
def build_tokenizer(dataset, vocab_size=None):
    """Fit a Keras ``Tokenizer`` on the ``sentence`` column of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``sentence`` column of strings.
    vocab_size : int or None, optional
        Passed to the tokenizer as ``num_words``; ``None`` leaves the
        vocabulary uncapped.

    Returns
    -------
    Tokenizer
        The fitted tokenizer, using ``"<unk>"`` as its out-of-vocabulary token
        and an empty ``filters`` string (no characters filtered out).
    """
    sentences = dataset["sentence"].values

    fitted = Tokenizer(oov_token="<unk>", filters="", num_words=vocab_size)
    fitted.fit_on_texts(sentences)
    return fitted

In [None]:
# Fit the English tokenizer with num_words=10000.
eng_tokenizer = build_tokenizer(en_dataset, vocab_size=10000)

# One slot more than num_words -- presumably to leave room for the padding
# index 0; TODO confirm against the embedding size actually needed.
eng_vocab_size = 1 + eng_tokenizer.num_words

print('English Vocabulary Size: %d' % eng_vocab_size)

English Vocabulary Size: 10001


In [None]:
# Fit the German tokenizer with num_words=1000.
ger_tokenizer = build_tokenizer(de_dataset, vocab_size=1000)

# One slot more than num_words -- presumably to leave room for the padding
# index 0; TODO confirm against the output layer size actually needed.
ger_vocab_size = 1 + ger_tokenizer.num_words

print('German Vocabulary Size: %d' % ger_vocab_size)

German Vocabulary Size: 1001


In [None]:
# Hold out 20% of the sentence pairs as a test set. Both frames go through one
# train_test_split call so the English and German rows stay paired; the fixed
# random_state makes the split repeatable.
(en_dataset_train, en_dataset_test,
 de_dataset_train, de_dataset_test) = train_test_split(
    en_dataset,
    de_dataset,
    test_size=0.2,
    random_state=12,
)

In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Convert sentences into fixed-length integer sequences.

    Parameters
    ----------
    tokenizer : Tokenizer
        A fitted Keras tokenizer.
    length : int
        Number of tokens per output row. Shorter sentences are padded with 0
        at the end; longer ones are truncated by ``pad_sequences`` (its
        default ``truncating='pre'`` drops tokens from the front).
    lines : pandas.DataFrame or iterable of str
        Either a frame with a ``sentence`` column or any iterable of
        sentence strings.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(number of sentences, length)``.
    """
    if isinstance(lines, pd.DataFrame):
        sentences = lines["sentence"].tolist()
    else:
        sentences = list(lines)

    # texts_to_sequences expects a collection of texts. Passing one string
    # makes it iterate over the characters, so indexing the result with [0]
    # would keep only the encoding of the first character (and fail on an
    # empty sentence). Encoding the whole batch in one call is both correct
    # and far faster than a Python call per row.
    seq = tokenizer.texts_to_sequences(sentences)

    # pad sequences with 0 values
    return pad_sequences(seq, maxlen=length, padding='post')

In [None]:
# Encode the training split: English sentences are the inputs, German the targets.
trainX, trainY = (
    encode_sequences(eng_tokenizer, en_length, en_dataset_train),
    encode_sequences(ger_tokenizer, de_length, de_dataset_train),
)

# Encode the held-out test split in the same way.
testX, testY = (
    encode_sequences(eng_tokenizer, en_length, en_dataset_test),
    encode_sequences(ger_tokenizer, de_length, de_dataset_test),
)

KeyboardInterrupt: ignored

## Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers

In [None]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units,
                 learning_rate=0.001):
    """Build and compile an LSTM encoder-decoder translation model.

    The encoder embeds the input tokens and reduces the sequence to a single
    vector with an LSTM. That vector is repeated ``out_timesteps`` times and
    decoded by a second LSTM, followed by a softmax over the output
    vocabulary at every timestep.

    Parameters
    ----------
    in_vocab : int
        Size of the input vocabulary (embedding input dimension).
    out_vocab : int
        Size of the output vocabulary (width of the softmax layer).
    in_timesteps : int
        Length of the padded input sequences.
    out_timesteps : int
        Length of the padded output sequences.
    units : int
        Embedding dimension and number of units in both LSTM layers.
    learning_rate : float, optional
        Learning rate of the Adam optimizer (default 0.001).

    Returns
    -------
    Sequential
        The model, compiled with Adam and sparse categorical cross-entropy.
    """
    model = Sequential()
    # encoder: index 0 is treated as padding and masked out
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    # decoder: feed the encoder state to every output timestep
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))

    # `lr` is a deprecated alias that triggers a warning (and is rejected by
    # newer Keras releases); `learning_rate` is the supported argument name.
    opt = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')

    return model

# Build the English -> German model with 10 units per layer.
model = define_model(
    in_vocab=eng_vocab_size,
    out_vocab=ger_vocab_size,
    in_timesteps=en_length,
    out_timesteps=de_length,
    units=10,
)

  super().__init__(name, **kwargs)


In [None]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 10)           100010    
                                                                 
 lstm (LSTM)                 (None, 10)                840       
                                                                 
 repeat_vector (RepeatVector  (None, 150, 10)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 150, 10)           840       
                                                                 
 dense (Dense)               (None, 150, 1001)         11011     
                                                                 
Total params: 112,701
Trainable params: 112,701
Non-trainable params: 0
__________________________________________________

In [None]:
# train model
# The integer targets get a trailing length-1 axis so that they line up with
# the per-timestep softmax output of the final Dense layer.
history = model.fit(
    trainX,
    trainY[:, :, np.newaxis],
    batch_size=32,
    epochs=1,
    validation_split=0.2,
    verbose=1,
)

ValueError: ignored