In [4]:
import pandas as pd
from functools import reduce
import re
import string
import nltk
from nltk.corpus import stopwords
import zipfile

from sklearn.utils import shuffle
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score


In [5]:
path = '/content/rus.txt'
df= pd.read_csv(path, sep='\t', names =['eng', 'rus', 'meta'])[['eng', 'rus']]
df

Unnamed: 0,eng,rus
0,Go.,Марш!
1,Go.,Иди.
2,Go.,Идите.
3,Hi.,Здравствуйте.
4,Hi.,Привет!
...,...,...
117523,My mother has been dead these ten years.,"Вот уже десять лет, как моя мать умерла."
117524,My mother made a beautiful dress for me.,Мама сшила мне красивое платье.
117525,My mother often bakes apple pies for us.,Моя мать часто печёт для нас яблочные пироги.
117526,My mother put a large vase on the shelf.,Мама поставила на полку большую вазу.


In [6]:
# nltk.download('stopwords')
# eng_stop_words = set(stopwords.words('english'))
# rus_stop_words = set(stopwords.words('russian'))

In [7]:
from nltk.corpus.reader import xmldocs
def remove_punctuation(text_str):
    result = ""
    for c in text_str:
        # If char is not punctuation, add it to the result.
        if c not in string.punctuation and c!='':
            result += c
    return result

def remove_stopwrds(list_str, stop_words):
    filtered_list = []
    for w in list_str:
        # If a word is not in stopwords, add it to the result.
        if w not in stop_words:
            filtered_list.append(w)
    return ' '.join([str(elem) for elem in filtered_list])


def process_text(df, col):
  df[col] = df[col].apply(lambda x: x.lower())
  df[col] = df[col].apply(lambda x: re.sub("'", '', x)) 
  df[col] = df[col].apply(remove_punctuation)
  # df[col] = df[col].apply(lambda x: remove_stopwrds(x.split(' '), stop_words))
  df[col] = df[col].apply(lambda x: x.strip())
  df[col] = df[col].apply(lambda x: re.sub(" +", " ", x))
  df[col] = df[col].apply(lambda x: ''.join(i for i in x if not i.isdigit()))
  if col == 'rus':
    df[col] = df[col].apply(lambda x : '\t '+ x + ' \n')
  df[col] = df[col].apply(lambda x : x.split(' '))
  df[col] = df[col].apply(lambda x : list(filter(None, x)))

  return df

df = process_text(df, 'eng')
df = process_text(df, 'rus')
df

Unnamed: 0,eng,rus
0,[go],"[\t, марш, \n]"
1,[go],"[\t, иди, \n]"
2,[go],"[\t, идите, \n]"
3,[hi],"[\t, здравствуйте, \n]"
4,[hi],"[\t, привет, \n]"
...,...,...
117523,"[my, mother, has, been, dead, these, ten, years]","[\t, вот, уже, десять, лет, как, моя, мать, ум..."
117524,"[my, mother, made, a, beautiful, dress, for, me]","[\t, мама, сшила, мне, красивое, платье, \n]"
117525,"[my, mother, often, bakes, apple, pies, for, us]","[\t, моя, мать, часто, печёт, для, нас, яблочн..."
117526,"[my, mother, put, a, large, vase, on, the, shelf]","[\t, мама, поставила, на, полку, большую, вазу..."


In [8]:
rus_w_set = set(df.explode('rus').rus)
eng_w_set = set(df.explode('eng').eng)
eng_words= sorted(list(eng_w_set))
rus_words=sorted(list(rus_w_set))
num_encoder_tokens = len(eng_words)
num_decoder_tokens = len(rus_words) #improved, was len(rus_words)+1

print(f'eng bag size:  {num_encoder_tokens}')
print(f'rus bag size:  {num_decoder_tokens}')

eng bag size:  7912
rus bag size:  22845


In [9]:
def get_max_lenght(df, col):
  return df[col].apply(len).max()
  
max_eng_len = get_max_lenght(df, 'eng')
print(" Max length of the eng sentence  ", max_eng_len)
max_rus_len = get_max_lenght(df, 'rus')
print(" Max length of the rus sentence  ", max_rus_len)

 Max length of the eng sentence   9
 Max length of the rus sentence   13


In [11]:
source_word2idx= dict([(word, i) for i, word in enumerate(eng_words)])
target_word2idx=dict([(word, i) for i, word in enumerate(rus_words)])
source_idx2word= dict([(i, word) for word, i in  source_word2idx.items()])
target_idx2word =dict([(i, word) for word, i in target_word2idx.items()])

print(source_idx2word)



In [12]:
df.eng = df.eng.apply(lambda x: ' '.join([str(elem) for elem in x]))
df.rus = df.rus.apply(lambda x: ' '.join([str(elem) for elem in x]))

X, y = df.eng, df.rus
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size = 0.15)
print( X_train.shape, X_test.shape)

(99898,) (17630,)


In [15]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' 
    Generate a batch of data 
    One Hot Encoding implementation
    '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_eng_len),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_rus_len),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_rus_len, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                  encoder_input_data[i, t] = source_word2idx.get(word, 0) #improved, was source_word2idx[word]
                for t, word in enumerate(target_text.split()):
                  if t<len(target_text.split())-1:
                      decoder_input_data[i, t] = target_word2idx.get(word, 0) #improved, was source_word2idx[word]
                  if t>0:
                      decoder_target_data[i, t - 1, target_word2idx[word]] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [16]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 80
epochs = 15
latent_dim=256

In [26]:
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm1 = LSTM(latent_dim, return_state=False, return_sequences = True)
encoder_lstm2 = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm2(encoder_lstm1(enc_emb))
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    2025472     ['input_7[0][0]']                
                                                                                                  
 input_8 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 lstm_5 (LSTM)                  (None, None, 256)    525312      ['embedding_4[0][0]']            
                                                                                            

In [None]:
model.fit_generator(
    generator = generate_batch(X_train, y_train, batch_size = batch_size),
    steps_per_epoch = train_samples//batch_size,
    epochs = epochs,
    # validation_split=0.2,
    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
    validation_steps = val_samples//batch_size
    )

  import sys


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
 285/1248 [=====>........................] - ETA: 3:41 - loss: 0.6242 - acc: 0.5870

In [None]:
model.save_weights('/my_model1.h5')

In [20]:
model.load_weights('/my_model1.h5')

In [21]:
# Encode the input sequence to get the "Context vectors"
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [22]:
def decode_sequence(input_seq):
  # Encode the input as state vectors.
  states_value = encoder_model.predict(input_seq)
  # Generate empty target sequence of length 1.
  target_seq = np.zeros((1,1))
  # Populate the first character of 
  #target sequence with the start character.
  target_seq[0, 0] = target_word2idx['\t']
# Sampling loop for a batch of sequences
  # (to simplify, here we assume a batch of size 1).
  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
      output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
# Sample a token
      sampled_token_index = np.argmax(output_tokens[0, -1, :])
      sampled_word =target_idx2word[sampled_token_index]
      decoded_sentence += ' '+ sampled_word
# Exit condition: either hit max length
      # or find stop character.
      if (sampled_word == '\n' or
          len(decoded_sentence) > 50):
          stop_condition = True
# Update the target sequence (of length 1).
      target_seq = np.zeros((1,1))
      target_seq[0, 0] = sampled_token_index
# Update states
      states_value = [h, c]
  return decoded_sentence

In [23]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [24]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_train[k:k+1].values[0])
print('Actual Target Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

Input Source sentence: i dont need anybody
Actual Target Translation: никто не нуж
Predicted Target Translation:  не надо мне нужна была нужна так том люблю есть


In [25]:
test_gen = generate_batch(X_test, y_test, batch_size = 1)
k=10
k+=1
(input_seq, actual_output), _ = next(test_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_test[k:k+1].values[0])
print('Actual Target Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

Input Source sentence: i owe tom my life
Actual Target Translation: язан тому жизн
Predicted Target Translation:  не могу есть тома внимания с томом в это видел в
