In [1]:
!pip install tensorflow
!pip install fasttext



In [2]:
import pandas as pd
import numpy as np
import fasttext
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import SimpleRNN, Bidirectional, Dense, Embedding, Input, Dropout, dot, Activation, concatenate
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

In [3]:
# Step 1: Load the FastText model
model_fasttext = fasttext.load_model('model_fasttext.bin')

# Step 2: Load the preprocessed data
data = pd.read_csv('cleaned_infopankki-fa.csv')
print(data.head())

                                             English  \
0  all texts that have been published on the info...   
1                                    you are free to   
2  share copy and redistribute the material in an...   
3  adapt remix transform and build upon the mater...   
4                          under the following terms   

                                             Persian      Source  
0  تمام مطالب و متونی که به هر زبانی در صفحات این...  infopankki  
1                                    شما اجازه دارید  infopankki  
2  به اشتراک بگذارید از مطالب نسخه برداری کنید و ...  infopankki  
3  تغییر دهید مطالب را ترکیب کنید و تغییر دهید و ...  infopankki  
4                                        با شروط زیر  infopankki  


## Simple RNN

In [4]:
# Step 3: Tokenize the English and Persian sentences
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

# Tokenizer for English
english_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
english_tokenizer.fit_on_texts(data['English'])
english_sequences = english_tokenizer.texts_to_sequences(data['English'])

# Tokenizer for Persian
persian_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
persian_tokenizer.fit_on_texts(data['Persian'])
persian_sequences = persian_tokenizer.texts_to_sequences(data['Persian'])

# Padding sequences to ensure uniform input size
english_padded = pad_sequences(english_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
persian_padded = pad_sequences(persian_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Step 4: Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp, idx_train, idx_temp = train_test_split(
    english_padded, persian_padded, data.index, test_size=0.2, random_state=42)

X_val, X_test, y_val, y_test, idx_val, idx_test = train_test_split(
    X_temp, y_temp, idx_temp, test_size=0.5, random_state=42)

# Step 5: Create the embedding matrix using FastText embeddings
embedding_dim = 100  # FastText embedding dimension
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, embedding_dim))

for word, i in english_tokenizer.word_index.items():
    if i < MAX_VOCAB_SIZE:
        try:
            embedding_vector = model_fasttext.get_word_vector(word)
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.zeros(embedding_dim)

In [5]:
# Step 6: Build the Encoder-Decoder RNN model with Attention
units = 256  # Number of units in SimpleRNN

# Encoder
encoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
encoder_embedding = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim,
                              weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder_rnn = SimpleRNN(units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state = encoder_rnn(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
decoder_embedding = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim)(decoder_inputs)
decoder_rnn = SimpleRNN(units, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_state)

# Attention mechanism
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax')(attention)
context = dot([attention, encoder_outputs], axes=[2, 1])
decoder_combined_context = concatenate([context, decoder_outputs])

# Dense layer for predicting next token
output_dense = Dense(MAX_VOCAB_SIZE, activation='softmax')(decoder_combined_context)

# Define the model
model = Model([encoder_inputs, decoder_inputs], output_dense)

# Step 7: Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model architecture
model.summary()

# Step 8: Prepare the target data
# Shift the Persian sequences by one time step for the decoder's output.
decoder_target_data = np.zeros_like(y_train)
decoder_target_data[:, :-1] = y_train[:, 1:]  # Shift the target data to predict the next word
decoder_target_data[:, -1] = 0  # Padding the last timestep with zeros to match the sequence length

# Ensure target shape is correct (100 timesteps)
print(f"decoder_target_data shape: {decoder_target_data.shape}")

# Step 9: Train the model
history = model.fit([X_train, y_train], np.expand_dims(decoder_target_data, -1),
                    epochs=10, batch_size=64, validation_data=([X_val, y_val], np.expand_dims(y_val, -1)))


decoder_target_data shape: (11772, 100)
Epoch 1/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 136ms/step - accuracy: 0.8256 - loss: 2.3984 - val_accuracy: 0.8539 - val_loss: 0.9963
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 87ms/step - accuracy: 0.8648 - loss: 0.9178 - val_accuracy: 0.8580 - val_loss: 0.9743
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 93ms/step - accuracy: 0.8662 - loss: 0.8857 - val_accuracy: 0.8540 - val_loss: 0.9843
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 88ms/step - accuracy: 0.8686 - loss: 0.8526 - val_accuracy: 0.8539 - val_loss: 1.0212
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 90ms/step - accuracy: 0.8752 - loss: 0.8058 - val_accuracy: 0.8544 - val_loss: 1.0421
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 90ms/step - accuracy: 0.8800 - loss: 0.7602 - val_accuracy: 0

In [6]:
# Step 10: Evaluate the model
loss, accuracy = model.evaluate([X_test, y_test], np.expand_dims(y_test, -1))
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Step 11: Predict on test data in batches (to avoid OOM)
batch_size = 8  # Set a smaller batch size to prevent memory overload
num_samples = X_test.shape[0]
predicted_indices = []

# Predict in batches to avoid OOM
for start in range(0, num_samples, batch_size):
    end = min(start + batch_size, num_samples)
    batch_predictions = model.predict([X_test[start:end], y_test[start:end]], batch_size=batch_size)
    batch_predicted_indices = np.argmax(batch_predictions, axis=-1)
    predicted_indices.extend(batch_predicted_indices)

predicted_indices = np.array(predicted_indices)

# Step 12: Function to decode sequences back to text
def decode_sequence(sequence, tokenizer):
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    decoded_sentence = ' '.join([reverse_word_map.get(idx, '') for idx in sequence if idx != 0])
    return decoded_sentence

# Step 13: Evaluate using BLEU score
def evaluate_bleu_score(y_true, y_pred):
    references = [[sentence] for sentence in y_true]
    hypotheses = [sentence for sentence in y_pred]
    return corpus_bleu(references, hypotheses)

# Decode predictions and ground truth
y_pred = [decode_sequence(seq, persian_tokenizer) for seq in predicted_indices]
y_true = [decode_sequence(seq, persian_tokenizer) for seq in y_test]

bleu_score = evaluate_bleu_score(y_true, y_pred)
print(f"BLEU Score: {bleu_score}")

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.8563 - loss: 1.1333
Test Accuracy: 85.42%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1

In [7]:
# Step 14: Save predictions
output_data = []
for i in range(len(X_test)):
    original_english = data['English'].iloc[idx_test[i]]
    predicted_persian = decode_sequence(predicted_indices[i], persian_tokenizer)
    actual_persian = decode_sequence(y_test[i], persian_tokenizer)

    output_data.append({
        'Original English': original_english,
        'Predicted Persian': predicted_persian,
        'Actual Persian': actual_persian
    })

In [8]:
# Step 15: Save predictions to CSV
output_df = pd.DataFrame(output_data)
output_df.to_csv('translation_predictions_simple_rnn.csv', index=False)

## Bidirectional RNN

In [10]:
# Step 3: Tokenize the English and Persian sentences
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

# Tokenizer for English
english_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
english_tokenizer.fit_on_texts(data['English'])
english_sequences = english_tokenizer.texts_to_sequences(data['English'])

# Tokenizer for Persian
persian_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
persian_tokenizer.fit_on_texts(data['Persian'])
persian_sequences = persian_tokenizer.texts_to_sequences(data['Persian'])

# Padding sequences to ensure uniform input size
english_padded = pad_sequences(english_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
persian_padded = pad_sequences(persian_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Step 4: Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp, idx_train, idx_temp = train_test_split(
    english_padded, persian_padded, data.index, test_size=0.2, random_state=42)

X_val, X_test, y_val, y_test, idx_val, idx_test = train_test_split(
    X_temp, y_temp, idx_temp, test_size=0.5, random_state=42)

# Step 5: Create the embedding matrix using FastText embeddings
embedding_dim = 100  # FastText embedding dimension
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, embedding_dim))

for word, i in english_tokenizer.word_index.items():
    if i < MAX_VOCAB_SIZE:
        try:
            embedding_vector = model_fasttext.get_word_vector(word)
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.zeros(embedding_dim)

In [11]:

# Step 6: Build the Encoder-Decoder RNN model with Bidirectional and Attention

units = 512  # Increased number of units for more complex representation

# Encoder
encoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
encoder_embedding = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim,
                              weights=[embedding_matrix], trainable=False)(encoder_inputs)

# Bidirectional RNN
encoder_rnn = Bidirectional(SimpleRNN(units, return_sequences=True, return_state=True, dropout=0.3))
encoder_outputs, forward_h, backward_h = encoder_rnn(encoder_embedding)
encoder_state = concatenate([forward_h, backward_h])  # Combine forward and backward states

# Decoder
decoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
decoder_embedding = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim)(decoder_inputs)
decoder_rnn = SimpleRNN(units * 2, return_sequences=True, return_state=True, dropout=0.3)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_state)

# Attention mechanism
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax')(attention)
context = dot([attention, encoder_outputs], axes=[2, 1])
decoder_combined_context = concatenate([context, decoder_outputs])

# Dense layer for predicting next token
output_dense = Dense(MAX_VOCAB_SIZE, activation='softmax')(decoder_combined_context)

# Define the model
model = Model([encoder_inputs, decoder_inputs], output_dense)

# Step 7: Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model architecture
model.summary()

In [12]:

# Step 8: Prepare the target data
# Shift the Persian sequences by one time step for the decoder's output.
decoder_target_data = np.zeros_like(y_train)
decoder_target_data[:, :-1] = y_train[:, 1:]
decoder_target_data[:, -1] = 0  # Padding the last timestep with zeros

# Step 9: Train the model with a smaller batch size to avoid OOM errors
history = model.fit([X_train, y_train], np.expand_dims(decoder_target_data, -1),
                    epochs=20, batch_size=32, validation_data=([X_val, y_val], np.expand_dims(y_val, -1)))

# Step 10: Evaluate the model
loss, accuracy = model.evaluate([X_test, y_test], np.expand_dims(y_test, -1))
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 215ms/step - accuracy: 0.8195 - loss: 2.0893 - val_accuracy: 0.8563 - val_loss: 1.0114
Epoch 2/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 175ms/step - accuracy: 0.8652 - loss: 0.9335 - val_accuracy: 0.8544 - val_loss: 0.9957
Epoch 3/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 175ms/step - accuracy: 0.8690 - loss: 0.8564 - val_accuracy: 0.8543 - val_loss: 1.0228
Epoch 4/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 173ms/step - accuracy: 0.8742 - loss: 0.7955 - val_accuracy: 0.8548 - val_loss: 1.0389
Epoch 5/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 175ms/step - accuracy: 0.8771 - loss: 0.7601 - val_accuracy: 0.8542 - val_loss: 1.0745
Epoch 6/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 175ms/step - accuracy: 0.8814 - loss: 0.7084 - val_accuracy: 0.8547 - val_loss: 1.1013
Epoch 7/2

In [13]:

# Step 11: Predict on test data in batches (to avoid OOM)
batch_size = 8
num_samples = X_test.shape[0]
predicted_indices = []

for start in range(0, num_samples, batch_size):
    end = min(start + batch_size, num_samples)
    batch_predictions = model.predict([X_test[start:end], y_test[start:end]], batch_size=batch_size)
    batch_predicted_indices = np.argmax(batch_predictions, axis=-1)
    predicted_indices.extend(batch_predicted_indices)

predicted_indices = np.array(predicted_indices)

# Step 12: Function to decode sequences back to text
def decode_sequence(sequence, tokenizer):
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    decoded_sentence = ' '.join([reverse_word_map.get(idx, '') for idx in sequence if idx != 0])
    return decoded_sentence

# Step 13: Evaluate using BLEU score
def evaluate_bleu_score(y_true, y_pred):
    references = [[sentence] for sentence in y_true]
    hypotheses = [sentence for sentence in y_pred]
    return corpus_bleu(references, hypotheses)

# Decode predictions and ground truth
y_pred = [decode_sequence(seq, persian_tokenizer) for seq in predicted_indices]
y_true = [decode_sequence(seq, persian_tokenizer) for seq in y_test]

bleu_score = evaluate_bleu_score(y_true, y_pred)
print(f"BLEU Score: {bleu_score}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms

In [14]:

# Step 14: Save first 100 predictions
output_data = []
for i in range(len(X_test)):
    original_english = data['English'].iloc[idx_test[i]]
    predicted_persian = decode_sequence(predicted_indices[i], persian_tokenizer)
    actual_persian = decode_sequence(y_test[i], persian_tokenizer)

    output_data.append({
        'Original English': original_english,
        'Predicted Persian': predicted_persian,
        'Actual Persian': actual_persian
    })

# Step 15: Save predictions to CSV
output_df = pd.DataFrame(output_data)
output_df.to_csv('translation_predictions_bidirectional_rnn.csv', index=False)