# Machine Learning Language Translation from English to Persian

## Import Dependencies

In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense
import numpy as np
from loguru import logger

## Initialize parameters

In [7]:
# Training hyper-parameters and dataset location (tune these here only).
BATCH_SIZE = 64          # batch size handed to model.fit
EPOCHS = 100             # number of training epochs handed to model.fit
LATENT_DIM = 256         # latent dimensionality of the encoding space
NUM_SAMPLES = 2600       # upper bound on the number of samples to train on
DATA_PATH = './pes.txt'  # tab-separated sentence-pair file read during preprocessing

## Preprocessing

In [8]:
# Vectorize the data: collect up to NUM_SAMPLES (source, target) sentence pairs
# plus the set of distinct characters seen on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(DATA_PATH, 'r', encoding='utf-8') as file:
  lines = file.read().split('\n')
for line_number, line in enumerate(lines, start=1):
  if len(input_texts) >= NUM_SAMPLES:
    break
  # Blank lines (including the empty string after a trailing newline) carry
  # no sample; skip them instead of failing on the tuple unpack.
  if not line.strip():
    continue
  # Each line is "source<TAB>target[<TAB>extra columns]"; only the first two
  # columns are used, so any additional columns are ignored.
  fields = line.split('\t')
  if len(fields) < 2:
    logger.warning(f"Skipping malformed line {line_number}: expected at least 2 tab-separated fields")
    continue
  input_text, target_text = fields[0], fields[1]
  # We use "tab" as the "start of sequence" character for the targets
  # and "\n" as the "end of sequence" character.
  target_text = '\t' + target_text + "\n"
  input_texts.append(input_text)
  target_texts.append(target_text)
  # set.update adds every character of the string; duplicates are ignored.
  input_characters.update(input_text)
  target_characters.update(target_text)
    

In [12]:
# Derive vocabulary sizes and maximum sequence lengths from the collected samples.
# Sorting turns the character collections into ordered lists so that every
# character gets a stable position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# The longest sentence on each side fixes the time dimension of the one-hot arrays.
max_encoder_seq_length = max(len(sentence) for sentence in input_texts)
max_decoder_seq_length = max(len(sentence) for sentence in target_texts)
logger.info(f"Number of samples: {len(input_texts)}")
logger.info(f"Number of unique input tokens: {num_encoder_tokens}")
logger.info(f"Number of unique target tokens: {num_decoder_tokens}")
logger.info(f"Max sequence length for input: {max_encoder_seq_length}")
logger.info(f"Max sequence length for target: {max_decoder_seq_length}")

[32m2024-04-06 02:38:33.552[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNumber of samples: 2600[0m
[32m2024-04-06 02:38:33.554[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mNumber of unique input tokens: 67[0m
[32m2024-04-06 02:38:33.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mNumber of unique target tokens: 75[0m
[32m2024-04-06 02:38:33.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMax sequence length for input: 42[0m
[32m2024-04-06 02:38:33.556[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMax sequence length for target: 65[0m


In [14]:
# Map every character to its position in the sorted vocabulary; that position
# is the character's index along the last axis of the one-hot arrays.
input_token_index = {
  char: position for position, char in enumerate(input_characters)
}
target_token_index = {
  char: position for position, char in enumerate(target_characters)
}

### One-hot representation

In [15]:
# Zero-initialised one-hot tensors, laid out as (sample, timestep, token).
# They are filled in by the loop in the next cell.
encoder_input_data = np.zeros(
  shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
  dtype=np.float32,
)
decoder_input_data = np.zeros(
  shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype=np.float32,
)
decoder_target_data = np.zeros(
  shape=(len(target_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype=np.float32,
)

In [18]:
# Fill the one-hot tensors. Positions past the end of a sentence are padded
# with the one-hot vector of the space character.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i, t, input_token_index[char]] = 1.
  # Pad from the explicit sentence length rather than the leaked loop variable
  # `t`, which is undefined or stale when `input_text` is empty.
  encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1.
  for t, char in enumerate(target_text):
    # decoder_target_data is ahead of decoder_input_data by one timestep
    decoder_input_data[i, t, target_token_index[char]] = 1.
    if t > 0:
      # decoder_target_data will be ahead by one timestep
      # and will not include the start character
      decoder_target_data[i, t - 1, target_token_index[char]] = 1.
  decoder_input_data[i, len(target_text):, target_token_index[' ']] = 1.
  # The target sequence is one step shorter (no start character), so its
  # padding begins one position earlier; clamp at 0 for an empty target.
  decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[' ']] = 1.


## Model Architecture

In [32]:
class lstm_model:
  """Encoder-decoder (sequence-to-sequence) LSTM model wrapper.

  The build steps depend on each other and must be called in this order:
  ``encoder()`` -> ``decoder()`` -> ``model()`` -> ``training()``.
  Each step stores its results as attributes that the next step reads.
  """

  def __init__(self,
               num_encoder_tokens, num_decoder_tokens,
               latent_dim, encoder_input_data,
               decoder_input_data, decoder_target_data,
               batch_size, epochs
               ) -> None:
    """Store the vocabulary sizes, hyper-parameters and training arrays.

    Args:
      num_encoder_tokens: size of the last axis of the encoder input.
      num_decoder_tokens: size of the last axis of the decoder input/output.
      latent_dim: number of units of both LSTM layers.
      encoder_input_data: encoder training input passed to ``fit``.
      decoder_input_data: decoder training input passed to ``fit``.
      decoder_target_data: training target passed to ``fit``.
      batch_size: batch size passed to ``fit``.
      epochs: number of epochs passed to ``fit``.
    """
    # Architecture parameters
    self.num_encoder_tokens = num_encoder_tokens
    self.num_decoder_tokens = num_decoder_tokens
    self.latent_dim = latent_dim
    # Training data
    self.encoder_input_data = encoder_input_data
    self.decoder_input_data = decoder_input_data
    self.decoder_target_data = decoder_target_data
    # Training schedule
    self.batch_size = batch_size
    self.epochs = epochs

  def encoder(self):
    """Create the encoder input and LSTM, keeping only the final states.

    Sets ``self.encoder_inputs`` and ``self.encoder_states``.
    """
    self.encoder_inputs = Input(shape=(None, self.num_encoder_tokens))
    encoder_lstm = LSTM(self.latent_dim, return_state=True)
    # The LSTM output itself is discarded; only the hidden and cell states
    # are handed on to the decoder.
    _, final_h, final_c = encoder_lstm(self.encoder_inputs)
    self.encoder_states = [final_h, final_c]

  def decoder(self):
    """Create the decoder LSTM and softmax layer on top of the encoder states.

    Requires ``encoder()`` to have run (reads ``self.encoder_states``).
    Sets ``self.decoder_inputs`` and ``self.decoder_outputs``.
    """
    self.decoder_inputs = Input(shape=(None, self.num_decoder_tokens))
    decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)
    # The decoder starts from the encoder's final states; its own returned
    # states are not needed here.
    hidden_sequence, _, _ = decoder_lstm(
      self.decoder_inputs, initial_state=self.encoder_states
    )
    softmax_layer = Dense(self.num_decoder_tokens, activation="softmax")
    self.decoder_outputs = softmax_layer(hidden_sequence)

  def model(self):
    """Assemble and compile the training model.

    Requires ``encoder()`` and ``decoder()`` to have run.

    NOTE: the compiled Keras model is stored in ``self.model``, which replaces
    this method on the instance. After the first call, ``obj.model`` is the
    Keras model, so this method can be called only once per instance.
    """
    seq2seq = Model(
      [self.encoder_inputs, self.decoder_inputs],
      self.decoder_outputs,
    )
    seq2seq.compile(
      optimizer='rmsprop',
      loss='categorical_crossentropy',
      metrics=['accuracy'],
    )
    self.model = seq2seq

  def training(self):
    """Fit the compiled model on the stored arrays and return ``fit``'s result.

    Requires ``model()`` to have run. A ``validation_split`` of 0.2 is passed
    to ``fit``.
    """
    features = [self.encoder_input_data, self.decoder_input_data]
    return self.model.fit(
      features,
      self.decoder_target_data,
      batch_size=self.batch_size,
      epochs=self.epochs,
      validation_split=0.2,
    )
    
    
# Bundle the vocabulary sizes, hyper-parameters and one-hot arrays into the
# model wrapper; nothing is built until its build methods are called.
translation_model_obj = lstm_model(
  num_encoder_tokens=num_encoder_tokens,
  num_decoder_tokens=num_decoder_tokens,
  latent_dim=LATENT_DIM,
  encoder_input_data=encoder_input_data,
  decoder_input_data=decoder_input_data,
  decoder_target_data=decoder_target_data,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
)

  

In [33]:
# Build steps must run in this order: decoder() reads the states set by
# encoder(), and model() reads the inputs/outputs set by both.
translation_model_obj.encoder()
translation_model_obj.decoder()
# After this call `translation_model_obj.model` is the compiled Keras model,
# no longer the method, so re-running this cell needs a fresh wrapper object.
translation_model_obj.model()
# training() returns the result of the Keras `fit` call.
history = translation_model_obj.training()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78