# Machine Learning Language Translation from English to Persian

## Import Dependencies

In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense
import numpy as np
from loguru import logger

## Initialize parameters

In [7]:
# Tunable settings for the whole notebook, gathered in one place.
# NOTE(review): BATCH_SIZE, EPOCHS and LATENT_DIM are not consumed in this
# part of the notebook; presumably they feed model building/training later.
BATCH_SIZE = 64
EPOCHS = 100
# latent dimensionality of the encoding space
LATENT_DIM = 256
# upper bound on how many lines of the data file are used as samples
NUM_SAMPLES = 2600
# location of the tab-separated parallel corpus
DATA_PATH = './pes.txt'

## Preprocessing

In [8]:
# Vectorize the data: collect up to NUM_SAMPLES (input, target) sentence pairs
# from DATA_PATH together with the set of characters seen on each side.
#
# Each usable line is tab-separated: the first field is the input sentence and
# the second is the target sentence. Any further fields (the original code
# discarded a third one) are ignored.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(DATA_PATH, 'r', encoding='utf-8') as file:
  lines = file.read().split('\n')
for line in lines:
  # Stop once enough samples were collected; counting accepted samples rather
  # than slicing `lines` means skipped rows do not reduce the sample count.
  if len(input_texts) >= NUM_SAMPLES:
    break
  fields = line.split('\t')
  if len(fields) < 2:
    # Blank line (e.g. the empty string left after a trailing newline) or a
    # row without a target sentence: skip it instead of raising ValueError.
    continue
  input_text, target_text = fields[0], fields[1]
  # Use "tab" as the "start of sequence" character for the targets,
  # and "\n" as the "end of sequence" character.
  target_text = '\t' + target_text + "\n"
  input_texts.append(input_text)
  target_texts.append(target_text)
  # set.update adds every character of the string; duplicates are ignored.
  input_characters.update(input_text)
  target_characters.update(target_text)
    

In [12]:
# Turn the character sets into sorted vocabularies and measure the longest sequences
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))
# Report the dataset statistics (target lengths include the start/end markers)
logger.info(f"Number of samples: {len(input_texts)}")
logger.info(f"Number of unique input tokens: {num_encoder_tokens}")
logger.info(f"Number of unique target tokens: {num_decoder_tokens}")
logger.info(f"Max sequence length for input: {max_encoder_seq_length}")
logger.info(f"Max sequence length for target: {max_decoder_seq_length}")

[32m2024-04-06 02:38:33.552[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNumber of samples: 2600[0m
[32m2024-04-06 02:38:33.554[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mNumber of unique input tokens: 67[0m
[32m2024-04-06 02:38:33.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mNumber of unique target tokens: 75[0m
[32m2024-04-06 02:38:33.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMax sequence length for input: 42[0m
[32m2024-04-06 02:38:33.556[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMax sequence length for target: 65[0m
