## **ML Translation Model**

# **Import Libraries**

In [None]:
import numpy as np
from numpy import array
import pandas as pd
import nltk #imports nlp tool kit , provides tools for tokenization and more
import re #imports regular exp modules
import tensorflow as tf  #buliding and training models
from keras.preprocessing.text import Tokenizer #tokenizing (splitting texts into)
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential #it allows you to create model layer-by-layer
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout #loads the layers
from sklearn.model_selection import train_test_split #used for train ,test,split data
from keras.optimizers import Adam #loads the optimizer
from keras.preprocessing import sequence
from keras.callbacks import Callback #for bleu metrics
from keras.utils import to_categorical
# Fetch NLTK's Punkt tokenizer models (skipped by NLTK when the package is already up to date).
# NOTE(review): no cell visible in this notebook calls an NLTK tokenizer — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **Data Collecting and Cleansing**

In [None]:
# Load the tab-separated English/Arabic sentence pairs; the two column names
# are supplied explicitly through `names`.
df = pd.read_csv(
    "ara_eng.txt",
    sep="\t",
    names=["english", "arabic"],
)

In [None]:
# Work on an independent copy of the rows labelled 0 through 12000 (all columns).
# `.loc` slicing includes the end label, so this keeps 12001 rows.
ds = df.loc[0:12000].copy()
ds

Unnamed: 0,english,arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!
...,...,...
11996,according to the united nations educational sc...,وفقا لمنظمة الامم المتحدة للتربية والعلم والثق...
11997,he was passionate about work and convinced tha...,كان متحمسا لعمله ومقتنع بان العمارة قبل ان تكو...
11998,is the saudi government monitoring women globa...,السعودية اتراقب الحكومة النساء؟ الاصوات العالمية
11999,is the saudi government monitoring women blogg...,هل تراقب الحكومة السعودية النساء؟ يوضح لنا الم...


# **Lowercasing**

In [None]:
def preprocess(text):
  """Return ``text`` with every word lowercased.

  Each run of word characters found by the regular expression is replaced by
  its lowercase form; whatever lies between the matches (punctuation,
  whitespace) is copied through unchanged.
  """
  def _to_lower(found):
    # found.group(0) is the complete matched word.
    return found.group(0).lower()

  return re.sub(r'\b\w+\b', _to_lower, text)
# Apply the lowercasing function to the English column, write the result back
# into `ds`, then display the column.
ds.loc[:, 'english'] = ds.loc[:, 'english'].apply(preprocess)
ds['english']

0                                                      hi.
1                                                     run!
2                                                    help!
3                                                    jump!
4                                                    stop!
                               ...                        
11996    according to the united nations educational sc...
11997    he was passionate about work and convinced tha...
11998    is the saudi government monitoring women globa...
11999    is the saudi government monitoring women blogg...
12000    mexican women are murdered each day global voi...
Name: english, Length: 12001, dtype: object

## **Tokenization And Indexing**

In [None]:
# Helper that builds a word-level tokenizer for a collection of texts; it is
# used once for the English column and once for the Arabic column.
def tokenization(dataset):
  """Create a Keras ``Tokenizer``, fit it on ``dataset`` and return it."""
  fitted = Tokenizer()
  fitted.fit_on_texts(dataset)  # learns the vocabulary from the given texts
  return fitted

# NOTE(review): iterating a DataFrame yields its column labels, so this
# tokenizer only ever sees the strings "english" and "arabic". No later cell
# visible in this notebook reads it; kept only so the name stays defined.
tokenizer = tokenization(ds)

# --- English vocabulary ---
tokenizer_eng = tokenization(ds['english'])
VOCAB_SIZE1 = len(tokenizer_eng.word_index) + 1  # +1 because index 0 is never assigned to a word
eng_length = 8  # fixed length every English sequence is padded/truncated to

# word -> index. Copy the tokenizer's own mapping (indices start at 1) so this
# is the exact inverse of index_to_word_english; enumerating the keys from 0
# would shift every index by one.
word_index = dict(tokenizer_eng.word_index)
# index -> word
index_to_word_english = {index: word for word, index in tokenizer_eng.word_index.items()}

# --- Arabic vocabulary ---
tokenizer_ara = tokenization(ds['arabic'])
VOCAB_SIZE2 = len(tokenizer_ara.word_index) + 1
ara_length = 8  # fixed length every Arabic sequence is padded/truncated to

# word -> index and index -> word, both using the tokenizer's 1-based indices.
arabic_classes = dict(tokenizer_ara.word_index)
index_to_word_arabic = {index: word for word, index in tokenizer_ara.word_index.items()}

print(index_to_word_arabic)
print(index_to_word_english)

print("\nEnglish vocabulary size:",VOCAB_SIZE1)
print("\nArabic vocabulary size:",VOCAB_SIZE2)

{1: 'من', 2: 'في', 3: 'أن', 4: 'على', 5: 'لا', 6: 'توم', 7: 'ما', 8: 'هل', 9: 'هذا', 10: 'أنا', 11: 'إلى', 12: 'عن', 13: 'لم', 14: 'ذلك', 15: 'هذه', 16: 'كان', 17: 'هو', 18: 'ان', 19: 'الاصوات', 20: 'العالمية', 21: 'و', 22: 'هنا', 23: 'كل', 24: 'هناك', 25: 'الى', 26: 'ليس', 27: 'الذي', 28: 'لقد', 29: 'لي', 30: 'بعد', 31: 'مع', 32: 'إنه', 33: 'عليك', 34: 'أين', 35: 'ماذا', 36: 'أنت', 37: 'كنت', 38: 'كيف', 39: 'هي', 40: 'أريد', 41: 'إنها', 42: 'التي', 43: 'أي', 44: 'لن', 45: 'فضلك', 46: 'كانت', 47: 'جداً', 48: 'قبل', 49: 'ماري', 50: 'لك', 51: 'يجب', 52: 'شيء', 53: 'اليوم', 54: 'الكثير', 55: 'الانترنت', 56: 'الآن', 57: 'إن', 58: 'يوم', 59: 'لدي', 60: 'متى', 61: 'أحب', 62: 'تلك', 63: 'نحن', 64: 'أحد', 65: 'أنه', 66: 'سوريا', 67: 'او', 68: 'فقط', 69: 'كم', 70: 'أعرف', 71: 'لماذا', 72: 'قد', 73: 'الكتاب', 74: 'يمكنك', 75: 'الوقت', 76: 'حتى', 77: 'بعض', 78: 'عندما', 79: 'الذهاب', 80: 'إذا', 81: 'علي', 82: 'المنزل', 83: 'العالم', 84: 'غير', 85: 'أبي', 86: 'عام', 87: 'يبدو', 88: 'تريد', 89: 'تم

# **Encoding and Pad Sequence**

In [None]:
# Turns text into the fixed-length integer sequences the model is trained on.
def encode_sequence(tokenizer, length, ds):
  """Encode texts as word-index sequences of a fixed length.

  Args:
    tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
    length: desired sequence length, passed to ``pad_sequences`` as ``maxlen``.
    ds: iterable of texts to encode.

  Returns:
    The result of ``pad_sequences`` with ``padding='post'``, i.e. padding is
    appended after the word indices.
  """
  # Each word is replaced by its index, then every sequence is brought to `length`.
  return pad_sequences(tokenizer.texts_to_sequences(ds), maxlen=length, padding='post')

# **Splitting the Data**

In [None]:
# Hold out 20% of the sentence pairs as the test set. A fixed random_state
# makes the shuffle, and therefore the split, reproducible between runs.
main_data, test_data, main_label, test_label = train_test_split(
    ds['english'],
    ds['arabic'],
    test_size=0.2,
    random_state=12,
)
# Split the remaining pairs again (80% train / 20% validation) so overfitting
# can be monitored on data the model is not trained on.
train_data, val_data, train_label, val_label = train_test_split(
    main_data,
    main_label,
    test_size=0.2,
    random_state=12,
)

# **Preparing the data**

In [None]:
# Encode every split as padded index sequences.
# English sentences are the model inputs (X) for train / validation / test.
trainX, valX, testX = (
    encode_sequence(tokenizer_eng, eng_length, sentences)
    for sentences in (train_data, val_data, test_data)
)
# Arabic sentences are the targets (Y) for train / validation / test.
trainY, valY, testY = (
    encode_sequence(tokenizer_ara, ara_length, sentences)
    for sentences in (train_label, val_label, test_label)
)

# **Model Building**

In [None]:
# Per-position translator: every input time step is mapped to a probability
# distribution over the Arabic vocabulary. Because the output has one step per
# input step, this only lines up with the targets while eng_length == ara_length.
model = Sequential()  # layers are stacked in the order they are added

# Embedding: turns each English word index into a trainable 100-dimensional vector.
# input_dim is the English vocabulary size; input_length is the padded
# sentence length (second dimension of trainX).
model.add(Embedding(input_dim=VOCAB_SIZE1, output_dim=100, input_length=trainX.shape[1]))

# Bidirectional LSTM: reads the sentence in both directions.
# units=64 is the size of each direction's hidden state (not a layer count).
# return_sequences=True keeps one output vector per time step.
# dropout / recurrent_dropout drop a fraction of the input / recurrent
# connections during training to reduce overfitting.
model.add(Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one unit per Arabic vocabulary entry. 'softmax' is required so
# each time step yields a probability distribution, which is what
# sparse_categorical_crossentropy and argmax decoding expect. The previous
# value 'Swish' is not a recognised activation name and raised a ValueError.
# glorot_uniform is the weight initialiser.
model.add(Dense(units=VOCAB_SIZE2, activation='softmax', use_bias=True, kernel_initializer='glorot_uniform'))

model.summary()  # print the layer-by-layer architecture and parameter counts



ValueError: Unknown activation function 'Swish' cannot be deserialized.

# **Model Compilation and Training**

In [None]:
class BLEUMetric(Callback):
    """Print the average sentence-level BLEU score on the validation set after each epoch.

    Args:
        validation_data: ``(inputs, targets)`` pair; ``inputs`` are fed to
            ``model.predict`` and ``targets`` are the reference Arabic index
            sequences.
        index_to_word_english: index -> English word mapping. Stored for
            callers, but not used for scoring: both the reference and the
            prediction are Arabic sequences.
        index_to_word_arabic: index -> Arabic word mapping used to decode both
            the reference and the predicted sequences.
    """

    # Index skipped when decoding; the prediction cell in this notebook also
    # filters out index 0 as padding.
    PAD_INDEX = 0

    def __init__(self, validation_data, index_to_word_english, index_to_word_arabic):
        super().__init__()
        self.validation_data = validation_data
        self.index_to_word_english = index_to_word_english
        self.index_to_word_arabic = index_to_word_arabic

    def _decode(self, indices):
        """Convert Arabic word indices to words, skipping padding positions."""
        return [
            self.index_to_word_arabic.get(int(idx), 'UNKNOWN')
            for idx in indices
            if idx != self.PAD_INDEX
        ]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the mean BLEU score."""
        inputs, y_true = self.validation_data[0], self.validation_data[1]
        y_pred = self.model.predict(inputs)
        # Most probable vocabulary index at every time step of every sentence.
        predicted_indices = np.argmax(y_pred, axis=-1)

        bleu_scores = []
        for true_seq, pred_seq in zip(y_true, predicted_indices):
            # The targets are Arabic, so both sides are decoded with the Arabic
            # vocabulary (decoding the reference with the English vocabulary
            # made the comparison meaningless).
            reference_words = self._decode(true_seq)
            prediction_words = self._decode(pred_seq)

            if not reference_words or not prediction_words:
                # Nothing to compare once padding is removed: score it as 0.
                bleu_scores.append(0.0)
                continue

            bleu = nltk.translate.bleu_score.sentence_bleu([reference_words], prediction_words)
            bleu_scores.append(bleu)

        # Guard against an empty validation set, where np.mean would return nan.
        avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0
        print(f'Epoch {epoch + 1}, Average BLEU Score on Validation Set: {avg_bleu}')

# Validation pair shared by the BLEU callback and by Keras' own validation pass.
validation_data = (valX, valY)
bleu_metric = BLEUMetric(
    validation_data,
    index_to_word_english,
    index_to_word_arabic,
)

# Configure training: loss function, optimizer, and accuracy as the reported metric.
model.compile(
    optimizer='adamax',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model; `history` records loss/accuracy on the training and
# validation data per epoch, and the callback prints the BLEU score.
history = model.fit(
    x=trainX,
    y=trainY,
    batch_size=32,
    epochs=5,
    validation_data=validation_data,
    callbacks=[bleu_metric],
)

# **Evaluation**

In [None]:
# Measure loss ("score") and accuracy on the held-out test split.
score, acc = model.evaluate(x=testX, y=testY, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

# **Prediction**

In [None]:
# Take the first five English sentences as a quick qualitative check.
# NOTE(review): these come from the full dataset, so some of them were
# probably part of the training split.
first_english_sentences = ds['english'].head(5)

# Tokenize and pad the selected sentences exactly like the training inputs.
encoded_first_english = encode_sequence(tokenizer_eng, eng_length, first_english_sentences)

# One probability distribution over the Arabic vocabulary per time step.
predicted_arabic_sequences = model.predict(encoded_first_english)

# Most probable Arabic word index at every position of every sentence.
predicted_indices = np.argmax(predicted_arabic_sequences, axis=-1)

# Convert the predicted indices into readable Arabic sentences and print them.
for sentence_number, (english_sentence, token_ids) in enumerate(
        zip(first_english_sentences, predicted_indices), start=1):
    translated_sentence = ' '.join(
        # Fall back to a placeholder so an unmapped index cannot break join().
        index_to_word_arabic.get(int(token_id), 'UNKNOWN')
        for token_id in token_ids
        if token_id != 0  # index 0 is padding
    )
    print(f"English Sentence {sentence_number}: {english_sentence}")
    print(f"Arabic Translation {sentence_number}: {translated_sentence}\n")

# **Transformer**


In [None]:
# This pipeline is configured for translation from English to Arabic.
# The pre-trained model used for the translation is "marefa-nlp/marefa-mt-en-ar".
from transformers import pipeline
import sentencepiece  # used for text tokenization


pipe = pipeline(task="translation_en_to_ar", model="marefa-nlp/marefa-mt-en-ar")

# Translate a sample sentence: pipe() translates the input text and the
# translated output is stored in `output`.
text = "how are you?"
output = pipe(text)

print(output)