<a href="https://colab.research.google.com/github/meekmarcelin/language-translation/blob/main/language-translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, TimeDistributed, Bidirectional, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Data collection and preprocessing
# Toy parallel corpus: entry i of one list is paired with entry i of the other.
english_sentences = ["hello", "how are you", "good morning"]
yoruba_sentences = ["bawo ni", "bawo ni o se wa", "e kaaro"]


def _fit_and_pad(sentences):
    """Fit a fresh Tokenizer on `sentences` and encode them as padded id rows.

    Returns a `(tokenizer, padded_sequences, longest_length)` tuple, where
    every row of `padded_sequences` is right-padded (padding='post') up to
    the length of the longest encoded sentence.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    encoded = tokenizer.texts_to_sequences(sentences)
    longest = max(map(len, encoded))
    padded = pad_sequences(encoded, maxlen=longest, padding='post')
    return tokenizer, padded, longest


# Tokenization and padding (one independent vocabulary per language)
eng_tokenizer, eng_sequences, max_eng_len = _fit_and_pad(english_sentences)
yor_tokenizer, yor_sequences, max_yor_len = _fit_and_pad(yoruba_sentences)

# Splitting the data: 20% of the sentence pairs are held out for validation.
eng_train, eng_val, yor_train, yor_val = train_test_split(
    eng_sequences,
    yor_sequences,
    test_size=0.2,
)

# Model development
embedding_dim = 256
units = 512

# Vocabulary sizes: one slot per known word plus one extra for index 0,
# which is the value the sequences above were padded with.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
yor_vocab_size = len(yor_tokenizer.word_index) + 1

# Encoder: embed the English token ids and read them with a bidirectional
# GRU. Only the final forward/backward states are used further down.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)
enc_emb = encoder_embedding(encoder_inputs)
encoder_gru = Bidirectional(GRU(units, return_state=True))
encoder_output, forward_h, backward_h = encoder_gru(enc_emb)

# Concatenate the forward and backward states into one vector of width
# 2 * units; it becomes the decoder's initial state.
state_h = Concatenate()([forward_h, backward_h])

# Decoder: a unidirectional GRU sized units * 2 so that its state matches
# the concatenated encoder state, followed by a per-timestep softmax over
# the Yoruba vocabulary.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=yor_vocab_size, output_dim=embedding_dim)
dec_emb = decoder_embedding(decoder_inputs)
dec_gru = GRU(units * 2, return_sequences=True, return_state=True)
decoder_output, _ = dec_gru(dec_emb, initial_state=state_h)
decoder_dense = TimeDistributed(Dense(yor_vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_output)

# Model: (English ids, Yoruba decoder-input ids) -> per-step word probabilities.
# The sparse loss takes integer word ids as targets (no one-hot encoding).
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Training
# Teacher forcing: the decoder is fed each target sentence without its last
# position and must predict the same sentence shifted left by one position.
# The targets get a trailing axis of size 1.
decoder_input_train = yor_train[:, :-1]
decoder_target_train = np.expand_dims(yor_train[:, 1:], axis=-1)
decoder_input_val = yor_val[:, :-1]
decoder_target_val = np.expand_dims(yor_val[:, 1:], axis=-1)

model.fit(
    x=[eng_train, decoder_input_train],
    y=decoder_target_train,
    validation_data=([eng_val, decoder_input_val], decoder_target_val),
    epochs=50,
    batch_size=64,
)

# Save the model
model.save('eng_to_yor_translation_model.h5')

# Evaluation
# Cache for the inference sub-models, remembered together with the training
# model they were derived from so that a rebuilt `model` triggers a rebuild.
_inference_cache = {}


def _get_inference_models():
    """Return `(encoder_model, decoder_model)` built from the trained layers.

    encoder_model maps padded English token ids to the concatenated encoder
    state. decoder_model maps `[target_token_ids, state]` to
    `[word_probabilities, next_state]`, which allows decoding one step at a
    time. Both share their layers (and therefore weights) with `model`.
    """
    if _inference_cache.get('source') is not model:
        encoder_model = Model(encoder_inputs, state_h)

        # Same width as the concatenated forward/backward encoder state.
        decoder_state_input = Input(shape=(units * 2,))
        step_output, step_state = dec_gru(
            dec_emb, initial_state=decoder_state_input
        )
        decoder_model = Model(
            [decoder_inputs, decoder_state_input],
            [decoder_dense(step_output), step_state],
        )

        _inference_cache['source'] = model
        _inference_cache['models'] = (encoder_model, decoder_model)
    return _inference_cache['models']


def evaluate_model(input_sentence, start_word='bawo', end_word='end'):
    """Greedily translate an English sentence and return the Yoruba text.

    Args:
        input_sentence: English sentence to translate. It is encoded with
            `eng_tokenizer` and padded to `max_eng_len`.
        start_word: Yoruba word fed to the decoder as its first input. The
            training sentences carry no dedicated start marker, so an
            ordinary vocabulary word is used as the seed; it is not part of
            the returned text.
        end_word: Word that terminates decoding when it is predicted; it is
            not part of the returned text.

    Returns:
        The predicted words joined by single spaces (possibly an empty
        string). At most `max_yor_len + 1` words are produced.

    Raises:
        KeyError: If `start_word` is not in the Yoruba vocabulary.
    """
    encoder_model, decoder_model = _get_inference_models()

    input_seq = eng_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')
    state = encoder_model.predict(input_seq, verbose=0)

    # The decoder consumes one token per step; start from the seed word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = yor_tokenizer.word_index[start_word]

    decoded_words = []
    while len(decoded_words) <= max_yor_len:
        output_tokens, state = decoder_model.predict(
            [target_seq, state], verbose=0
        )
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is the padding value and has no entry in index_word;
        # predicting it is treated as the end of the sentence.
        sampled_word = yor_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == end_word:
            break
        decoded_words.append(sampled_word)

        # Feed the prediction back in as the next decoder input.
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_words)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(
