In [1]:
# Step 1: English -> French word table consulted by the word-for-word translator
dictionary = dict([
    ('hello', 'bonjour'),
    ('world', 'monde'),
    ('my', 'mon'),
    ('name', 'nom'),
    ('is', 'est'),
])

# Step 2: Grammar rules placeholder (defined here, not used yet)
grammar_rules = dict(SVO=['subject', 'verb', 'object'])

# Step 3: Translation function
def translate(sentence, lookup=None):
    """Translate `sentence` word by word using a lookup table.

    The sentence is lower-cased and split on whitespace. Each token is looked
    up as-is first; if that fails, leading/trailing ASCII punctuation is set
    aside, the remaining core is looked up, and the punctuation is re-attached
    (so "hello," becomes "bonjour,"). Tokens with no match are kept unchanged.

    NOTE: a later cell defines another `translate(text)`; once that cell has
    run, it replaces this function in the notebook namespace.

    Args:
        sentence: Text to translate.
        lookup: Optional mapping of source word -> translated word. Defaults
            to the module-level `dictionary`.

    Returns:
        The translated tokens joined by single spaces ('' for empty input).
    """
    import string  # local import: this cell runs before the notebook's import cell

    table = dictionary if lookup is None else lookup
    punct = string.punctuation

    def _translate_token(token):
        # An exact match always wins, which keeps the original behaviour for
        # every token the table already knows verbatim.
        if token in table:
            return table[token]
        left_trimmed = token.lstrip(punct)
        prefix = token[:len(token) - len(left_trimmed)]
        core = left_trimmed.rstrip(punct)
        suffix = left_trimmed[len(core):]
        # `core` is empty for punctuation-only tokens; leave those untouched.
        if core and core in table:
            return prefix + table[core] + suffix
        return token

    return ' '.join(_translate_token(token) for token in sentence.lower().split())

# Example usage: smoke-test the word-for-word translator on a two-word sentence
sentence = "Hello world"
print(translate(sentence))  # Expected: "bonjour monde"


bonjour monde


In [11]:
import warnings

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [12]:
# Load data
def load_sentences(file, add_tokens=False):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        file: Path of the text file to read line by line.
        add_tokens: When true, every line is wrapped as
            '<start> ' + line + ' <end>'.

    Returns:
        A list with one string per line of the file, in file order
        (blank lines are kept as empty strings).
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        sentences = list(map(str.strip, handle))
    if not add_tokens:
        return sentences
    return ['<start> ' + sentence + ' <end>' for sentence in sentences]

# Dataset directory; both corpus files live here. Change DATA_DIR to run
# the notebook outside this environment.
DATA_DIR = '/kaggle/input/english-to-french'
en_sent = load_sentences(DATA_DIR + '/small_vocab_en.csv')
fr_sent = load_sentences(DATA_DIR + '/small_vocab_fr.csv', add_tokens=True)

# The two lists are later split and paired row by row, so they must be
# line-aligned. Fail here with a clear message rather than deep inside the
# train/validation split.
assert len(en_sent) == len(fr_sent), (
    f'parallel corpus is misaligned: {len(en_sent)} English lines '
    f'vs {len(fr_sent)} French lines'
)

In [13]:
def tokenize(texts):
    """Fit a new Keras `Tokenizer` on `texts` and encode them as padded id rows.

    The tokenizer is built with filters='' and lower=False, i.e. it is asked
    not to strip any characters and not to lower-case the input.

    Args:
        texts: Sequence of strings to fit on and encode.

    Returns:
        A `(tokenizer, padded)` pair, where `padded` is the result of
        `pad_sequences(..., padding='post')` on the encoded texts.
    """
    tokenizer = Tokenizer(lower=False, filters='')
    tokenizer.fit_on_texts(texts)
    padded = pad_sequences(tokenizer.texts_to_sequences(texts), padding='post')
    return tokenizer, padded

# One tokenizer + padded id matrix per language (English first, then French)
(en_tok, en_seq), (fr_tok, fr_seq) = (tokenize(corpus) for corpus in (en_sent, fr_sent))

In [14]:
# Decoder input/output: the input drops the last time step and the target
# drops the first, so target[t] is the token that follows input[t].
dec_in, dec_out = fr_seq[:, :-1], fr_seq[:, 1:]

In [15]:
# Hold out 20% of the rows for validation. The encoder input, decoder input
# and decoder target arrays are passed to a single call so they are split
# together; random_state=42 pins the shuffle.
X_tr, X_val, y_tr_in, y_val_in, y_tr_out, y_val_out = train_test_split(
    en_seq, dec_in, dec_out, test_size=0.2, random_state=42
)

In [16]:
# Model: LSTM encoder-decoder with an Attention layer over the encoder outputs
# Seed Python, NumPy and TensorFlow before any layer is created so the weight
# initialisation is repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

embed_dim, latent_dim = 256, 512

# Encoder: embed the English ids and keep both the per-step outputs (for
# attention) and the final LSTM state (to initialise the decoder).
enc_in = Input(shape=(en_seq.shape[1],))
enc_emb = Embedding(len(en_tok.word_index)+1, embed_dim)(enc_in)
enc_out, state_h, state_c = LSTM(latent_dim, return_sequences=True, return_state=True)(enc_emb)

# Decoder: embed the shifted French ids and start from the encoder's final state.
dec_inp = Input(shape=(dec_in.shape[1],))
dec_emb = Embedding(len(fr_tok.word_index)+1, embed_dim)(dec_inp)
# Deliberately NOT named `dec_out`: that global holds the decoder target array
# used by the train/validation split, and rebinding it to a Keras tensor here
# would break that cell on any re-run.
dec_lstm_out, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=[state_h, state_c])

# Attention with the decoder outputs as query and the encoder outputs as value;
# the attended context is concatenated with the decoder output and projected
# onto the French vocabulary (+1 for the reserved id 0).
attn = Attention()([dec_lstm_out, enc_out])
dec_concat = Concatenate()([dec_lstm_out, attn])
dec_dense = Dense(len(fr_tok.word_index)+1, activation='softmax')
outputs = dec_dense(dec_concat)

In [17]:
# Two-input model: (encoder ids, decoder input ids) -> per-step softmax over
# the French vocabulary. Targets are integer ids, hence the sparse loss.
model = Model([enc_in, dec_inp], outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): no mask or sample_weight is supplied and the sequences are
# post-padded with id 0, so padded time steps appear to count towards the
# reported loss/accuracy - confirm before reading the accuracy as token-level
# translation quality.

# Train (increase epochs for better results)
# Keep the History object so the curves can be inspected later and the cell
# does not end by echoing the object's repr.
history = model.fit([X_tr, y_tr_in], y_tr_out, validation_data=([X_val, y_val_in], y_val_out),
                    batch_size=32, epochs=15)

Epoch 1/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 15ms/step - accuracy: 0.7973 - loss: 0.7533 - val_accuracy: 0.9827 - val_loss: 0.0486
Epoch 2/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.9871 - loss: 0.0386 - val_accuracy: 0.9924 - val_loss: 0.0236
Epoch 3/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.9936 - loss: 0.0198 - val_accuracy: 0.9948 - val_loss: 0.0161
Epoch 4/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.9956 - loss: 0.0134 - val_accuracy: 0.9956 - val_loss: 0.0140
Epoch 5/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.9967 - loss: 0.0101 - val_accuracy: 0.9965 - val_loss: 0.0115
Epoch 6/15
[1m3447/3447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.9974 - loss: 0.0081 - val_accuracy: 0.9968 - val_loss: 0.0109
Epoc

<keras.src.callbacks.history.History at 0x7fd1942eca10>

In [18]:
def translate(text):
    """Greedily decode a French translation of `text` with the trained model.

    The English text is encoded with `en_tok` and padded to the encoder
    length. Decoding starts from '<start>' and, at each step, re-runs the full
    model on the tokens produced so far and appends the arg-max token. It
    stops at '<end>', at a predicted padding id (0), or when the decoder
    length is exhausted.

    Args:
        text: English sentence, space-separated, in the same form as the
            training sentences.

    Returns:
        The decoded French tokens joined by single spaces (without the
        '<start>'/'<end>' markers).

    Warns:
        UserWarning: if `text` contains words that are not in the English
            vocabulary.
    """
    # The tokenizer was built without an oov_token, so unknown words are
    # expected to be skipped by texts_to_sequences; surface that instead of
    # silently translating a different sentence.
    unknown = [word for word in text.split() if word not in en_tok.word_index]
    if unknown:
        warnings.warn(
            'words not in the English vocabulary are ignored: ' + ', '.join(unknown),
            stacklevel=2,
        )

    seq = pad_sequences(en_tok.texts_to_sequences([text]), maxlen=en_seq.shape[1], padding='post')

    # Loop invariants hoisted out of the decoding loop.
    start_id = fr_tok.word_index['<start>']
    end_id = fr_tok.word_index['<end>']
    max_steps = dec_in.shape[1]  # number of decoder positions the model has

    out_seq = [start_id]
    # Iterate over decoder positions only. The original looped one step
    # further (fr_seq.shape[1]), which indexed past the last decoder position
    # whenever '<end>' was never predicted.
    for _ in range(max_steps):
        dec_seq = pad_sequences([out_seq], maxlen=max_steps, padding='post')
        pred = model.predict([seq, dec_seq], verbose=0)[0, len(out_seq)-1]
        next_word = int(np.argmax(pred))
        # Id 0 is the padding id and has no entry in index_word; treat it as
        # end-of-sentence rather than failing on the lookup below.
        if next_word == end_id or next_word == 0:
            break
        out_seq.append(next_word)
    return ' '.join(fr_tok.index_word[i] for i in out_seq[1:])


In [19]:
# Demo of the neural translator.
# NOTE(review): the recorded output for this call ("le requin est mon animal
# préféré .") is not a translation of the input. Presumably some of these
# words are missing from the training vocabulary and get dropped by the
# tokenizer - verify against en_tok.word_index.
print(translate("the weather is cold today"))

le requin est mon animal préféré .
