In [None]:
A. Basic Machine Translation Using Rule-Based Methods


In [1]:
# Step 1: English -> French word lookup table (every key is a lowercase English word)
dictionary = dict([
    ("hello", "bonjour"),
    ("world", "monde"),
    ("my", "mon"),
    ("name", "nom"),
    ("is", "est"),
    ("good", "bon"),
    ("morning", "matin"),
    ("i", "je"),
    ("am", "suis"),
    ("a", "un"),
    ("student", "étudiant"),
    ("teacher", "professeur"),
])

# Step 2: Grammar rules, keyed by pattern name; each value lists the role order
grammar_rules = dict(SVO=["subject", "verb", "object"])  # Subject-Verb-Object structure

# Step 3: Translation Function
def _split_edge_punctuation(token):
    """Split ``token`` into ``(leading, core, trailing)`` around its alphanumeric core.

    ``leading`` and ``trailing`` hold the non-alphanumeric characters at the two
    ends; characters inside the core (e.g. an apostrophe) are left untouched.
    """
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[:start], token[start:end], token[end:]


def translate(sentence, lexicon=None):
    """Translate ``sentence`` word by word using a lookup table.

    The sentence is lowercased and split on whitespace. Each token is looked up
    as-is first; if that misses, punctuation attached to either end is set
    aside, the remaining word is looked up, and the punctuation is re-attached.
    Words with no entry are passed through unchanged (in lowercase).

    Args:
        sentence: Source text.
        lexicon: Optional mapping from lowercase source word to target word.
            Defaults to the module-level ``dictionary``.

    Returns:
        The translated tokens joined by single spaces ("" for empty input).
    """
    if lexicon is None:
        lexicon = dictionary

    translated_words = []
    for token in sentence.lower().split():
        if token in lexicon:
            # Exact hit: same result as a plain dictionary lookup.
            translated_words.append(lexicon[token])
            continue
        # "hello," or "world!" would otherwise never match their entries.
        leading, core, trailing = _split_edge_punctuation(token)
        translated_words.append(leading + lexicon.get(core, core) + trailing)

    return ' '.join(translated_words)

# Example usage
sentence = "Hello world"
translated_sentence = translate(sentence)
print("Translated sentence:", translated_sentence)

# Sample input/output interaction
# Keeps prompting until the user types 'exit' or 'quit' (case-insensitive,
# surrounding whitespace ignored) or the input stream ends.
while True:
    try:
        user_input = input("Enter an English sentence to translate (or type 'exit' to quit): ")
    except EOFError:
        # input() raises EOFError when stdin is closed or exhausted; leave the
        # loop cleanly instead of aborting with a traceback.
        print("Exiting translation system.")
        break

    # strip() so that " exit" or "quit " still ends the session
    if user_input.strip().lower() in ('exit', 'quit'):
        print("Exiting translation system.")
        break

    translated_output = translate(user_input)
    print("Translated sentence:", translated_output)


Translated sentence: bonjour monde


Enter an English sentence to translate (or type 'exit' to quit):  hello


Translated sentence: bonjour


Enter an English sentence to translate (or type 'exit' to quit):  am


Translated sentence: suis


Enter an English sentence to translate (or type 'exit' to quit):  quit


Exiting translation system.


In [None]:
B. English to French Translation Using Seq2Seq with Attention

In [None]:
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds

# Step 1: Load dataset from CSV using Pandas
data_path = '/kaggle/input/en-fr-translation-dataset/en-fr.csv'
data = pd.read_csv(data_path)

# Check the first few rows and the column names of the dataframe
print(data.head())
print("Columns in the DataFrame:", data.columns.tolist())  # Print the actual column names

# Ensure the dataframe contains the required columns
expected_columns = ['en', 'fr']
assert all(col in data.columns for col in expected_columns), f"CSV must contain {expected_columns} columns"

# Drop rows where either sentence is missing. pandas reads an empty cell as a
# float NaN, and a float mixed into a column of strings makes the conversion to
# a string tensor below fail.
sentence_pairs = data.dropna(subset=expected_columns)

# Step 2: Convert the DataFrame to a TensorFlow Dataset
# Each element is an (English, French) pair of scalar string tensors.
# astype(str) guarantees every cell really is a string before conversion.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (sentence_pairs['en'].astype(str).values, sentence_pairs['fr'].astype(str).values))

# Print the first example to verify conversion
for english, french in train_dataset.take(1):
    print(f'English: {english.numpy().decode("utf-8")}, French: {french.numpy().decode("utf-8")}')

# Optional: Define constants for batch size and max length
BATCH_SIZE = 64
MAX_LENGTH = 40

# Optional: Tokenization process
# Tokenizer setup for input (English) and output (French).
# Each tokenizer is built from one full pass over its side of the corpus.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for en, fr in train_dataset), target_vocab_size=2**13)
tokenizer_fr = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (fr.numpy() for en, fr in train_dataset), target_vocab_size=2**13)

# Encoding function
def encode(en_t, fr_t):
    """Turn one (English, French) pair of eager string tensors into token-id lists.

    Each sentence is decoded from UTF-8, tokenized with its language's tokenizer
    and wrapped with the two ids just past that tokenizer's vocabulary:
    ``vocab_size`` in front and ``vocab_size + 1`` at the end.

    Returns:
        A tuple ``(english_ids, french_ids)`` of Python lists.
    """
    en_marker = tokenizer_en.vocab_size
    fr_marker = tokenizer_fr.vocab_size

    en_ids = tokenizer_en.encode(en_t.numpy().decode('utf-8'))
    fr_ids = tokenizer_fr.encode(fr_t.numpy().decode('utf-8'))

    return [en_marker, *en_ids, en_marker + 1], [fr_marker, *fr_ids, fr_marker + 1]

def tf_encode(en_t, fr_t):
    """Wrap ``encode`` so it can be applied with ``Dataset.map``.

    ``encode`` needs eager tensors (it calls ``.numpy()``), so it is run through
    ``tf.py_function``.

    Args:
        en_t: Scalar string tensor holding the English sentence.
        fr_t: Scalar string tensor holding the French sentence.

    Returns:
        ``(english_ids, french_ids)`` as two int64 tensors declared rank-1.
    """
    result_en, result_fr = tf.py_function(encode, [en_t, fr_t], [tf.int64, tf.int64])
    # py_function outputs carry no static shape. Declare them as variable-length
    # vectors so later shape-dependent steps (size filter, padded batching) see
    # a known rank.
    result_en.set_shape([None])
    result_fr.set_shape([None])
    return result_en, result_fr

# Replace every (English, French) string pair with its token-id encoding
train_dataset = train_dataset.map(map_func=tf_encode)

# Filter sequences longer than MAX_LENGTH
def filter_max_length(en, fr, max_length=MAX_LENGTH):
    """Return a boolean tensor that is True when both inputs fit within ``max_length``.

    The length of each input is its total element count (``tf.size``).
    """
    en_fits = tf.size(en) <= max_length
    fr_fits = tf.size(fr) <= max_length
    return tf.logical_and(en_fits, fr_fits)

# Keep only the pairs accepted by filter_max_length, shuffle with a
# 20000-element buffer, pad each batch of BATCH_SIZE pairs, and prefetch.
train_dataset = (
    train_dataset
    .filter(filter_max_length)
    .shuffle(20000)
    .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Show one processed batch as a sanity check
for en, fr in train_dataset.take(1):
    print(f'Encoded English: {en.numpy()}')
    print(f'Encoded French: {fr.numpy()}')


C) NEURAL MACHINE TRANSLATION WITH TRANSFORMERS (ENGLISH TO GERMAN)



In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as text

# Load the dataset
# Examples are loaded as feature dicts and the two languages are picked by
# name. With as_supervised=True the tuple order follows the builder's
# supervised keys, which the TFDS catalog lists as ('de', 'en') for this
# config; unpacking those tuples as (en, de) would build each tokenizer on the
# other language. Selecting by name makes the order explicit either way.
dataset, metadata = tfds.load('wmt14_translate/de-en', with_info=True, as_supervised=False)


def _as_en_de_pair(example):
    """Pick the English and German sentences out of a feature dict, in (en, de) order."""
    return example['en'], example['de']


train_examples = dataset['train'].map(_as_en_de_pair)
val_examples = dataset['validation'].map(_as_en_de_pair)

# Tokenizer and preprocessing
# Each tokenizer is built from one full pass over its side of the training split.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for en, de in train_examples), target_vocab_size=2**13)
tokenizer_de = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (de.numpy() for en, de in train_examples), target_vocab_size=2**13)

def encode(lang1, lang2):
    """Tokenize one sentence pair given as eager tensors.

    ``lang1`` goes through ``tokenizer_en`` and ``lang2`` through
    ``tokenizer_de``. Each id list is framed by the two ids just past its
    tokenizer's vocabulary: ``vocab_size`` first, ``vocab_size + 1`` last.

    Returns:
        A tuple of two Python lists of token ids.
    """
    en_vocab = tokenizer_en.vocab_size
    en_ids = [en_vocab, *tokenizer_en.encode(lang1.numpy()), en_vocab + 1]

    de_vocab = tokenizer_de.vocab_size
    de_ids = [de_vocab, *tokenizer_de.encode(lang2.numpy()), de_vocab + 1]

    return en_ids, de_ids

def tf_encode(en, de):
    """Apply ``encode`` through ``tf.py_function`` and declare both results rank-1.

    Returns:
        ``(result_en, result_de)``: two int64 tensors with static shape ``[None]``.
    """
    encoded = tf.py_function(func=encode, inp=[en, de], Tout=[tf.int64, tf.int64])
    result_en, result_de = encoded
    # py_function leaves the static shape unset; mark each output as a
    # variable-length vector.
    for token_ids in (result_en, result_de):
        token_ids.set_shape([None])
    return result_en, result_de

# Encode every pair, then group into padded batches of 64 pairs each
train_dataset = (
    train_examples
    .map(tf_encode)
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)
val_dataset = (
    val_examples
    .map(tf_encode)
    .padded_batch(batch_size=64, padded_shapes=([None], [None]))
)

# Transformer Model
class Transformer(tf.keras.Model):
    """Encoder-decoder model: encoder, decoder, then a dense projection to target-vocabulary logits.

    NOTE(review): ``Encoder`` and ``Decoder`` are not defined in the visible
    part of this notebook; they must exist before this class is instantiated.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        """Build the encoder, the decoder and the output projection.

        All arguments except ``target_vocab_size`` are forwarded as-is to
        ``Encoder`` / ``Decoder``; ``target_vocab_size`` also sets the width of
        the final dense layer.
        """
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=None, enc_padding_mask=None, look_ahead_mask=None, dec_padding_mask=None):
        """Run a forward pass and return the output of the final dense layer.

        Args:
            inputs: Indexable pair; ``inputs[0]`` is fed to the encoder and
                ``inputs[1]`` to the decoder.
            training: Forwarded to the encoder and decoder.
            enc_padding_mask: Forwarded to the encoder.
            look_ahead_mask: Forwarded to the decoder.
            dec_padding_mask: Forwarded to the decoder.

        The mask and ``training`` parameters default to ``None`` because Keras
        ``fit`` / ``evaluate`` / ``predict`` invoke ``call`` with only
        ``inputs`` and ``training``; without defaults those entry points raise
        a ``TypeError`` for the missing arguments.
        NOTE(review): whether ``Encoder`` / ``Decoder`` accept ``None`` masks
        cannot be confirmed from here.
        """
        encoder_input, decoder_input = inputs[0], inputs[1]
        enc_output = self.encoder(encoder_input, training, enc_padding_mask)
        # The decoder's second return value is discarded.
        dec_output, _ = self.decoder(decoder_input, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output

# Define hyperparameters
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

# +2 leaves room for the two extra ids that encode() puts around every sentence
input_vocab_size = tokenizer_en.vocab_size + 2
target_vocab_size = tokenizer_de.vocab_size + 2
pe_input = 1000
pe_target = 1000

# Instantiate and compile model
transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, dropout_rate)

learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=0.0001, decay_steps=10000, decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

transformer.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])


def _to_teacher_forcing(en, de):
    """Turn an (en, de) batch into ``((encoder_input, decoder_input), target)``.

    Keras reads a two-element dataset item as (inputs, targets). The model's
    call() indexes ``inputs[0]`` / ``inputs[1]``, so both sequences must travel
    together as the inputs. The decoder sees the German ids without the last
    position and is scored against the same ids shifted left by one.
    """
    return (en, de[:, :-1]), de[:, 1:]


# Training loop
# NOTE(review): fit() invokes call() with only inputs and training, so call()
# must not require its mask arguments positionally for this to run.
transformer.fit(
    train_dataset.map(_to_teacher_forcing),
    epochs=10,
    validation_data=val_dataset.map(_to_teacher_forcing),
)

# Sample translation
sample_sentence = "Hello, how are you?"
# Frame the sentence with the same two marker ids encode() adds to training data.
sample_input = [tokenizer_en.vocab_size] + tokenizer_en.encode(sample_sentence) + [tokenizer_en.vocab_size + 1]
sample_input = tf.expand_dims(tf.constant(sample_input, dtype=tf.int64), axis=0)

# Prediction
# Greedy decoding: start from the German leading marker id and append the most
# likely next id until the trailing marker id is produced or the cap is reached.
de_first_id = tokenizer_de.vocab_size
de_last_id = tokenizer_de.vocab_size + 1
max_decode_steps = 40  # safety cap on generated ids; well below pe_target
decoder_input = tf.constant([[de_first_id]], dtype=tf.int64)

for _ in range(max_decode_steps):
    # Masks are passed explicitly because call() declares them.
    # NOTE(review): if Decoder needs a look-ahead mask for causal attention,
    # build and pass one here (and during training).
    output = transformer(
        (sample_input, decoder_input),
        training=False,
        enc_padding_mask=None,
        look_ahead_mask=None,
        dec_padding_mask=None,
    )
    # Only the logits at the last position decide the next id.
    next_id = tf.argmax(output[:, -1:, :], axis=-1)
    if int(next_id[0, 0]) == de_last_id:
        break
    decoder_input = tf.concat([decoder_input, next_id], axis=-1)

# Drop the leading marker; ids at or above vocab_size are markers, not subwords.
output_sentence = tokenizer_de.decode([int(i) for i in decoder_input[0, 1:] if i < tokenizer_de.vocab_size])

print(f"Input: {sample_sentence}")
print(f"Translation: {output_sentence}")
