In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import pandas as pd
import numpy as np

In [2]:
# Load data
df = pd.read_csv("MT-EN-TM.csv", encoding='utf-8')
print(f"Dataset shape: {df.shape}\n")
print(df.head())

Dataset shape: (207, 3)

      english                 tamil  \
0    I slept.      நான் தூங்கினேன்.   
1  Calm down.     அமைதியாக இருங்கள்   
2  I'll walk.        நான் நடப்பேன்.   
3  Who is he?            அவன் யார்?   
4  Who knows?  யாருக்குத் தெரியும்?   

                                         attribution  
0  CC-BY 2.0 (France) Attribution: tatoeba.org #3...  
1  CC-BY 2.0 (France) Attribution: tatoeba.org #4...  
2  CC-BY 2.0 (France) Attribution: tatoeba.org #2...  
3  CC-BY 2.0 (France) Attribution: tatoeba.org #3...  
4  CC-BY 2.0 (France) Attribution: tatoeba.org #2...  


In [3]:
# Check for null values
print("\nNull values:")
print(df.isnull().sum())


Null values:
english        0
tamil          0
attribution    0
dtype: int64


In [4]:

# Data preparation
source_texts = df['english'].values
target_texts = df['tamil'].values

# Tokenization
source_tokenizer = Tokenizer(filters='')
target_tokenizer = Tokenizer(filters='')

source_tokenizer.fit_on_texts(source_texts)
target_tokenizer.fit_on_texts(target_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print(f"\nSource vocab size: {source_vocab_size}")
print(f"Target vocab size: {target_vocab_size}")

# Convert texts to sequences
source_sequences = source_tokenizer.texts_to_sequences(source_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)


Source vocab size: 440
Target vocab size: 581


In [5]:
# Pad sequences
max_source_len = max(len(seq) for seq in source_sequences)
max_target_len = max(len(seq) for seq in target_sequences)

source_sequences = pad_sequences(source_sequences, maxlen=max_source_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_target_len, padding='post')

print(f"\nMax source sequence length: {max_source_len}")
print(f"Max target sequence length: {max_target_len}")


Max source sequence length: 19
Max target sequence length: 11


In [6]:
# Reshape for RNN
X = source_sequences.reshape((source_sequences.shape[0], source_sequences.shape[1], 1))
y = target_sequences.reshape((target_sequences.shape[0], target_sequences.shape[1], 1))

In [7]:
# Model architecture
embedding_dim = 64
rnn_units = 128

model = Sequential([
    Embedding(input_dim=source_vocab_size, output_dim=embedding_dim, 
              input_length=max_source_len, mask_zero=True),
    SimpleRNN(rnn_units, return_sequences=False),
    Dense(target_vocab_size, activation='softmax')
])

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()





In [8]:
# Training
batch_size = 32
epochs = 50

history = model.fit(X, y, 
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)

Epoch 1/50


ValueError: Input 0 of layer "simple_rnn" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 19, 1, 64)

In [None]:
# Translation function
def translate(text):
    sequence = source_tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_source_len, padding='post')
    prediction = model.predict(padded_sequence)
    
    translated_indices = np.argmax(prediction, axis=-1)[0]
    translated_sentence = []
    
    for idx in translated_indices:
        if idx > 0:  # Skip padding
            word = target_tokenizer.index_word.get(idx, "")
            if word:
                translated_sentence.append(word)
    
    return ' '.join(translated_sentence)

# Test translations
test_phrases = [
    "Hello",
    "How are you",
    "I love you",
    "What is your name",
    "Thank you"
]

for phrase in test_phrases:
    translation = translate(phrase)
    print(f"'{phrase}' -> '{translation}'")