<a href="https://colab.research.google.com/github/mukulre/Projects/blob/main/Language_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

lines=pd.read_csv("Hindi_English_Truncated_Corpus.csv",encoding='utf-8')
lines=lines[lines['source']=='ted']
lines=lines[~pd.isnull(lines['english_sentence'])]
lines.drop_duplicates(inplace=True)
# Let us pick any 25000 rows from the dataset
lines=lines.sample(n=25000,random_state=42)
lines.shape

(25000, 3)

In [2]:
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [3]:
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [4]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [5]:
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [6]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [7]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

num_decoder_tokens += 1 #for zero padding
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
lines = shuffle(lines)

In [8]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)

X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [25]:
import tensorflow as tf  # Import tensorflow
import numpy as np  # Import numpy

X_train = pd.read_pickle('X_train.pkl')
X_test = pd.read_pickle('X_test.pkl')

# Define missing variables (Ensure you replace with actual values)
batch_size = 128
max_length_src = 50  # Example max length of source sequences
max_length_tar = 50  # Example max length of target sequences
num_decoder_tokens = 1000  # Example vocabulary size
epochs = 10
train_samples = 10000  # Example training samples
val_samples = 2000  # Example validation samples

# Example dictionaries (Ensure they contain actual mappings)
input_token_index = {"example": 1}  # Replace with real token mappings
target_token_index = {"example": 1, "START_": 2, "END_": 3}  # Replace with real token mappings

# Placeholder data (Replace with actual training/testing data)
X_train = ["example sentence"] * train_samples
y_train = ["START_ example END_"] * train_samples
X_test = ["example test"] * val_samples
y_test = ["START_ test END_"] * val_samples

def generate_batch(X, y, batch_size=128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):

            encoder_input_data = np.zeros((actual_batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((actual_batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((actual_batch_size, max_length_tar, num_decoder_tokens), dtype='float32')

            actual_batch_size = min(batch_size, len(X) - j)  # Handle last batch
            for i, (input_text, target_text) in enumerate(zip(X[j:j + actual_batch_size], y[j:j + actual_batch_size])):
                for t, word in enumerate(input_text.split()):
                        encoder_input_data[i, t] = input_token_index[word]  # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if word in target_token_index:
                        if t < len(target_text.split()) - 1:
                            decoder_input_data[i, t] = target_token_index[word]  # decoder input seq
                        if t > 0:
                            decoder_target_data[i, t - 1, target_token_index[word]] = 1.  # one-hot encoding

            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

# Define output signature for the generator
output_signature = (
    (tf.TensorSpec(shape=(None, max_length_src), dtype=tf.float32),
     tf.TensorSpec(shape=(None, max_length_tar), dtype=tf.float32)),
    tf.TensorSpec(shape=(None, max_length_tar, num_decoder_tokens), dtype=tf.float32)
)

# Convert the generator to a tf.data.Dataset and specify output_signature
train_dataset = tf.data.Dataset.from_generator(
    lambda: generate_batch(X_train, y_train, batch_size),
    output_signature=output_signature
)
val_dataset = tf.data.Dataset.from_generator(
    lambda: generate_batch(X_test, y_test, batch_size),
    output_signature=output_signature
)

# Dummy Model (Ensure this matches your real model)
encoder_inputs = tf.keras.Input(shape=(max_length_src,))
decoder_inputs = tf.keras.Input(shape=(max_length_tar,))
reshaped_decoder_inputs = tf.keras.layers.Reshape((max_length_tar,1))(decoder_inputs)
embedding_layer = tf.keras.layers.Embedding(
    num_decoder_tokens,
    output_dim=256,
    #embedding_dim=256  # Choose an appropriate embedding dimension
)
embedded_decoder_inputs = embedding_layer(decoder_inputs)
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

lstm_layer = tf.keras.layers.LSTM(512, return_sequences=True)  # Choose an appropriate LSTM unit size
lstm_output = lstm_layer(embedded_decoder_inputs)
decoder_outputs = tf.keras.layers.Dense(num_decoder_tokens, activation='softmax')(lstm_output)

model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Fit the model using the tf.data.Dataset objects
model.fit(train_dataset,
          steps_per_epoch=train_samples // batch_size,
          epochs=epochs,
          validation_data=val_dataset,
          validation_steps=val_samples // batch_size)

k = 0  # Initialize k to 0 before incrementing
k += 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k + 1].values[0])
print('Actual Hindi Translation:', y_train[k:k + 1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Epoch 1/10


UnknownError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
UnboundLocalError: cannot access local variable 'actual_batch_size' where it is not associated with a value
Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "<ipython-input-25-77ee098ad310>", line 31, in generate_batch
    encoder_input_data = np.zeros((actual_batch_size, max_length_src), dtype='float32')
                                   ^^^^^^^^^^^^^^^^^

UnboundLocalError: cannot access local variable 'actual_batch_size' where it is not associated with a value


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_multi_step_on_iterator_19064]