In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds

# Step 1: Load the English/French sentence pairs from CSV using Pandas.
data_path = '/kaggle/input/wmt-sampled-50000-english-to-french-dataset/wmt_sample_50000.csv'
data = pd.read_csv(data_path)

# Show a sample of rows and the actual column names as a quick sanity check.
print(data.head())
print("Columns in the DataFrame:", data.columns.tolist())  # Print the actual column names

# Fail fast when a required column is missing. An explicit raise is used
# rather than `assert` because assertions are removed under `python -O`,
# which would let a malformed CSV slip through to a confusing KeyError later.
expected_columns = ['en', 'fr']
if not all(col in data.columns for col in expected_columns):
    raise ValueError(f"CSV must contain {expected_columns} columns")

# Step 1.1: Preprocess the data.
# Rows with a missing sentence on either side are dropped; any remaining
# non-string values are cast to str (they are converted, not dropped).
data = data.dropna(subset=expected_columns)
data['en'] = data['en'].astype(str)
data['fr'] = data['fr'].astype(str)

# Step 2: Wrap the aligned 'en' / 'fr' columns in a tf.data.Dataset whose
# elements are (english, french) pairs, one pair per DataFrame row.
english_sentences = data['en'].values
french_sentences = data['fr'].values
train_dataset = tf.data.Dataset.from_tensor_slices((english_sentences, french_sentences))

# Sanity check: decode the first pair back to text and display it.
for english, french in train_dataset.take(1):
    english_text = english.numpy().decode("utf-8")
    french_text = french.numpy().decode("utf-8")
    print(f'English: {english_text}, French: {french_text}')

# Constants consumed further down: BATCH_SIZE by the batching step and
# MAX_LENGTH by the sequence-length filter.
BATCH_SIZE = 64
MAX_LENGTH = 40

# Build one subword tokenizer per language from the raw sentences. Each
# corpus is a generator that streams that language's sentences (as bytes)
# out of the dataset; the requested target vocabulary size is 2**13.
build_subword_encoder = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus

english_corpus = (en.numpy() for en, _ in train_dataset)
tokenizer_en = build_subword_encoder(english_corpus, target_vocab_size=2**13)

french_corpus = (fr.numpy() for _, fr in train_dataset)
tokenizer_fr = build_subword_encoder(french_corpus, target_vocab_size=2**13)

# Eager-side encoder: turns one sentence pair into two lists of token ids.
def encode(en_t, fr_t):
    """Tokenize one English/French sentence pair into lists of integer ids.

    Each argument must expose ``.numpy()`` returning UTF-8 encoded bytes
    (as eager string tensors do). The sentence is decoded, encoded with the
    tokenizer for its language, then prefixed with that tokenizer's
    ``vocab_size`` and suffixed with ``vocab_size + 1`` -- presumably
    start/end-of-sequence marker ids.

    Returns:
        A ``(english_ids, french_ids)`` tuple of Python lists of ints.
    """
    en_marker = tokenizer_en.vocab_size
    english_ids = [
        en_marker,
        *tokenizer_en.encode(en_t.numpy().decode('utf-8')),
        en_marker + 1,
    ]

    fr_marker = tokenizer_fr.vocab_size
    french_ids = [
        fr_marker,
        *tokenizer_fr.encode(fr_t.numpy().decode('utf-8')),
        fr_marker + 1,
    ]

    return english_ids, french_ids

def tf_encode(en_t, fr_t):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    `encode` calls `.numpy()` and runs Python tokenizers, so it has to execute
    eagerly; `tf.py_function` makes that possible inside a tf.data pipeline.

    Args:
        en_t: English sentence tensor, forwarded to `encode`.
        fr_t: French sentence tensor, forwarded to `encode`.

    Returns:
        The list of two int64 tensors produced by `tf.py_function`
        (English ids, French ids), each declared as rank 1.
    """
    encoded = tf.py_function(encode, [en_t, fr_t], [tf.int64, tf.int64])
    # py_function outputs carry no static shape information. Declaring them
    # as 1-D, variable-length vectors lets downstream ops that need a known
    # rank (e.g. padded_batch without explicit padded_shapes) work.
    for ids in encoded:
        ids.set_shape([None])
    return encoded

# Replace every (english, french) element of the dataset with the pair of
# int64 token-id tensors produced by tf_encode.
train_dataset = train_dataset.map(map_func=tf_encode)

# Predicate used to discard pairs whose encoded sequences are too long.
def filter_max_length(en, fr, max_length=MAX_LENGTH):
    """Return a boolean tensor that is true when both sequences fit.

    A pair passes only if the element count of `en` and the element count
    of `fr` are each at most `max_length`.
    """
    en_fits = tf.size(en) <= max_length
    fr_fits = tf.size(fr) <= max_length
    return tf.logical_and(en_fits, fr_fits)

# Keep only the pairs for which filter_max_length reports that both encoded
# sequences have at most MAX_LENGTH elements.
length_filtered = train_dataset.filter(
    lambda en_ids, fr_ids: filter_max_length(en_ids, fr_ids, MAX_LENGTH))

# Shuffle through a 20000-element buffer, group into padded batches of
# BATCH_SIZE pairs (both components have variable length), and prefetch
# with an automatically tuned buffer size.
shuffled = length_filtered.shuffle(20000)
batched = shuffled.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
train_dataset = batched.prefetch(tf.data.experimental.AUTOTUNE)

# Pull a single batch from the finished pipeline and display the encoded
# English and French id arrays to verify the processing end to end.
for en, fr in train_dataset.take(1):
    english_batch = en.numpy()
    french_batch = fr.numpy()
    print(f'Encoded English: {english_batch}')
    print(f'Encoded French: {french_batch}')


   Unnamed: 0                                                 en  \
0           0  Survey participants noted that growth in the s...   
1           1  Influenza B was reported in a very small propo...   
2           2  If you are travelling with infants or small ch...   
3           3     ◦ Failure to provide a social insurance number   
4           4  This often includes arrangements of traps and ...   

                                                  fr  
0  Les participants à l’enquête notent que la cro...  
1  Le virus de l'influenza B a été signalé dans u...  
2  Emportez également un double de votre ordonnan...  
3  ◦ Défaut de fournir un numéro d'assurance sociale  
4  Le circuit comprend souvent des systèmes de pi...  
Columns in the DataFrame: ['Unnamed: 0', 'en', 'fr']
English: Survey participants noted that growth in the second quarter will likely be substantial, particularly in the corporate travel segment, compared with the dismal performance of the second quarter of 200