In [1]:
pip install torch transformers datasets




In [2]:
pip install sentencepiece



In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric
import torch


In [4]:
import pandas as pd
from datasets import Dataset

# Load the dataset from the CSV file
csv_file_path = '/train.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file_path, header=None, names=['text'])

# Convert the DataFrame into a Hugging Face dataset
dataset = Dataset.from_pandas(df)


In [5]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4937
})

In [6]:
# Choose the appropriate model checkpoint
model_checkpoint = "t5-base"  # Or any other appropriate model

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize function to encode the texts
def tokenize_function(examples):
    # Tokenize the text into input IDs and attention masks
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenize function to all sentences in the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)



For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Map:   0%|          | 0/4937 [00:00<?, ? examples/s]

In [7]:
from transformers import pipeline

# Use a pre-trained model for translation from English to Hindi
translator = pipeline('translation_en_to_hi', model='Helsinki-NLP/opus-mt-en-hi')


source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



In [None]:
# Function to translate a batch of texts
def translate_batch(texts):
    translations = translator(texts, max_length=512)
    return [translation['translation_text'] for translation in translations]

# Batch size (you may adjust this depending on your hardware)
batch_size = 10

# Container for the translated texts
translated_texts = []

# Translate in batches
for i in range(0, len(tokenized_dataset), batch_size):
    batch_texts = tokenized_dataset[i:i+batch_size]['text']
    batch_translations = translate_batch(batch_texts)
    translated_texts.extend(batch_translations)

    # Optional: Print progress
    print(f"Translated {i + len(batch_translations)}/{len(tokenized_dataset)} sentences")

# Add the translations to the original dataset
tokenized_dataset = tokenized_dataset.add_column('hindi_translation', translated_texts)


Translated 10/4937 sentences
Translated 20/4937 sentences
Translated 30/4937 sentences
Translated 40/4937 sentences
Translated 50/4937 sentences
Translated 60/4937 sentences
Translated 70/4937 sentences
Translated 80/4937 sentences
Translated 90/4937 sentences
Translated 100/4937 sentences
Translated 110/4937 sentences
Translated 120/4937 sentences
Translated 130/4937 sentences
Translated 140/4937 sentences
Translated 150/4937 sentences
Translated 160/4937 sentences
Translated 170/4937 sentences
Translated 180/4937 sentences
Translated 190/4937 sentences
Translated 200/4937 sentences
Translated 210/4937 sentences
Translated 220/4937 sentences
Translated 230/4937 sentences
Translated 240/4937 sentences
Translated 250/4937 sentences
Translated 260/4937 sentences
Translated 270/4937 sentences
Translated 280/4937 sentences
Translated 290/4937 sentences
Translated 300/4937 sentences
Translated 310/4937 sentences
Translated 320/4937 sentences
Translated 330/4937 sentences
Translated 340/4937

In [None]:
# Convert the dataset to a pandas DataFrame
translated_df = tokenized_dataset.to_pandas()

# Save the DataFrame to a CSV file
translated_df.to_csv('translated_dataset.csv', index=False)
