In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
class DataHandler:
    def __init__(self, model_checkpoint='Helsinki-NLP/opus-mt-en-es', 
                 max_input_length=128, max_target_length=128):
        self.model_checkpoint = model_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        self.tokenizer.add_special_tokens({"cls_token": "<s>"})
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def load_dataset(self, filepath):
        # df = pd.read_csv(filepath, sep="\t", header=None)
        # df = df.iloc[:30000]
        # df.columns = ['en', 'es']
        # df.to_csv('spa.csv', index=None)
        raw_dataset = load_dataset('csv', data_files='spa.csv')
        split = raw_dataset['train'].train_test_split(test_size=0.3, seed=42)
        tokenized_datasets = split.map(
            self.preprocess_function, batched=True,
            remove_columns=split["train"].column_names,
        )
        return tokenized_datasets

    def preprocess_function(self, batch):
        model_inputs = self.tokenizer(
            batch['en'], max_length=self.max_input_length, truncation=True)
        labels = self.tokenizer(
            batch['es'], max_length=self.max_target_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def prepare_dataloader(self, tokenized_datasets, batch_size=32):
        data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        train_loader = DataLoader(
            tokenized_datasets["train"],
            shuffle=True,
            batch_size=batch_size,
            collate_fn=data_collator
        )
        valid_loader = DataLoader(
            tokenized_datasets["test"],
            batch_size=batch_size,
            collate_fn=data_collator
        )
        return train_loader, valid_loader

In [21]:
# Step 1: Instantiate the DataHandler object
data_handler = DataHandler()

In [22]:

raw_dataset = load_dataset('csv', data_files = 'spa.csv')
raw_dataset

Found cached dataset csv (C:/Users/luvve/.cache/huggingface/datasets/csv/default-5578750b55a9497d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 18.33it/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 30000
    })
})

In [23]:
df_dataset = pd.DataFrame(raw_dataset)
df_dataset

Unnamed: 0,train
0,"{'en': 'Go.', 'es': 'Ve.'}"
1,"{'en': 'Go.', 'es': 'Vete.'}"
2,"{'en': 'Go.', 'es': 'Vaya.'}"
3,"{'en': 'Hi.', 'es': 'Hola.'}"
4,"{'en': 'Run!', 'es': '¡Corre!'}"
...,...
29995,"{'en': 'How was your honeymoon?', 'es': '¿Cómo..."
29996,"{'en': 'How well can you skate?', 'es': '¿Qué ..."
29997,"{'en': 'How well can you skate?', 'es': '¿Cuán..."
29998,"{'en': 'How wide is this river?', 'es': '¿Qué ..."


In [24]:
df_dataset['train']

0                               {'en': 'Go.', 'es': 'Ve.'}
1                             {'en': 'Go.', 'es': 'Vete.'}
2                             {'en': 'Go.', 'es': 'Vaya.'}
3                             {'en': 'Hi.', 'es': 'Hola.'}
4                          {'en': 'Run!', 'es': '¡Corre!'}
                               ...                        
29995    {'en': 'How was your honeymoon?', 'es': '¿Cómo...
29996    {'en': 'How well can you skate?', 'es': '¿Qué ...
29997    {'en': 'How well can you skate?', 'es': '¿Cuán...
29998    {'en': 'How wide is this river?', 'es': '¿Qué ...
29999    {'en': 'How will you stop them?', 'es': '¿Cómo...
Name: train, Length: 30000, dtype: object

In [16]:
split = raw_dataset['train'].train_test_split(test_size=0.3, seed=42)
split

Loading cached split indices for dataset at C:\Users\luvve\.cache\huggingface\datasets\csv\default-5578750b55a9497d\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-f8892a324f96c202.arrow and C:\Users\luvve\.cache\huggingface\datasets\csv\default-5578750b55a9497d\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-853b8cac2b333041.arrow


DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['en', 'es'],
        num_rows: 9000
    })
})

In [25]:
# Step 3: Define the preprocess_function
def preprocess_function(batch):
    model_inputs = data_handler.tokenizer(
        batch['en'], max_length=data_handler.max_input_length, truncation=True)
    labels = data_handler.tokenizer(
        batch['es'], max_length=data_handler.max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [26]:
# Step 4: Map the preprocess_function to the split dataset
tokenized_datasets = split.map(
    preprocess_function, batched=True,
    remove_columns=split["train"].column_names,
)

                                                                   

In [27]:
# Print the first few elements of the tokenized_datasets to see the output
for i in range(5):
    # print()
    
    print(tokenized_datasets['train'][i])

{'input_ids': [33, 88, 9222, 48, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [711, 25, 4947, 36359, 8, 91, 11503, 5170, 279, 3, 0]}
{'input_ids': [552, 11490, 9, 310, 255, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [54, 260, 88, 547, 6291, 151, 4286, 324, 3, 0]}
{'input_ids': [143, 31, 125, 1208, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [539, 43, 155, 1208, 75, 3, 0]}
{'input_ids': [1093, 220, 1890, 23, 48, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [15165, 1250, 380, 4227, 2583, 36, 25, 12288, 3, 0]}
{'input_ids': [124, 20, 100, 18422, 48, 141, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [350, 73, 18958, 9, 29, 8, 10032, 261, 100, 3, 0]}


In [28]:
print(tokenized_datasets['train'])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 21000
})


In [29]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
})
