In [6]:
import pandas as pd
import datasets
from typing import Dict, Sequence

In [2]:
# Read the translation pairs and give the two text columns short names
# ('input' for the English side, 'output' for the French side).
df = (
    pd.read_parquet("en-2-fr-translation.parquet", engine="pyarrow")
    .rename(
        columns={
            "English words/sentences": "input",
            "French words/sentences": "output",
        }
    )
)
df

Unnamed: 0,input,output
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175461,We need to uphold laws against discrimination ...,Nous devons faire respecter les lois contre la...
175462,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175463,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175464,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [3]:
# Export the frame as JSON Lines (one JSON record per row) so that
# `datasets` can load it later.
df.to_json(
    'en-2-fr-translation.jsonl',
    orient='records',
    lines=True,
)

In [32]:
# Scratch cell: load the JSON Lines export through `datasets` to inspect it
# before wiring it up with the tokenizer.
train_dataset = datasets.load_dataset(
    'json',
    data_files='en-2-fr-translation.jsonl',
)
train_dataset

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 5809.29it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 342.92it/s]
Generating train split: 175466 examples [00:00, 3527675.37 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 175466
    })
})

In [33]:
import transformers
from torch.utils.data import Dataset
import torch
from dataset import fmt_prompt
import os


In [35]:
# Tokenizer for microsoft/phi-2: slow (non-fast) implementation, sequences
# capped at 2048 tokens, padding applied on the right.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'microsoft/phi-2',
    use_fast=False,
    trust_remote_code=True,
    model_max_length=2048,
    padding_side="right",
    # No explicit pad token is configured here (previously sketched as
    # pad_token=DEFAULT_PAD_TOKEN, left disabled).
)

tokenizer_config.json: 100%|██████████| 7.34k/7.34k [00:00<00:00, 3.83MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 6.83MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 4.16MB/s]
added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<00:00, 489kB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 42.8kB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 14.7MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
def preprocess(
        samples: Dict[str, Sequence[str]],
        tokenizer: transformers.PreTrainedTokenizer
    ) -> Dict:
    """Tokenize a batch of translation pairs for training.

    Each example is the prompt produced by ``fmt_prompt`` for the "input"
    text, followed by the "output" text and the tokenizer's EOS token.

    Args:
        samples: One batch of rows in column form, indexed by column name;
            the "input" and "output" columns are read.
        tokenizer: Tokenizer used to encode the examples. Its ``eos_token``
            terminates every target and its ``model_max_length`` is the
            truncation limit.

    Returns:
        A dict of equally long lists, one entry per example:
        "input_ids", "attention_mask" and "labels". "labels" is a copy of
        "input_ids"; prompt tokens are not masked out.
    """
    prompts = [f"{fmt_prompt(text)}" for text in samples["input"]]
    targets = [f"{translation}{tokenizer.eos_token}" for translation in samples["output"]]
    complete_examples = [prompt + target for prompt, target in zip(prompts, targets)]

    # Encode the whole batch in one call and keep plain Python lists
    # (no return_tensors): examples have different lengths and are not
    # padded here, so they cannot be stacked into one tensor.
    tokenized = tokenizer(
        complete_examples,
        padding=False,
        max_length=tokenizer.model_max_length,
        truncation=True,
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        # Independent copies so later edits to labels cannot alias input_ids.
        "labels": [list(ids) for ids in tokenized["input_ids"]],
    }

In [39]:
class MyDataSet(Dataset):
    """Map-style dataset of tokenized examples for fine-tuning a model.

    The JSON file(s) are loaded with ``datasets.load_dataset`` and run
    through ``preprocess`` in batches; the resulting dataset is kept in
    ``self.data`` and served one example at a time.
    """

    # Columns that __getitem__ converts to tensors when a row contains them.
    _TENSOR_COLUMNS = ("input_ids", "attention_mask", "labels")

    def __init__(self, tokenizer: "transformers.PreTrainedTokenizer", paths: str, limit=3000):
        """Load and preprocess the data.

        Args:
            tokenizer: Tokenizer handed to ``preprocess`` for every batch.
            paths: Forwarded unchanged as ``data_files`` to
                ``datasets.load_dataset`` (a single path or a list of paths).
            limit: If truthy, only the first ``limit`` rows of the train
                split are loaded; otherwise the whole split is used.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.data = (
            datasets.load_dataset(
                "json",
                data_files=paths,
                split=f"train[0:{limit}]" if limit else "train",
            )
            # A .filter(...) step can be inserted here to drop unwanted entries.
            .map(
                lambda samples: preprocess(samples, tokenizer),
                batched=True,
                batch_size=300,
            )
        )

    def __len__(self) -> int:
        """Return the number of examples held in ``self.data``."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return the tokenized columns of example ``idx`` as tensors.

        Only the columns named in ``_TENSOR_COLUMNS`` are returned; any of
        them missing from the row is skipped, and other columns (such as the
        raw text) are ignored.
        """
        row = self.data[idx]
        return {
            column: torch.tensor(row[column])
            for column in self._TENSOR_COLUMNS
            if column in row
        }
        

In [40]:
# Build the fine-tuning dataset from the JSON Lines export.
dataset = MyDataSet(
    tokenizer=tokenizer,
    paths=['en-2-fr-translation.jsonl'],
)

Map: 100%|██████████| 3000/3000 [00:00<00:00, 6220.91 examples/s]


TypeError: 'NoneType' object is not callable