In [6]:
from typing import Dict, Sequence, Union

import datasets
import pandas as pd

In [2]:
# Read the translation pairs and rename the two sentence columns to the
# 'input' / 'output' names that the preprocessing code further down indexes.
df = (
    pd.read_parquet("en-2-fr-translation.parquet", engine='pyarrow')
    .rename(
        columns={
            'English words/sentences': 'input',
            'French words/sentences': 'output',
        }
    )
)
df

Unnamed: 0,input,output
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175461,We need to uphold laws against discrimination ...,Nous devons faire respecter les lois contre la...
175462,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175463,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175464,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [3]:
# Export the frame as JSON Lines (one JSON record per row) so it can be
# loaded with the `json` dataset builder.
df.to_json(
    path_or_buf='en-2-fr-translation.jsonl',
    orient='records',
    lines=True,
)

In [32]:
# Scratch cell: load the JSONL export back through `datasets` to experiment
# with the dataset / tokenizer.
train_dataset = datasets.load_dataset(
    path='json',
    data_files='en-2-fr-translation.jsonl',
)
train_dataset

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 5809.29it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 342.92it/s]
Generating train split: 175466 examples [00:00, 3527675.37 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 175466
    })
})

In [80]:
import transformers
from torch.utils.data import Dataset
import torch
from dataset import fmt_prompt
import os
import copy 


In [71]:
# Slow (non-fast) tokenizer for microsoft/phi-2, configured for right-side
# padding with an explicit "<|pad|>" padding token and a 2048-token cap.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'microsoft/phi-2',
    use_fast=False,
    trust_remote_code=True,
    model_max_length=2048,
    pad_token="<|pad|>",
    padding_side="right",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [82]:
def _tokenize(
        strings: Sequence[str],
        tokenizer: transformers.PreTrainedTokenizer
) -> Dict:
    """Tokenize each string individually, without padding.

    Every string is encoded on its own (``padding=False``) and truncated to
    ``tokenizer.model_max_length`` tokens.

    Args:
        strings: The texts to encode.
        tokenizer: Callable tokenizer exposing ``model_max_length`` and
            ``pad_token_id``; called with ``return_tensors='pt'``.

    Returns:
        A dict with four keys:
          * ``input_ids``: list with one 1-D tensor of token ids per string.
          * ``labels``: independent copies of the ``input_ids`` tensors.
          * ``input_ids_lens``: number of non-pad tokens in each sequence.
          * ``labels_lens``: same values as ``input_ids_lens``.
    """
    encodings = [
        tokenizer(
            text,
            return_tensors='pt',
            padding=False,
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]

    # Each encoding holds a single sequence, so take row 0 of the batch axis.
    input_ids = [encoding.input_ids[0] for encoding in encodings]
    # Clone instead of aliasing so that masking labels in place can never
    # silently modify the matching input_ids tensor.
    labels = [ids.clone() for ids in input_ids]

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # No pad token configured: nothing can be padding (padding=False),
        # and comparing a tensor against None would raise.
        input_ids_lens = [ids.numel() for ids in input_ids]
    else:
        input_ids_lens = [ids.ne(pad_token_id).sum().item() for ids in input_ids]
    labels_lens = list(input_ids_lens)

    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )

In [92]:
def preprocess(
        samples: Sequence[str],
        tokenizer: transformers.PreTrainedTokenizer
    ) -> Dict:
    """Turn a batch of translation pairs into token ids and masked labels.

    ``samples`` is indexed by the ``"input"`` and ``"output"`` keys. Each
    input is wrapped in the prompt template via ``fmt_prompt``; each output
    gets the tokenizer's EOS token appended. The prompt and the completion are
    concatenated and tokenized, and the label positions that belong to the
    prompt are overwritten with -100 so only the completion contributes to the
    loss.

    Returns:
        A dict with ``input_ids`` (token ids of prompt + completion) and
        ``labels`` (a copy with the prompt positions set to -100).
    """
    ignore_index = -100  # PyTorch's cross-entropy loss skips targets equal to -100

    prompts = []
    for phrase in samples["input"]:
        prompts.append(f"{fmt_prompt(phrase)}")
    completions = [f"{translation}{tokenizer.eos_token}" for translation in samples["output"]]

    # prompt + completion, e.g.
    # "Can you translate this phrase for me? <|phrase|>, Sure thing, here is the french translation <|target|>"
    full_texts = [prompt + completion for prompt, completion in zip(prompts, completions)]

    full_tokenized = _tokenize(full_texts, tokenizer)
    prompt_tokenized = _tokenize(prompts, tokenizer)

    input_ids = full_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    prompt_lengths = prompt_tokenized["input_ids_lens"]
    for label_row, prompt_length in zip(labels, prompt_lengths):
        # Mask the prompt tokens; the remaining positions keep their token ids.
        label_row[:prompt_length] = ignore_index

    return dict(input_ids=input_ids, labels=labels)


In [95]:
class MyDataSet(Dataset):
    """Map-style dataset of tokenized translation examples for fine-tuning.

    The JSON/JSONL file(s) are loaded with ``datasets.load_dataset`` and every
    batch is run through ``preprocess``, which produces the ``input_ids`` and
    ``labels`` columns kept on the instance.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess``.
        paths: A single path or a sequence of paths, passed as ``data_files``.
        limit: Number of leading rows of the ``train`` split to load. A falsy
            value (``None`` or ``0``) loads the whole split.
        batch_size: Number of rows handed to ``preprocess`` per ``map`` call.
    """

    def __init__(
        self,
        tokenizer: transformers.PreTrainedTokenizer,
        paths: Union[str, Sequence[str]],
        limit: Union[int, None] = 3000,
        batch_size: int = 300,
    ):
        super().__init__()
        split = f"train[0:{limit}]" if limit else "train"
        raw_split = datasets.load_dataset(
            "json",
            data_files=paths,
            split=split,
        )
        tokenized_split = raw_split.map(
            lambda samples: preprocess(samples, tokenizer),
            batched=True,
            batch_size=batch_size,
        )

        self.tokenizer = tokenizer
        self.input_ids = tokenized_split["input_ids"]
        self.labels = tokenized_split["labels"]

    def __len__(self) -> int:
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return example ``idx`` as tensors under ``input_ids`` and ``labels``."""
        return dict(
            input_ids=torch.tensor(self.input_ids[idx]),
            labels=torch.tensor(self.labels[idx]),
        )
        

In [99]:
# Build the fine-tuning dataset from the JSONL export (default row limit applies).
dataset = MyDataSet(
    tokenizer=tokenizer,
    paths=['en-2-fr-translation.jsonl'],
)

In [102]:
# Spot-check one example: `labels` should be -100 over the prompt tokens and
# equal to `input_ids` over the translation (including the trailing EOS id).
dataset[1000]

{'input_ids': tensor([21017, 27759,    25,   198,  1680,   345,  3387, 15772,   428,  9546,
           393,  1573,   284, 48718,    30,   220,   198,   775,   821, 15800,
            13,   198,   198, 21017, 18261,    25,   198,  3363,   286,  1781,
             0,  3423,   318,   257, 48718, 11059,   286,   326,  9546,    25,
           220,   198,    45,   516,   264,  2002,   274,  4628,  1460,    13,
         50256]),
 'labels': tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    45,   516,   264,  2002,   274,  4628,  1460,    13,
         50256])}

In [3]:
#### wandb experimenting ####

In [4]:
import wandb
import random


In [5]:

# Hyperparameters and run metadata tracked by wandb.
# NOTE(review): these values look like placeholders unrelated to the
# translation fine-tuning above; confirm before relying on them.
run_config = {
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
}

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="my-awesome-project",
    config=run_config,
)

try:
    # simulate training
    epochs = run_config["epochs"]  # single source of truth for the epoch count
    rng = random.Random(0)  # seeded so the simulated curves are reproducible
    offset = rng.random() / 5
    for epoch in range(2, epochs):
        acc = 1 - 2 ** -epoch - rng.random() / epoch - offset
        loss = 2 ** -epoch + rng.random() / epoch + offset

        # log metrics to wandb
        wandb.log({"acc": acc, "loss": loss})
finally:
    # finish the wandb run even if the loop raises; necessary in notebooks,
    # otherwise the run stays open in the kernel
    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlrav35[0m. Use [1m`wandb login --relogin`[0m to force relogin




0,1
acc,▁▂█▅█▇█▇
loss,█▆▂▁▂▂▂▂

0,1
acc,0.90225
loss,0.11831
