In [3]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, default_data_collator
from datasets import load_dataset
from itertools import chain
import json

In [4]:
# Load the Llama-2 tokenizer.
# `add_eos_token=True` is passed at construction time so every encoded text is
# terminated with an EOS token; this keeps document boundaries visible after
# the texts are concatenated into fixed-size blocks further down.
# `trust_remote_code` is intentionally omitted: Llama is supported natively by
# transformers, so there is no reason to allow code from the Hub repo to run.
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Llama-2-7b-hf',
    add_eos_token=True,
)

In [5]:
# Write the first 1000 lines of the news JSONL file to test.jsonl as a small sample.
!head -n 1000 hf-datasets/dedupe-datasets/news.jsonl > test.jsonl

In [10]:
# Read the sampled JSONL file and keep it as a single `train` split.
raw_datasets = load_dataset(
    'json',
    data_files='test.jsonl',
    split='train',
)

Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-6d91dd6cf4394a3b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-6d91dd6cf4394a3b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


In [11]:
# Peek at the first document. Index the row first so only one record is read,
# instead of materialising the entire 'text' column and then taking element 0.
raw_datasets[0]['text']

"\nSubscribe to our Telegram channel for the latest stories and updates.\n\nIn yet another baffling Tweet, Elon Musk has claimed that he will buy the Manchester United Football Club.\n\nAlso, I’m buying Manchester United ur welcome— Elon Musk (@elonmusk) August 17, 2022 \nThis comes after the fiasco of him wanting to buy the Twitter social media platform for $44 billion (around RM195 billion) and pulling out midway through the deal. Both parties have filed lawsuits against each other.\nThe Tesla Motors CEO has a habit of tweeting funny (and oftentimes odd) thoughts and jokes, so much so that even his loyal fan base is no longer clear if he is serious or not.\nHow far has it come that even us tesla fans dont know whether hes trolling or not 💀— Werran Buffet (@Yeeeeeeeeeeet11) August 17, 2022 \nHowever, fans of Manchester United took the chance to criticise their team’s abysmal performance in their recent games.\nIf they keep playing like they have been you'll get a discount.— Cal (@CalE

In [12]:
text_column_name = 'text'


def tokenize_function(examples):
    """Tokenize the text column of a batch.

    Args:
        examples: Mapping from column name to column values; only the
            ``text_column_name`` entry is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples[text_column_name]
    return tokenizer(texts)

In [13]:
# Tokenize in batches. The original columns are removed so that only the
# tokenizer's outputs remain in the resulting dataset.
column_names = raw_datasets.column_names
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    desc="Running tokenizer on dataset",
    remove_columns=column_names,
    batched=True,
)

Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
block_size = 1024


def group_texts(examples):
    """Concatenate a batch of sequences and re-split it into fixed-size blocks.

    Every column in ``examples`` is flattened into one long list and cut into
    consecutive chunks of ``block_size`` items. The trailing remainder that
    does not fill a whole block is dropped, so a batch shorter than
    ``block_size`` yields no blocks at all.

    Args:
        examples: Mapping from column name to a list of sequences. It must
            contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long
        chunks, plus a ``"labels"`` column holding a shallow copy of the
        ``"input_ids"`` chunk list.
    """
    # Flatten each column into a single long sequence.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The first column determines the length; strip the part that does not
    # fill a complete block.
    first_column = list(flattened)[0]
    available = len(flattened[first_column])
    usable = available - available % block_size
    starts = range(0, usable, block_size)

    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start: start + block_size] for start in starts]

    # Labels are the inputs themselves (outer list copied, chunks shared).
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
# Re-chunk the tokenized documents into fixed-length training blocks.
lm_datasets = tokenized_datasets.map(
    group_texts,
    desc=f"Grouping texts in chunks of {block_size}",
    batched=True,
)

Grouping texts in chunks of 1024:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
# Serve the grouped blocks in shuffled batches of three.
train_dataloader = DataLoader(
    lm_datasets,
    batch_size=3,
    collate_fn=default_data_collator,
    shuffle=True,
)

In [18]:
# Number of batches in one pass over the loader.
len(train_dataloader)

264

In [52]:
# Manual iterator over the loader, for pulling out batches by hand.
i = iter(train_dataloader)

In [56]:
# Inspect one batch. A fresh iterator is created here so the cell does not
# depend on how far a shared iterator was advanced by earlier runs, and so it
# cannot raise StopIteration when re-executed.
next(iter(train_dataloader))

{'input_ids': tensor([[  423, 30010, 29879,  ..., 15145,   465,  1943],
         [29906, 29896, 29899,  ...,  8724,   508,   679],
         [ 3935,  4982,   408,  ...,   278,  6121,   325]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': tensor([[  423, 30010, 29879,  ..., 15145,   465,  1943],
         [29906, 29896, 29899,  ...,  8724,   508,   679],
         [ 3935,  4982,   408,  ...,   278,  6121,   325]])}