In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    get_scheduler,
    default_data_collator,
    SchedulerType
)
import os
import json
from itertools import chain
from datasets import load_dataset

In [2]:
# Path of the JSON-lines training corpus; later cells also derive their
# cache file names from it.
train_file = 'combine-mistral.jsonl'
# Tokenizer for the 'mistralai/Mistral-7B-v0.1' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'mistralai/Mistral-7B-v0.1',
)
# Switch off the tokenizer's add-BOS / add-EOS flags.
# NOTE(review): presumably the corpus text already carries literal <s>/</s>
# markers (the decoded sample further down shows them) — confirm.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
# Name of the dataset column that is passed to the tokenizer.
text_column_name = 'text'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Read `train_file` with the 'json' loader and select its 'train' split;
# the result is used below as a single dataset (`.column_names`, `.map`).
raw_datasets = load_dataset(
    'json',
    data_files=train_file,
    split='train'
)

In [4]:
def tokenize_function(examples):
    """Tokenize one batch of rows.

    Looks up the column named by the module-level ``text_column_name`` in
    ``examples`` and passes it to the module-level ``tokenizer``; whatever the
    tokenizer returns is handed back unchanged.
    """
    texts = examples[text_column_name]
    encoded = tokenizer(texts)
    return encoded

In [5]:
# Tokenize the corpus in parallel batches. The original columns are dropped so
# only the tokenizer outputs remain, and the result is cached in an explicit
# file named after the input file.
filename = os.path.basename(train_file)
column_names = raw_datasets.column_names

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=20,
    remove_columns=column_names,
    cache_file_name=f'./{filename}-tokenized',
    load_from_cache_file=True,
)

tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 37117462
})

In [None]:
block_size = 4096  # number of tokens in each packed sequence


def group_texts(examples):
    """Pack a batch of tokenized rows into fixed-length blocks.

    Every column in ``examples`` (a mapping of column name -> list of token
    lists) is flattened into one long list and then sliced into consecutive
    chunks of ``block_size`` items. The trailing remainder that does not fill
    a whole block is dropped; the usable length is computed from the first
    column. A ``labels`` column is added as a copy of the ``input_ids`` chunks.
    """
    columns = list(examples.keys())

    # Flatten each column's per-row lists into a single token stream.
    flattened = {}
    for column in columns:
        flattened[column] = list(chain.from_iterable(examples[column]))

    # Largest multiple of block_size that fits in the first column's stream.
    usable_length = len(flattened[columns[0]]) // block_size * block_size
    block_starts = range(0, usable_length, block_size)

    result = {}
    for column, stream in flattened.items():
        result[column] = [stream[start: start + block_size] for start in block_starts]

    result["labels"] = result["input_ids"].copy()
    return result

# Apply `group_texts` to the tokenized rows in parallel batches, caching the
# output in a file whose name records the source file and the block size.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    load_from_cache_file=True,
    cache_file_name=f'./{filename}-grouped-{block_size}',
    num_proc=20,
)

In [8]:
# Number of rows in the grouped dataset (one row per packed block).
len(lm_datasets)

2521345

In [9]:
# Sanity check: decode the first packed block back to text to inspect it.
tokenizer.decode(lm_datasets[0]['input_ids'])

'<s> [ke.pa.ya.han] | کڤايهنDefinisi : 1. perihal payah, kesukaran, ke\xadsusahan: Wahab mendengar cerita-cerita tentang ~ penduduk pekan Tembeling akibat sekatan yg diadakan oleh kerajaan; 2. men\xadderita payah (kesulitan, kekurangan): sedangkan hendak mendapat sesuap nasi ~, inikan pula hendak menziarahi tempat-tempat yg bersejarah; 3. kelelahan, keletihan: beberapa lamanya ia berdiam diri saja, terengah-engah spt orang ~. (Kamus Dewan Edisi Keempat) [ke.pa.ya.han] | کڤايهنDefinisi : hal atau keadaan payah; kesukaran; kesusahan: ~ penduduk kampung tsb berhubung dgn kawasan bandar telah mendapat perhatian drpd pihak kerajaan. (Kamus Pelajar Edisi Kedua) berpayah-payah memayahkan memperpayah kepayahan</s><s> Definisi : (ar-Razzaq) Ar Yang Maha Mem\xadberi Rezeki (satu drpd 99 nama Allah). (Kamus Dewan Edisi Keempat)</s><s> [pen.dé.kla.ma.si] | ڤنديکلاماسيDefinisi : orang yg mendeklamasikan sajak, puisi dsb: beliau juga mendapat peng\xadiktirafan sbg penulis dan ~ puisi tanah air. (Kam