In [57]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint used for both the tokenizer and the causal LM weights.
model_name = "HuggingFaceTB/SmolLM2-135M"
# Hub dataset identifiers; not referenced again in the visible part of this notebook.
dataset_path = "HuggingFaceTB/smoltalk"
dataset_name = "everyday-conversations"
# Path to the source EPUB. While it is the empty string, the extraction step
# guarded by `ebook_file_path != ""` is skipped.
ebook_file_path = ""

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Preprocess the book into a txt file that has role and content

In [28]:
from ebooklib import epub
from bs4 import BeautifulSoup


def read_epub(file_path):
    """Extract the plain text of an EPUB file.

    Every item of the book that is an ``epub.EpubHtml`` document is parsed
    with BeautifulSoup's ``html.parser`` and reduced to its text; all other
    items are ignored.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        The text of all HTML documents, in the order ``get_items()`` yields
        them, joined with a single newline.
    """
    book = epub.read_epub(file_path)
    document_texts = [
        BeautifulSoup(document.content, "html.parser").get_text()
        for document in book.get_items()
        if isinstance(document, epub.EpubHtml)
    ]
    return "\n".join(document_texts)


# Only (re-)extract the book when an EPUB path is configured. With an empty
# path this cell does nothing, so ../datasets/etel_adnan.txt must already
# exist for the cells that read it.
if ebook_file_path != "":
    text = read_epub(ebook_file_path)
    with open("../datasets/etel_adnan.txt", "w") as f:
        f.write(text)

In [29]:
# Load the extracted book text from the txt file into `text`
# (nothing is printed here).
with open("../datasets/etel_adnan.txt", "r") as f:
    text = f.read()

In [30]:
# split the book into chapters
# Pieces of the extracted text are separated by a run of six newlines.
candidates = text.split("\n\n\n\n\n\n")
final = []
for candidate in candidates:
    candidate = candidate.strip()
    # Keep only long pieces (> 1000 characters) whose first character is a
    # digit 1-9. candidate[0] is a single character, so multi-character
    # entries such as "10" or "11" could never match; a piece starting with
    # "10" or "11" is already accepted through its leading "1".
    if len(candidate) > 1000 and candidate[0] in "123456789":
        # Drop everything up to the first blank line (presumably the chapter
        # heading) and keep the remainder unchanged.
        final.append(candidate.partition("\n\n")[2])

In [31]:
# The first chapter spells the speaker names out in full where the rest of the
# text uses "LA: " / "EA: "; abbreviate them so every turn carries the same tag.
final[0] = final[0].replace("LAURE ADLER: ", "LA: ").replace("ETEL ADNAN: ", "EA: ")

In [32]:
def parse_conversation(text):
    """Turn a transcript tagged with ``LA: `` / ``EA: `` into chat messages.

    The text is cut at every ``"LA: "`` tag; whatever precedes the first tag
    is discarded. Inside each piece, the text before the first ``"EA: "`` tag
    is the LA turn and everything after that tag is the EA reply.

    Args:
        text: Transcript of one chapter as a single string.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts with role ``"LA"``
        or ``"EA"``, in speaking order and with surrounding whitespace
        stripped. An LA turn that has no reply produces a single LA message.
    """
    messages = []
    # Element 0 of the split is the text before the first "LA: " tag.
    for chunk in text.split("LA: ")[1:]:
        # partition() cuts at the first "EA: " only. A chunk that contains the
        # tag more than once used to break the two-value unpacking of
        # split(); now any later tag simply stays inside the EA content.
        la_text, tag, ea_text = chunk.partition("EA: ")
        # Always use the role/content schema, including for an unanswered
        # LA turn, so every message can be consumed by the chat template.
        messages.append({"role": "LA", "content": la_text.strip()})
        if tag:
            messages.append({"role": "EA", "content": ea_text.strip()})

    return messages

# One list of role/content messages per chapter, in chapter order.
role_content_templated = [parse_conversation(chapter) for chapter in final]

In [33]:
role_content_templated[0][:2]

[{'role': 'LA',
  'content': 'Etel, you are a writer, a poet, an artist; you were born in Lebanon. In which language were you brought up?'},
 {'role': 'EA',
  'content': 'I’m a bit of a particular case, especially for the time. My mother was Greek, from Smyrna (now Izmir), which is to say from Turkey, and my father was born in Damascus; he was also an officer of the Ottoman empire, so the common language between them was Turkish. We spoke Turkish in Beirut, at home, but my mother spoke to me in Greek, naturally. I grew up this way until the age of twenty, until twenty-four even, speaking Greek and Turkish, and French, because at the time the schools were strictly French speaking; Arabic wasn’t taught. I “caught”—as the saying goes—my Arabic in the street and with other children. So, I grew up in four languages.'}]

In [38]:
import json

# Persist the per-chapter message lists as a single JSON document.
with open("../datasets/etel_adnan.json", "w") as f:
    f.write(json.dumps(role_content_templated))

# Set up the tokenizer: chat template and special tokens

In [39]:
# Load the tokenizer again from the base checkpoint; the chat template and the
# extra special tokens are attached to this instance in the following cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [58]:
# Special strings standing in for the two speakers; they are added to the
# tokenizer vocabulary as special tokens. The same literals are written out
# inside tokenizer.chat_template, so the two places must be kept in sync.

role_A = "#29njkn(dkj38$%nkjn#" # Laure Adler (messages with role "LA")
role_B = "#foi*Ewoh!@oih(&idl#" # Etel Adnan (messages with role "EA")

In [41]:
# Add chat template to tokenizer
#
# The Jinja template renders each message by role:
#   * 'system': <|im_start|>system<|im_sep|>{content}<|im_end|>
#   * 'LA':     <|im_start|>{LA marker}<|im_sep|>{content}<|im_end|>
#               immediately followed by the opening of the reply,
#               <|im_start|>{EA marker}<|im_sep|>
#   * 'EA':     {content}<|im_end|>  (its header was already emitted by the
#               preceding 'LA' message)
# Messages with any other role produce no output. The two marker strings are
# hard-coded here and duplicate the values of role_A / role_B.

tokenizer.chat_template = "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'LA') %}{{'<|im_start|>#29njkn(dkj38$%nkjn#<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>#foi*Ewoh!@oih(&idl#<|im_sep|>'}}{% elif (message['role'] == 'EA') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}"

In [42]:
len(tokenizer)

49152

In [59]:
# Register both role markers and the separator as additional special tokens,
# keeping the ones the tokenizer already lists.
extended_special_tokens = tokenizer.additional_special_tokens + [
    role_A,
    role_B,
    "<|im_sep|>",
]
result = tokenizer.add_special_tokens(
    {"additional_special_tokens": extended_special_tokens}
)

In [60]:
len(tokenizer)

49155

In [64]:
# Write the customised tokenizer files into the current directory.
tokenizer.save_pretrained("./")

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.json',
 './merges.txt',
 './added_tokens.json',
 './tokenizer.json')

In [65]:
# Load the tokenizer back from the files saved in the current directory.
tokenizer = AutoTokenizer.from_pretrained("./")

In [67]:
len(tokenizer)

49155

In [45]:
# Sanity check: each role marker should be encoded as exactly one token id.
print(tokenizer(role_A))
print(tokenizer(role_B))

{'input_ids': [49152], 'attention_mask': [1]}
{'input_ids': [49153], 'attention_mask': [1]}


# Apply tokenizer and split at max_seq_length

In [46]:
import json

# Read the per-chapter message lists back from disk.
with open("../datasets/etel_adnan.json", "r") as f:
    data = json.load(f)

In [47]:
# Smoke test for apply_chat_template: render one LA/EA exchange and print
# every token id next to its decoded text.

sample_chat = [
    {"role": "LA", "content": "hi"},
    {"role": "EA", "content": "hi"},
]
tokens = tokenizer.apply_chat_template(sample_chat, tokenize=True)

for token_id in tokens:
    print(token_id, ":", tokenizer.decode(token_id))

1 : <|im_start|>
49152 : #29njkn(dkj38$%nkjn#
49154 : <|im_sep|>
6004 : hi
2 : <|im_end|>
1 : <|im_start|>
49153 : #foi*Ewoh!@oih(&idl#
49154 : <|im_sep|>
6004 : hi
2 : <|im_end|>


In [48]:
# Render and tokenize every chapter with the chat template
# (one list of token ids per chapter).

chat_templated_tokens = []
for chapter_messages in data:
    chapter_ids = tokenizer.apply_chat_template(chapter_messages, tokenize=True)
    chat_templated_tokens.append(chapter_ids)

In [49]:
# Upper bound (exclusive) on the number of tokens in one training piece.
max_seq_length = 4096
# Token ids that open an LA turn: <|im_start|> followed by the LA role marker.
# They are looked up in the tokenizer instead of being hard-coded, because the
# id of an added token depends on the order in which tokens were registered.
split_token_sequence = [
    tokenizer.convert_tokens_to_ids("<|im_start|>"),
    tokenizer.convert_tokens_to_ids(role_A),
]

def find_last_sequence(lst, sequence):
    """Return the start index of the last occurrence of ``sequence`` in ``lst``.

    The search walks backwards from the right-most position at which
    ``sequence`` could still fit and compares slices of ``lst`` to it.

    Args:
        lst: List to search in.
        sequence: List of consecutive items to look for.

    Returns:
        Index of the first item of the right-most match, or -1 when there is
        no match (including when ``lst`` is shorter than ``sequence``).
    """
    width = len(sequence)
    start = len(lst) - width
    while start >= 0:
        if lst[start:start + width] == sequence:
            return start
        start -= 1
    return -1

# Cut every chapter that does not fit into max_seq_length into pieces that
# each begin at the start of an LA turn (split_token_sequence), so no piece
# starts or ends in the middle of an utterance. Shorter chapters are kept whole.
truncated_tokens = []
for i, chapter in enumerate(chat_templated_tokens):
    # Aim for pieces of similar size instead of one full window plus a short
    # tail: with n = len // max_seq_length + 1 pieces, each gets about len / n.
    split_len = len(chapter) // ((len(chapter) // max_seq_length) + 1)

    while len(chapter) >= max_seq_length:
        last_index = find_last_sequence(chapter[:split_len], split_token_sequence)
        if last_index <= 0:
            # The balanced window holds no turn start other than the one at
            # position 0. Retry with the widest window whose cut still yields
            # a piece shorter than max_seq_length.
            last_index = find_last_sequence(
                chapter[:max_seq_length], split_token_sequence
            )
        if last_index <= 0:
            # Cutting at 0 would emit an empty piece and never shrink
            # `chapter` (endless loop); -1 would silently chop off the last
            # token. Fail loudly instead.
            raise ValueError(
                f"Chapter {i + 1}: no turn boundary within the first "
                f"{max_seq_length} tokens; cannot split without cutting an utterance."
            )

        split_at_utterance_level = chapter[:last_index]
        truncated_tokens.append(split_at_utterance_level)
        print("Ch.",i+1, " - len:", len(split_at_utterance_level))

        chapter = chapter[last_index:]

    # Whatever is left (or the whole chapter, if it was short enough) fits.
    truncated_tokens.append(chapter)
    print("Ch.",i+1, " - len:", len(chapter))
    print()

Ch. 1  - len: 3276
Ch. 1  - len: 3465

Ch. 2  - len: 2566
Ch. 2  - len: 2725

Ch. 3  - len: 1774
Ch. 3  - len: 2391

Ch. 4  - len: 2249
Ch. 4  - len: 2293

Ch. 5  - len: 3890

Ch. 6  - len: 1397

Ch. 7  - len: 1762

Ch. 8  - len: 2670

Ch. 9  - len: 2589

Ch. 10  - len: 2555
Ch. 10  - len: 2609

Ch. 11  - len: 3200



In [50]:
# Check that the pieces were split at the end of one of Etel's utterances:
# show the last 50 decoded characters of the first three pieces and the last one.

for piece_index in (0, 1, 2, -1):
    print(tokenizer.decode(truncated_tokens[piece_index])[-50:])

 knows this, and Bachelard just as much.<|im_end|>
c decisions—chance collaborates with us.<|im_end|>
away: that’s what it is to put in order.<|im_end|>
ve tree on the balcony. It’s a good day.<|im_end|>


In [56]:
len(truncated_tokens)

16

In [68]:
# save to json
import json

# "labels" holds the very same token lists as "input_ids".
token_payload = {
    "input_ids": truncated_tokens,
    "labels": truncated_tokens,
}
with open("../datasets/etel_adnan_tokens_with_labels.json", "w") as f:
    json.dump(token_payload, f)

In [52]:
# Decode every piece back to text and store the list under the key "text";
# note that the content written to etel_adnan_templated.txt is JSON.
with open("../datasets/etel_adnan_templated.txt", "w") as f:
    result = {"text": [tokenizer.decode(piece) for piece in truncated_tokens]}
    json.dump(result, f)

### Data collator

We need to define our own collator because the data has already been tokenized.

In [23]:
import json

# Read the per-chapter message lists (role/content dicts) from disk.
with open("../datasets/etel_adnan.json", "r") as f:
    data = json.load(f)

In [26]:
from transformers import (
    DefaultDataCollator,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
)

# Alternative collators that were tried and left disabled:
# data_collator = DefaultDataCollator()
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Collator currently in use; it is built around the shared `tokenizer` object.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [36]:
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [24]:
from datasets import Dataset

# Wrap `data` in a single "text" column, one row per chapter.
# NOTE(review): the ds[0] output shows role/content message dicts, i.e. this is
# not the pre-tokenized input_ids/labels file written earlier. Confirm which
# representation the trainer is meant to receive.
ds = Dataset.from_dict({"text": data})

In [25]:
ds[0]

{'text': [{'content': 'Etel, you are a writer, a poet, an artist; you were born in Lebanon. In which language were you brought up?',
   'role': 'LA'},
  {'content': 'I’m a bit of a particular case, especially for the time. My mother was Greek, from Smyrna (now Izmir), which is to say from Turkey, and my father was born in Damascus; he was also an officer of the Ottoman empire, so the common language between them was Turkish. We spoke Turkish in Beirut, at home, but my mother spoke to me in Greek, naturally. I grew up this way until the age of twenty, until twenty-four even, speaking Greek and Turkish, and French, because at the time the schools were strictly French speaking; Arabic wasn’t taught. I “caught”—as the saying goes—my Arabic in the street and with other children. So, I grew up in four languages.',
   'role': 'EA'},
  {'content': 'At what point did you realize you were an artist?',
   'role': 'LA'},
  {'content': 'Much later. I was already thirty years old. I was in America. 

In [16]:
from trl import SFTTrainer
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    DefaultDataCollator,
)
from trl import SFTTrainer, setup_chat_format, SFTConfig
from peft import LoraConfig
from torch.utils.data import DataLoader

# Configure LoRA parameters
rank_dimension = 6
lora_alpha = 8
lora_dropout = 0.05
run_name = "etel_adnan"

model = AutoModelForCausalLM.from_pretrained(model_name)
# The tokenizer was extended with extra special tokens (role markers and
# <|im_sep|>), while the freshly loaded checkpoint still has the base
# vocabulary size. Grow the embedding matrix before training so the new token
# ids are valid indices.
# NOTE(review): the added rows start untrained; check that the LoRA setup
# below actually updates them (e.g. via PEFT's `modules_to_save`) - TODO confirm.
model.resize_token_embeddings(len(tokenizer))

peft_config = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor (often about 2x the rank; here 8 for rank 6)
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA: "none" leaves all bias terms untrained
    target_modules="all-linear",  # Which modules to apply LoRA to
    task_type="CAUSAL_LM",  # Task type for model architecture
)

# Training configuration
# Hyperparameters based on QLoRA paper recommendations
args = SFTConfig(
    output_dir=run_name,  # Directory to save model checkpoints
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size per GPU
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch
    gradient_checkpointing=True,  # Trade compute for memory savings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold
    warmup_ratio=0.03,  # Portion of steps for warmup
    lr_scheduler_type="constant",  # Keep learning rate constant after warmup
    logging_steps=10,  # Log metrics every N steps
    save_strategy="epoch",  # Save checkpoint every epoch
    bf16=True,  # Use bfloat16 precision
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging
    max_seq_length=4096,  # Same limit that was used when splitting the chapters
    packing=False, # Don't concatenate multiple sequences to meet max_seq_length
)

# Create SFTTrainer with LoRA configuration
trainer = SFTTrainer(
    model=model,
    args=args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=ds,
    processing_class=tokenizer,
    peft_config=peft_config,
)

In [19]:
# Scratch probe of a private SFTTrainer method.
# NOTE(review): as written this call raises the TypeError recorded in this
# cell's output, because _prepare_dataset() is invoked without its required
# positional arguments.
# trainer._prepare_non_packed_dataloader(ds)
trainer._prepare_dataset()

TypeError: SFTTrainer._prepare_dataset() missing 8 required positional arguments: 'dataset', 'processing_class', 'packing', 'dataset_text_field', 'max_seq_length', 'formatting_func', 'num_of_sequences', and 'chars_per_token'

In [39]:
# Feed the first entry of `data` to the collator as "input_ids".
# NOTE(review): the recorded output shows integer token ids, so `data` held
# token lists when this cell ran - presumably not the role/content messages
# loaded from etel_adnan.json; confirm which `data` is intended.
data_collator({"input_ids": data[0]})

{'input_ids': tensor([    1, 49152, 49154,  ...,  1083,    30,     2]), 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1])}

In [187]:
# Tokenize every chunk of `final_data` into PyTorch tensors.
# NOTE(review): `final_data` is not defined anywhere in the visible notebook -
# presumably left over from a deleted cell; confirm before relying on this.
tokenized_data = [tokenizer(chunk, return_tensors="pt") for chunk in final_data]

In [190]:
import torch

# Collate the tokenized chunks into one batch.
# NOTE(review): the recorded TypeError mentions a `batch_size` keyword that
# this call does not pass, so the output looks stale relative to the code.
collated_data = data_collator(tokenized_data, )

TypeError: DataCollatorMixin.__call__() got an unexpected keyword argument 'batch_size'

{}

# Add new tokens to the embedding matrix

In [None]:
# Grow the model's embedding matrix to the tokenizer's current vocabulary size
# (the tokenizer was extended with extra special tokens).
# NOTE(review): this cell sits after the cell that builds the trainer and has
# no execution count; the resize presumably needs to happen before training.
model.resize_token_embeddings(len(tokenizer))