In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model id whose tokenizer (and, optionally, weights) is used below.
model_name = "microsoft/phi-4"
# dataset_path = "HuggingFaceTB/smoltalk"
# dataset_name = "everyday-conversations"

# Path to the source .epub. It must be defined because the extraction cell
# compares it against ""; leave it empty to skip the EPUB -> txt extraction.
ebook_file_path = ""

tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Preprocess the book into a txt file that has role and content

In [None]:
from ebooklib import epub
from bs4 import BeautifulSoup


def read_epub(file_path):
    """Return the plain text of an EPUB file.

    Every item of the book that is an ``epub.EpubHtml`` instance is parsed
    with BeautifulSoup's "html.parser" and reduced to its text; the texts of
    all such items are joined with a single newline, in item order.

    Args:
        file_path: Path of the EPUB file, passed to ``epub.read_epub``.

    Returns:
        One string containing the text of all HTML items.
    """
    book = epub.read_epub(file_path)
    page_texts = [
        BeautifulSoup(item.content, "html.parser").get_text()
        for item in book.get_items()
        if isinstance(item, epub.EpubHtml)
    ]
    return "\n".join(page_texts)


# Extract the book to a txt file only when an EPUB path has been configured.
if ebook_file_path != "":
    text = read_epub(ebook_file_path)
    with open("../datasets/etel_adnan.txt", "w") as txt_file:
        txt_file.write(text)

In [3]:
# Load the extracted book text into memory
with open("../datasets/etel_adnan.txt") as txt_file:
    text = txt_file.read()

In [4]:
# Split the book into chapters.
# Sections are separated by six consecutive newlines. A section is kept as a
# chapter when it is longer than 1000 characters and its first character is a
# digit 1-9 (chapter numbers such as "10" are matched through their first digit).
candidates = text.split("\n\n\n\n\n\n")
final = []
for section in candidates:
    section = section.strip()
    if len(section) <= 1000:
        continue
    if section[0] not in "123456789":
        continue
    # Keep only what follows the first blank line
    # (presumably dropping the chapter number/heading).
    final.append(section.partition("\n\n")[2])

In [5]:
# The first chapter still contains the spelled-out speaker names;
# rewrite them to the "LA: " / "EA: " prefixes that the parser splits on.
first_chapter = final[0]
first_chapter = first_chapter.replace("LAURE ADLER: ", "LA: ")
first_chapter = first_chapter.replace("ETEL ADNAN: ", "EA: ")
final[0] = first_chapter

In [6]:
def parse_conversation(text):
    """Parse an interview transcript into a list of chat messages.

    The text is split on the literal prefix "LA: "; anything before the first
    occurrence is discarded. Within each resulting chunk, the part before the
    first "EA: " is the interviewer's turn and the part after it is the reply.

    Args:
        text: Transcript in which turns are introduced by "LA: " and "EA: ".

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts with role "LA" or
        "EA", in transcript order. A question without a reply yields only the
        "LA" message. Contents are stripped of surrounding whitespace.
    """
    result = []

    for chunk in text.split("LA: ")[1:]:  # element 0 precedes the first "LA: "
        # partition() splits on the first "EA: " only, so a chunk containing
        # several "EA: " markers no longer raises on unpacking; the later
        # markers stay inside the reply text.
        la_text, marker, ea_text = chunk.partition("EA: ")
        result.append({"role": "LA", "content": la_text.strip()})
        if marker:
            result.append({"role": "EA", "content": ea_text.strip()})

    return result

# One list of role/content messages per chapter
role_content_templated = [parse_conversation(chapter) for chapter in final]

In [91]:
# Preview the first two parsed messages of the first chapter
role_content_templated[0][:2]

[{'role': 'LA',
  'content': 'Etel, you are a writer, a poet, an artist; you were born in Lebanon. In which language were you brought up?'},
 {'role': 'EA',
  'content': 'I’m a bit of a particular case, especially for the time. My mother was Greek, from Smyrna (now Izmir), which is to say from Turkey, and my father was born in Damascus; he was also an officer of the Ottoman empire, so the common language between them was Turkish. We spoke Turkish in Beirut, at home, but my mother spoke to me in Greek, naturally. I grew up this way until the age of twenty, until twenty-four even, speaking Greek and Turkish, and French, because at the time the schools were strictly French speaking; Arabic wasn’t taught. I “caught”—as the saying goes—my Arabic in the street and with other children. So, I grew up in four languages.'}]

In [92]:
import json

# Persist the parsed chapters as JSON
with open("../datasets/etel_adnan.json", "w") as json_file:
    json_file.write(json.dumps(role_content_templated))

# Setup tokenizer for chat template and special tokens

In [16]:
# Load the tokenizer for model_name again
# (presumably to start the template / special-token setup from an unmodified tokenizer).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Special marker strings, one per speaker, that are added to the tokenizer
# vocabulary as special tokens further down.

role_A = "#29njkn(dkj38$%nkjn#" # Laure Adler (interviewer, role "LA")
role_B = "#foi*Ewoh!@oih(&idl#" # Etel Adnan (role "EA")

In [5]:
# Attach a Jinja chat template to the tokenizer.
#   - "system": wrapped as <|im_start|>system<|im_sep|>...<|im_end|>
#   - "LA":     wrapped with the role_A marker and immediately followed by the
#               opening header of the role_B turn
#   - "EA":     only the content and the closing <|im_end|> (its header was
#               already emitted by the preceding "LA" message)
# The adjacent literals below concatenate to a single template string.

tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if (message['role'] == 'system') %}"
    "{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}"
    "{% elif (message['role'] == 'LA') %}"
    "{{'<|im_start|>#29njkn(dkj38$%nkjn#<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>#foi*Ewoh!@oih(&idl#<|im_sep|>'}}"
    "{% elif (message['role'] == 'EA') %}"
    "{{message['content'] + '<|im_end|>'}}"
    "{% endif %}"
    "{% endfor %}"
)

In [6]:
# Vocabulary size before the role markers are added
len(tokenizer)

100352

In [7]:
# Register the two role markers and <|im_sep|> as additional special tokens,
# appended to whatever additional special tokens the tokenizer already lists.
extra_special_tokens = [role_A, role_B, "<|im_sep|>"]
result = tokenizer.add_special_tokens(
    {
        "additional_special_tokens": tokenizer.additional_special_tokens
        + extra_special_tokens
    }
)

In [8]:
# Vocabulary size after registering the special tokens
len(tokenizer)

100354

In [9]:
# Inspect the special tokens currently known to the tokenizer
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['#29njkn(dkj38$%nkjn#',
  '#foi*Ewoh!@oih(&idl#',
  '<|im_sep|>']}

In [10]:
# Each role marker should be encoded as a single token id
for role_marker in (role_A, role_B):
    print(tokenizer(role_marker))

{'input_ids': [100352], 'attention_mask': [1]}
{'input_ids': [100353], 'attention_mask': [1]}


# Apply tokenizer and split at max_seq_length

We need to split at the utterance level: each chunk must end where the interviewer finishes talking, i.e. right before the interviewee's reply begins. Every chunk must also have the same token length. 

So we first tokenize each chapter, take a window of at most `max_seq_length` tokens, and cut it at the last utterance boundary inside that window. We then add padding tokens on the left so that every chunk is exactly `max_seq_length` tokens long. 

In [11]:
import json

# Read the parsed chapters back from disk
with open("../datasets/etel_adnan.json", "r") as json_file:
    data = json.load(json_file)

In [12]:
# Try apply_chat_template on a minimal two-turn exchange

sample_messages = [
    {"role": "LA", "content": "hi"},
    {"role": "EA", "content": "hi"},
]
tokens = tokenizer.apply_chat_template(sample_messages, tokenize=True)

# Print every token id next to the text it decodes to
for token in tokens:
    print(token, ":", tokenizer.decode(token))

100264 : <|im_start|>
100352 : #29njkn(dkj38$%nkjn#
100266 : <|im_sep|>
6151 : hi
100265 : <|im_end|>
100264 : <|im_start|>
100353 : #foi*Ewoh!@oih(&idl#
100266 : <|im_sep|>
6151 : hi
100265 : <|im_end|>


In [13]:
# Run every chapter through the chat template: one token-id list per chapter

chat_templated_tokens = []
for chapter in data:
    chat_templated_tokens.append(
        tokenizer.apply_chat_template(chapter, tokenize=True)
    )

In [28]:
# Token count of each templated chapter
for n_tokens in map(len, chat_templated_tokens):
    print(n_tokens)

6498
5053
4008
4341
3758
1365
1687
2558
2510
4972
3076


In [17]:
# Decode the first 256 tokens of the first chapter to eyeball the template output
print(tokenizer.decode(chat_templated_tokens[0][:256]))

<|im_start|>#29njkn(dkj38$%nkjn#<|im_sep|>Etel, you are a writer, a poet, an artist; you were born in Lebanon. In which language were you brought up?<|im_end|><|im_start|>#foi*Ewoh!@oih(&idl#<|im_sep|>I’m a bit of a particular case, especially for the time. My mother was Greek, from Smyrna (now Izmir), which is to say from Turkey, and my father was born in Damascus; he was also an officer of the Ottoman empire, so the common language between them was Turkish. We spoke Turkish in Beirut, at home, but my mother spoke to me in Greek, naturally. I grew up this way until the age of twenty, until twenty-four even, speaking Greek and Turkish, and French, because at the time the schools were strictly French speaking; Arabic wasn’t taught. I “caught”—as the saying goes—my Arabic in the street and with other children. So, I grew up in four languages.<|im_end|><|im_start|>#29njkn(dkj38$%nkjn#<|im_sep|>At what point did you realize you were an artist?<|im_end|><|im_start|>#foi*Ewoh!@oih(&idl#<|im_

In [24]:
# Show which token string the tokenizer uses for padding
tokenizer.pad_token

'<|endoftext|>'

In [33]:
# Configuration for cutting chapters into fixed-length, padded chunks that
# are split at utterance boundaries.

max_seq_length = 256
# Token ids that open an Etel Adnan turn: <|im_start|> (100264) followed by
# the role_B marker "#foi*Ewoh!@oih(&idl#" (100353). Cutting right before this
# pair ends a chunk at the end of the interviewer's utterance.
split_token_sequence = [
    100264,
    100353,
]
# Padding has to be a token *id*. tokenizer.pad_token is the string
# '<|endoftext|>', which would put str values into input_ids next to the
# integer ids.
pad_sequence = tokenizer.pad_token_id

def find_last_sequence(lst, sequence):
    """Return the start index of the last occurrence of ``sequence`` in ``lst``.

    Candidate positions are tried from the right-most possible start down to
    index 0, comparing the slice of ``lst`` at that position with ``sequence``.

    Args:
        lst: List to search in.
        sequence: List to look for as a contiguous run.

    Returns:
        The index where the last match starts, or -1 when there is none.
    """
    width = len(sequence)
    start = len(lst) - width
    while start >= 0:
        if lst[start : start + width] == sequence:
            return start
        start -= 1
    return -1


def _find_next_sequence(lst, sequence, start):
    """Return the first index >= start where sequence occurs in lst, or -1."""
    for idx in range(start, len(lst) - len(sequence) + 1):
        if lst[idx : idx + len(sequence)] == sequence:
            return idx
    return -1


def _left_pad_chunk(tokens, length, pad):
    """Left-pad tokens to length and build the matching attention mask."""
    n_pad = length - len(tokens)
    return {
        "input_ids": [pad] * n_pad + tokens,
        # 0.0 marks padding, 1.0 marks real tokens
        "attention_mask": [0.0] * n_pad + [1.0] * len(tokens),
    }


def _split_chapter(chapter, max_len, split_sequence, pad):
    """Cut one tokenized chapter into left-padded chunks of exactly max_len.

    While at least max_len tokens remain, the chunk is cut at the last
    occurrence of split_sequence inside the first max_len tokens, and the next
    chunk starts at that occurrence. If the window holds no usable cut point
    (a single utterance fills it), the chunk is truncated to max_len tokens
    and processing resumes at the next occurrence of split_sequence; the rest
    of the over-long utterance is dropped. The remainder shorter than max_len
    becomes the last chunk.

    Returns:
        A list of {"input_ids": [...], "attention_mask": [...]} dicts.
    """
    chunks = []
    remaining = list(chapter)
    while len(remaining) >= max_len:
        last_index = find_last_sequence(remaining[:max_len], split_sequence)
        if last_index > 0:
            cut = next_start = last_index
        else:
            # last_index is 0 (the only match is the chunk's own header) or -1
            # (no match): cutting there would never advance, so truncate.
            next_start = _find_next_sequence(remaining, split_sequence, 1)
            cut = max_len if next_start == -1 else min(max_len, next_start)
        chunks.append(_left_pad_chunk(remaining[:cut], max_len, pad))
        if next_start == -1:
            # Nothing but the tail of the truncated utterance is left.
            return chunks
        remaining = remaining[next_start:]
    chunks.append(_left_pad_chunk(remaining, max_len, pad))
    return chunks


split_padded_tokens = []
for chapter in chat_templated_tokens:
    split_padded_tokens.extend(
        _split_chapter(chapter, max_seq_length, split_token_sequence, pad_sequence)
    )

In [34]:
# Check that chunks end on a complete utterance: print the last 50 decoded
# characters of the first three chunks and of the final chunk

for chunk_index in (0, 1, 2, -1):
    print(tokenizer.decode(split_padded_tokens[chunk_index]["input_ids"])[-50:])

ting, which is one of your recent works?<|im_end|>
hilosophy? And why could it be painting?<|im_end|>
nly the Vietnam war but also in Lebanon.<|im_end|>
ve tree on the balcony. It’s a good day.<|im_end|>


In [35]:
# Total number of chunks across all chapters
len(split_padded_tokens)

26

In [36]:
# All chunks should share one length, so this set should have a single element
print({len(chunk["input_ids"]) for chunk in split_padded_tokens})

{2048}


In [37]:
# Build the column-oriented dict that is turned into a Dataset below.
# Labels reuse the input ids; the float attention mask is converted to booleans.
# NOTE(review): padded positions are not masked out of the labels here -
# confirm that the training code handles this.
data_dict = {
    "input_ids": [chunk["input_ids"] for chunk in split_padded_tokens],
    "attention_mask": [
        [bool(flag) for flag in chunk["attention_mask"]]
        for chunk in split_padded_tokens
    ],
    "labels": [chunk["input_ids"] for chunk in split_padded_tokens],
}

In [38]:
from datasets import load_dataset, Dataset

# Build a datasets.Dataset with the columns input_ids / attention_mask / labels
ds = Dataset.from_dict(data_dict)

In [39]:
import numpy as np
# Shape of every dataset column
for column in ("input_ids", "attention_mask", "labels"):
    print(np.array(ds[column]).shape)

(26, 2048)
(26, 2048)
(26, 2048)


In [40]:
# Check the Python type of a single attention-mask entry
type(ds["attention_mask"][0][0])

bool

In [41]:
# Write the dataset to disk
ds.save_to_disk("../datasets/etel_adnan_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/26 [00:00<?, ? examples/s]

In [42]:
from datasets import load_from_disk

# Reload the saved dataset from disk and display its summary
ds = load_from_disk("../datasets/etel_adnan_dataset")
ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26
})

### To upload to Modal volume

modal volume rm -r lm-as-memory dataset                            
modal volume put lm-as-memory ./datasets/etel_adnan_dataset dataset

In [41]:
import numpy as np

# Shape of every column of the reloaded dataset
for column in ("input_ids", "attention_mask", "labels"):
    print(np.array(ds[column]).shape)

(27, 2048)
(27, 2048)
(27, 2048)


# Add new tokens to the embedding matrix

In [None]:
# model.resize_token_embeddings(len(tokenizer))