In [2]:
import json
import time
from pathlib import Path

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import numpy as np
import utils as lora_utils
from mlx.utils import tree_flatten
from models import LoRALinear

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = "meta-llama/Meta-Llama-3-8B-Instruct"
data_folder = "../../veri/doktorsitesi/my-data-text"
lora_layers = 4
batch_size = 8
iters = 100
steps_per_report = 2
steps_per_eval = 20
val_batches = 8
learning_rate = 1e-4
seed = 0
save_every = 10

In [4]:
adapter_file = f"{time.strftime('%Y%m%d-%H%M%S')}-adapters-5000.npz"
adapter_file

'20240612-192720-adapters-5000.npz'

In [5]:
print("Loading pretrained model")
model, tokenizer, _ = lora_utils.load(model)

Loading pretrained model


Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 75800.67it/s]


ValueError: Received parameters not in model: model.layers.9.self_attn.k_proj.bias model.layers.20.self_attn.k_proj.bias model.layers.1.self_attn.k_proj.bias model.layers.4.self_attn.k_proj.bias model.layers.10.self_attn.k_proj.bias model.layers.18.self_attn.k_proj.bias model.layers.12.self_attn.k_proj.bias model.layers.14.self_attn.k_proj.bias model.layers.9.self_attn.v_proj.bias model.layers.16.self_attn.v_proj.bias model.layers.20.self_attn.v_proj.bias model.layers.21.self_attn.v_proj.bias model.layers.21.self_attn.q_proj.bias model.layers.19.self_attn.v_proj.bias model.layers.23.self_attn.k_proj.bias model.layers.4.self_attn.q_proj.bias model.layers.7.self_attn.k_proj.bias model.layers.0.self_attn.q_proj.bias model.layers.18.self_attn.v_proj.bias model.layers.23.self_attn.q_proj.bias model.layers.20.self_attn.q_proj.bias model.layers.5.self_attn.v_proj.bias model.layers.18.self_attn.q_proj.bias model.layers.6.self_attn.k_proj.bias model.layers.7.self_attn.v_proj.bias model.layers.5.self_attn.k_proj.bias model.layers.22.self_attn.q_proj.bias model.layers.11.self_attn.q_proj.bias model.layers.3.self_attn.q_proj.bias model.layers.23.self_attn.v_proj.bias model.layers.6.self_attn.v_proj.bias model.layers.12.self_attn.v_proj.bias model.layers.17.self_attn.v_proj.bias model.layers.11.self_attn.k_proj.bias model.layers.0.self_attn.v_proj.bias model.layers.6.self_attn.q_proj.bias model.layers.2.self_attn.v_proj.bias model.layers.17.self_attn.k_proj.bias model.layers.19.self_attn.k_proj.bias model.layers.12.self_attn.q_proj.bias model.layers.21.self_attn.k_proj.bias model.layers.14.self_attn.q_proj.bias model.layers.19.self_attn.q_proj.bias model.layers.8.self_attn.v_proj.bias model.layers.10.self_attn.v_proj.bias model.layers.16.self_attn.q_proj.bias model.layers.10.self_attn.q_proj.bias model.layers.0.self_attn.k_proj.bias model.layers.2.self_attn.q_proj.bias model.layers.7.self_attn.q_proj.bias model.layers.2.self_attn.k_proj.bias model.layers.13.self_attn.v_proj.bias model.layers.9.self_attn.q_proj.bias model.layers.13.self_attn.k_proj.bias model.layers.22.self_attn.k_proj.bias model.layers.22.self_attn.v_proj.bias model.layers.11.self_attn.v_proj.bias model.layers.13.self_attn.q_proj.bias model.layers.8.self_attn.q_proj.bias model.layers.14.self_attn.v_proj.bias model.layers.15.self_attn.k_proj.bias model.layers.3.self_attn.k_proj.bias model.layers.5.self_attn.q_proj.bias model.layers.17.self_attn.q_proj.bias model.layers.15.self_attn.v_proj.bias model.layers.1.self_attn.v_proj.bias model.layers.1.self_attn.q_proj.bias model.layers.15.self_attn.q_proj.bias model.layers.16.self_attn.k_proj.bias model.layers.4.self_attn.v_proj.bias model.layers.8.self_attn.k_proj.bias model.layers.3.self_attn.v_proj.bias.

In [14]:
example_prompt = "Hasta:Ankilozan Spondilit ve omurilik ve göğüs kafesi kemikleri birbirine girdi ve kaynamış biz bu hastalığın tedavisi var mı? Lütfen türkçe cevap ver, tercüme etme"

In [15]:
# Freeze all layers other than LORA linears
model.freeze()
for l in model.model.layers[len(model.model.layers) - lora_layers :]:
    l.self_attn.q_proj = LoRALinear.from_linear(l.self_attn.q_proj)
    l.self_attn.v_proj = LoRALinear.from_linear(l.self_attn.v_proj)
    if hasattr(l, "block_sparse_moe"):
        l.block_sparse_moe.gate = LoRALinear.from_linear(l.block_sparse_moe.gate)

p = sum(v.size for _, v in tree_flatten(model.parameters())) / 10**6
print(f"Total parameters {p:.3f}M")
p = sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10**6
print(f"Trainable parameters {p:.3f}M")

Total parameters 8030.687M
Trainable parameters 0.426M


In [16]:
class Dataset:
    """
    Light-weight wrapper to hold lines from a jsonl file
    """

    def __init__(self, path: Path, key: str = "text"):
        if not path.exists():
            self._data = None
        else:
            with open(path, "r") as fid:
                self._data = [json.loads(l) for l in fid]
        self._key = key

    def __getitem__(self, idx: int):
        return self._data[idx][self._key]

    def __len__(self):
        return len(self._data)

In [17]:
def load(data_folder: str, training: bool = False, validation: bool = False, testing: bool = False):
    def load_and_check(name):
        dataset_path = Path(data_folder) / f"{name}.jsonl"
        try:
            return Dataset(dataset_path)
        except Exception as e:
            print(f"Unable to build dataset {dataset_path} ({e})")
            raise

    names = ("train-5000", "valid-5000", "test-5000")
    train, valid, test = (load_and_check(n) for n in names)

    if training and len(train) == 0:
        raise ValueError(
            "Training set not found or empty. Must provide training set for fine-tuning."
        )
    if validation and len(valid) == 0:
        raise ValueError(
            "Validation set not found or empty. Must provide validation set for fine-tuning."
        )
    if testing and len(test) == 0:
        raise ValueError(
            "Test set not found or empty. Must provide test set for evaluation."
        )
    return train, valid, test

In [18]:
print("Loading datasets")
train_set, valid_set, test_set = load(data_folder, training=True)
print(f"Training set: {len(train_set)}, Validation set: {len(valid_set)}, Test set: {len(test_set)}")

Loading datasets
Training set: 4000, Validation set: 500, Test set: 500


In [19]:
def iterate_batches(dset, tokenizer, batch_size, train=False):
    # Shuffle indices
    while True:
        indices = np.arange(len(dset))
        if train:
            indices = np.random.permutation(indices)

        # Collect batches from dataset
        for i in range(0, len(indices) - batch_size + 1, batch_size):
            # Encode batch
            batch = [tokenizer.encode(dset[indices[i + j]]) for j in range(batch_size)]
            lengths = [len(x) for x in batch]

            # Check if any sequence is longer than 2048 tokens
            if max(lengths) > 2048:
                print(
                    "[WARNING] Some sequences are longer than 2048 tokens. "
                    "Consider pre-splitting your data to save memory."
                )

            # Pad to the max length
            batch_arr = np.zeros((batch_size, max(lengths)), np.int32)

            for j in range(batch_size):
                batch_arr[j, : lengths[j]] = batch[j]
            batch = mx.array(batch_arr)
            yield batch[:, :-1], batch[:, 1:], mx.array(lengths)

        if not train:
            break


In [20]:
def evaluate(model, dataset, loss, tokenizer, batch_size, num_batches):
    all_losses = []
    ntokens = 0
    for it, batch in zip(
        range(num_batches),
        iterate_batches(dataset, tokenizer, batch_size),
    ):
        losses, toks = loss(model, *batch)
        all_losses.append((losses * toks).item())
        ntokens += toks.item()

    return np.sum(all_losses) / ntokens

In [21]:
def train(model, train_set, val_set, optimizer, loss, tokenizer):
    # Create value and grad function for loss
    loss_value_and_grad = nn.value_and_grad(model, loss)

    losses = []
    n_tokens = 0

    # Main training loop
    start = time.perf_counter()
    for it, batch in zip(
        range(iters),
        iterate_batches(train_set, tokenizer, batch_size, train=True),
    ):
        # Forward and backward pass
        (lvalue, toks), grad = loss_value_and_grad(model, *batch)

        # Model update
        optimizer.update(model, grad)
        mx.eval(model.parameters(), optimizer.state, lvalue)

        # Record loss
        losses.append(lvalue.item())
        n_tokens += toks.item()

        # Report training loss if needed
        if (it + 1) % steps_per_report == 0:
            train_loss = np.mean(losses)

            stop = time.perf_counter()
            print(
                f"Iter {it + 1}: Train loss {train_loss:.3f}, "
                f"It/sec {steps_per_report / (stop - start):.3f}, "
                f"Tokens/sec {float(n_tokens) / (stop - start):.3f}"
            )
            losses = []
            n_tokens = 0
            start = time.perf_counter()

        # Report validation loss if needed
        if it == 0 or (it + 1) % steps_per_eval == 0:
            stop = time.perf_counter()
            val_loss = evaluate(
                model, val_set, loss, tokenizer, batch_size, val_batches
            )
            print(
                f"Iter {it + 1}: "
                f"Val loss {val_loss:.3f}, "
                f"Val took {(time.perf_counter() - stop):.3f}s"
            )

            start = time.perf_counter()

        # Save adapter weights if needed
        if (it + 1) % save_every == 0:
            mx.savez(
                adapter_file, **dict(tree_flatten(model.trainable_parameters()))
            )
            print(f"Iter {it + 1}: Saved adapter weights to {adapter_file}.")


In [1]:
def loss(model, inputs, targets, lengths):
    # Run model on inputs
    logits, _ = model(inputs)
    logits = logits.astype(mx.float32)

    # Mask padding tokens
    length_mask = mx.arange(inputs.shape[1])[None, :] < lengths[:, None]

    # Calculate the loss
    ce = nn.losses.cross_entropy(logits, targets) * length_mask
    ntoks = length_mask.sum()
    ce = ce.sum() / ntoks
    return ce, ntoks

In [23]:
print("Training")

np.random.seed(seed)

opt = optim.Adam(learning_rate=learning_rate)

# Train model
train(model, train_set, valid_set, opt, loss, tokenizer)

# Save adapter weights
mx.savez(adapter_file, **dict(tree_flatten(model.trainable_parameters())))

Training
Iter 1: Val loss 3.522, Val took 60.112s
Iter 2: Train loss 3.581, It/sec 0.295, Tokens/sec 568.993
Iter 4: Train loss 3.418, It/sec 0.087, Tokens/sec 198.151
Iter 6: Train loss 3.298, It/sec 0.170, Tokens/sec 284.634
Iter 8: Train loss 3.218, It/sec 0.164, Tokens/sec 293.901
Iter 10: Train loss 3.264, It/sec 0.203, Tokens/sec 323.546
Iter 10: Saved adapter weights to 20240612-190857-adapters-5000.npz.


In [None]:
def generate(model, prompt, tokenizer, temp, max_tokens):
    print(prompt, end="", flush=True)

    prompt = mx.array(tokenizer.encode(prompt))

    tokens = []
    skip = 0
    for token, n in zip(
        lora_utils.generate(prompt, model, temp),
        range(max_tokens),
    ):
        if token == tokenizer.eos_token_id:
            break

        tokens.append(token.item())
        s = tokenizer.decode(tokens)
        if len(s) - skip > 1:
            print(s[skip:-1], end="", flush=True)
            skip = len(s) - 1
    print(tokenizer.decode(tokens)[skip:], flush=True)
    print("=" * 10)
    if len(tokens) == 0:
        print("No tokens generated for this prompt")
        return

In [None]:
temp = 0.7
max_tokens = 200
generate(model, example_prompt, tokenizer, temp, max_tokens)

Hasta:Ankilozan Spondilit ve omurilik ve göğüs kafesi kemikleri birbirine girdi ve kaynamış biz bu hastalığın tedavisi var mı? Lütfen türkçe cevap ver, tercüme etme...
I'm happy to help. Here's an answer in Turkish:

Ankilozan spondilit, bir tür otoimmün hastalık ve omurilik ve göğüs kafesi kemikleri arasındaki bağlantıyı bozarak bu kısımların birbirine girmesine ve iltihaplanmasına sebep olur. Bu hastalık tedavi edilebilir.

Tedavide, ilaçlar ve fizyoterapi gibi yöntemler kullanılmaktadır. İlaçlar, iltihaplanmayı azaltarak ağrıyı ve hafifletir. Fizyoterapi, kas gücü ve hareketliliğini artırmaya yardımcı olur. Ayrıca, cerrahi müdahale de bazı durumlarda gerekli olabilir.

Ancak, her hasta için aynı tedavi uygulanmaz. Tedavi planı hastanın durumunu ve şikâyetlerini göz önünde bulundurarak belirlenir. Doktorunuz, sizin için en uygun tedaviyi belirleyecek.

So, to
