# Load model from HF

In [1]:
import os
from pathlib import Path

from huggingface_hub import login
# Read the Hugging Face access token from the HF_TOKEN environment variable
# (a KeyError is raised if it is unset) so the secret is never written into the notebook.
access_token = os.environ['HF_TOKEN']
# Log in to the Hugging Face Hub with that token.
login(access_token)

In [2]:
from mlx_lm import generate, load
# Load the "Qwen/Qwen2.5-1.5B" checkpoint; `load` returns the model together with its tokenizer.
# NOTE(review): a later cell defines a local `load()` function that shadows this import
# until `load` is imported from mlx_lm again in the fine-tuning section.
model, tokenizer = load("Qwen/Qwen2.5-1.5B")

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
# Wrap a single user question in the tokenizer's chat template, then let the
# model generate up to 100 tokens (verbose=True prints the text and timing stats).
messages = [
    {
        "role": "user",
        "content": "What is under-fitting and overfitting in machine learning?",
    }
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
response = generate(
    model,
    tokenizer,
    prompt=prompt,
    max_tokens=100,
    verbose=True,
)

 is the difference between under-fitting and overfitting in machine learning? under-fitting and overfitting are two common problems in machine learning. under-fitting occurs when a model is too simple to capture the underlying patterns in the data, resulting in poor performance on both training and test data. overfitting, on the other hand, occurs when a model is too complex and captures the noise in the training data, resulting in poor performance on the test data. under-fitting and over
Prompt: 32 tokens, 342.624 tokens-per-sec
Generation: 100 tokens, 95.709 tokens-per-sec
Peak memory: 3.110 GB


# Loading the dataset

In [44]:
import json

class Dataset:
    """
    Light-weight wrapper to hold lines from a jsonl file.

    Every non-blank line of the file is parsed with ``json.loads``; indexing
    returns the value stored under ``key`` for that record. When ``path`` does
    not exist the dataset behaves as empty: ``len()`` is 0 and indexing raises
    ``IndexError``.

    Args:
        path: Location of the jsonl file.
        key: Field returned from each record (defaults to ``"text"``).
    """

    def __init__(self, path: Path, key: str = "text"):
        if not path.exists():
            # None marks "file missing"; __len__ and __getitem__ handle it explicitly.
            self._data = None
        else:
            with open(path, "r", encoding="utf-8") as fid:
                # Skip blank lines (e.g. a trailing newline) instead of letting
                # json.loads fail on an empty string.
                self._data = [json.loads(line) for line in fid if line.strip()]
        self._key = key

    def __getitem__(self, idx: int):
        if self._data is None:
            # IndexError (not TypeError) keeps plain iteration over a missing dataset well-behaved.
            raise IndexError("Dataset is empty: source file was not found")
        return self._data[idx][self._key]

    def __len__(self):
        # A missing file counts as an empty dataset so callers can test `len(ds) == 0`.
        if self._data is None:
            return 0
        return len(self._data)

def load(data_dir="../data"):
    """
    Build the training and validation datasets from jsonl files.

    NOTE: this function shares its name with ``load`` imported from ``mlx_lm``
    earlier in the notebook and therefore shadows it once this cell has run.

    Args:
        data_dir: Directory containing ``train.jsonl`` and ``valid.jsonl``.
            Defaults to ``"../data"``.

    Returns:
        A ``(train, valid)`` tuple of ``Dataset`` objects.

    Raises:
        ValueError: If either file is missing or contains no records.
    """
    data_dir = Path(data_dir)

    def load_and_check(file_name, label):
        path = data_dir / file_name
        # Check for the file up front so a missing file surfaces as the intended
        # ValueError rather than as a failure inside len() on an unloaded dataset.
        if not path.is_file():
            raise ValueError(f"{label} set not found or empty")
        try:
            dataset = Dataset(path)
        except Exception as e:
            print(f"Unable to build dataset {path} ({e})")
            raise
        if len(dataset) == 0:
            raise ValueError(f"{label} set not found or empty")
        return dataset

    train = load_and_check("train.jsonl", "Training")
    # valid.jsonl is the validation split, so it is reported as such.
    valid = load_and_check("valid.jsonl", "Validation")

    return train, valid


In [45]:
# Build the train/dev datasets with the notebook's local `load()` helper (reads the jsonl files).
# NOTE(review): `train_set` and `dev_set` are assigned again from the Hugging Face dataset in a later cell.
train_set, dev_set = load()

In [4]:
from datasets import load_dataset
# Load the "win-wang/Machine_Learning_QA_Collection" dataset; the recorded output
# shows train, validation and test splits being generated.
ds = load_dataset("win-wang/Machine_Learning_QA_Collection")

README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/8.13M [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8652 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1235 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2471 [00:00<?, ? examples/s]

In [23]:
import pandas as pd

# Materialise each split of `ds` as its own DataFrame, then preview the training frame.
train_set, dev_set, test_set = (
    pd.DataFrame(ds[split]) for split in ("train", "validation", "test")
)
train_set.head()

Unnamed: 0,text
0,<start_of_turn>user\n I am solving a system...
1,<start_of_turn>user\n I get a gtk-WARNING w...
2,<start_of_turn>user\n I don't really like w...
3,<start_of_turn>user\n I am having trouble f...
4,<start_of_turn>user\n How do Recurrent Neur...


In [25]:
# MLX requires the data to be placed in a container that allows random access. Here, we convert it into a Python built-in list
def preprocess(dataset):
    """
    Return the "text" column of ``dataset`` as a plain Python list.

    Args:
        dataset: A DataFrame-like object with a "text" column, or a list that
            has already been converted.

    Returns:
        A list with one entry per row. A list input is returned unchanged so
        the cell that rebinds the split names to this function's output can be
        re-run without failing.
    """
    if isinstance(dataset, list):
        # Already converted by an earlier run of the cell.
        return dataset
    return dataset["text"].tolist()
    
# Replace each split with the result of `preprocess` (the list of its "text" entries).
# NOTE: this rebinds the same three names, so the DataFrames are no longer reachable through them afterwards.
train_set, dev_set, test_set = map(preprocess, (train_set, dev_set, test_set))

# Model fine-tuning

In [10]:
import matplotlib.pyplot as plt
import mlx.optimizers as optim
from mlx.utils import tree_flatten
from mlx_lm import load, generate
from mlx_lm.tuner.trainer import iterate_batches
from mlx_lm.tuner import train, TrainingArgs
from mlx_lm.tuner import linear_to_lora_layers
from pathlib import Path
import json

In [11]:
# Directory used below for adapter_config.json and the adapters.safetensors file path.
adapter_path = Path("adapters")
# Create it (including missing parents); no error if it already exists.
adapter_path.mkdir(parents=True, exist_ok=True)

In [12]:
# LoRA settings: the layer count and the per-layer parameters (rank, scale, dropout).
lora_config = dict(
    lora_layers=8,
    lora_parameters=dict(
        rank=8,
        scale=20.0,
        dropout=0.0,
    ),
)

In [13]:
# Write the LoRA settings to adapter_config.json inside the adapter directory (4-space indented JSON).
with open(adapter_path / "adapter_config.json", "w") as fid:
    json.dump(lora_config, fid, indent=4)

In [14]:
# Training settings: adapter file path inside the adapter directory, 200 iterations,
# and steps_per_eval=50 (presumably the validation interval in steps — confirm against TrainingArgs).
# Fields not set here keep whatever defaults TrainingArgs defines.
training_args = TrainingArgs(
    adapter_file=adapter_path / "adapters.safetensors",
    iters=200,
    steps_per_eval=50
)

In [15]:
# Freeze the model's parameters before the LoRA layers are added; the displayed value is the model repr.
model.freeze()

Model(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=1536, output_dims=1536, bias=True)
        (k_proj): Linear(input_dims=1536, output_dims=256, bias=True)
        (v_proj): Linear(input_dims=1536, output_dims=256, bias=True)
        (o_proj): Linear(input_dims=1536, output_dims=1536, bias=False)
        (rope): RoPE(128, traditional=False)
      )
      (mlp): MLP(
        (gate_proj): Linear(input_dims=1536, output_dims=8960, bias=False)
        (down_proj): Linear(input_dims=8960, output_dims=1536, bias=False)
        (up_proj): Linear(input_dims=1536, output_dims=8960, bias=False)
      )
      (input_layernorm): RMSNorm(1536, eps=1e-06)
      (post_attention_layernorm): RMSNorm(1536, eps=1e-06)
    )
    (layers.1): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=1536, output_dims=1536, bias=True)
        (k_proj): Linear(in

In [16]:
# Convert linear layers of the model to LoRA layers using the configured layer count and LoRA parameters.
# The model repr printed further down shows q_proj and v_proj wrapped as LoRALinear.
linear_to_lora_layers(model, lora_config["lora_layers"], lora_config["lora_parameters"])

In [17]:
# Add up the element counts of every trainable parameter array in the model.
num_train_params = sum(
    leaf.size for _name, leaf in tree_flatten(model.trainable_parameters())
)
print(f"Number of trainable parameters: {num_train_params}")

Number of trainable parameters: 1089536


In [18]:
# Put the model into training mode; the displayed value is the model repr with the LoRA layers in place.
model.train()

Model(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): LoRALinear(
          (linear): Linear(input_dims=1536, output_dims=1536, bias=True)
          (dropout): Dropout(p=0.0)
        )
        (k_proj): Linear(input_dims=1536, output_dims=256, bias=True)
        (v_proj): LoRALinear(
          (linear): Linear(input_dims=1536, output_dims=256, bias=True)
          (dropout): Dropout(p=0.0)
        )
        (o_proj): Linear(input_dims=1536, output_dims=1536, bias=False)
        (rope): RoPE(128, traditional=False)
      )
      (mlp): MLP(
        (gate_proj): Linear(input_dims=1536, output_dims=8960, bias=False)
        (down_proj): Linear(input_dims=8960, output_dims=1536, bias=False)
        (up_proj): Linear(input_dims=1536, output_dims=8960, bias=False)
      )
      (input_layernorm): RMSNorm(1536, eps=1e-06)
      (post_attention_layernorm): RMSNorm(1536, eps=1e-06)
    )
    (l

In [19]:
# Adam optimizer with a learning rate of 1e-5, passed to train() below.
opt = optim.Adam(learning_rate=1e-5)

In [20]:
class Metrics:
    """
    Training callback that records reported losses.

    Attributes:
        train_losses: List of ``(iteration, train_loss)`` tuples.
        val_losses: List of ``(iteration, val_loss)`` tuples.
    """

    def __init__(self):
        # Per-instance lists: class-level lists would be shared by every
        # Metrics object, mixing the losses of separate runs.
        self.train_losses = []
        self.val_losses = []

    def on_train_loss_report(self, info):
        """Record the training loss from ``info`` ("iteration" and "train_loss" keys)."""
        self.train_losses.append((info["iteration"], info["train_loss"]))

    def on_val_loss_report(self, info):
        """Record the validation loss from ``info`` ("iteration" and "val_loss" keys)."""
        self.val_losses.append((info["iteration"], info["val_loss"]))
        
# Callback instance handed to train() to collect the reported losses.
metrics = Metrics()

In [33]:
# Run the fine-tuning loop with the prepared datasets, the optimizer and the metrics callback.
# NOTE(review): the recorded output of this cell is a ValueError ("invalid literal for int()")
# whose message quotes a raw text sample. Presumably the installed mlx_lm version expects
# already-tokenized samples rather than strings — confirm against that version's tuner docs.
train(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    optimizer = opt,
    train_dataset = train_set,
    val_dataset = dev_set,
    training_callback = metrics
)

Starting training..., iters: 200


ValueError: invalid literal for int() with base 10: "<s>[INST] Can I use the block editor to create a custom search engine or indexing tool for my site, and what are the benefits?[/INST] Yes, you can leverage the block editor (also known as Gutenberg) 

In [54]:
# Debugging probe: pull the first batch (batch_size=1, max_seq_length=2000) straight from
# iterate_batches to inspect what the trainer would receive.
# NOTE(review): the recorded output is the same ValueError as the train() call, quoting a raw
# text sample — presumably iterate_batches needs tokenized samples here; confirm before re-running.
sample_batch = next(iter(iterate_batches(
    dataset=train_set,
    tokenizer=tokenizer,
    batch_size=1,
    max_seq_length=2000,
    train=True,
)))
print(sample_batch)

ValueError: invalid literal for int() with base 10: '<s>[INST] Where can I find a good WordPress hosting provider?[/INST] There are several reputable WordPress hosting providers you can consider:\n\n1. Bluehost: An official WordPress recommended hostin