# Training Lora From Corpus Version 2
This adds stop tokens to the training

# Load model with Unsloth patching

In [1]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/deepseek-llm-7b-base",
    max_seq_length=1024,
    load_in_4bit=True,
)

print("Loaded model in 4-bit ✅")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-21 14:58:39 [__init__.py:244] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.53.2. vLLM: 0.9.2.
   \\   /|    NVIDIA RTX A4000. Num GPUs = 1. Max memory: 15.724 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

deepseek-ai/deepseek-llm-7b-base does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.
Loaded model in 4-bit ✅


# Apply LoRa adapter

In [2]:
peft_model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"]    bias="none",
    use_gradient_checkpointing=True,
)
print("Loaded peft model ✅")


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.11 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Loaded peft model ✅


# Load the dataset from corpus

In [4]:
import os 

CORPUS_DIR = "/datasets/wtk_archive_with_stops"
BLOCK_SIZE = 1024  # max tokens per chunk

tok = tokenizer 

# Ensure EOS/PAD exist and are consistent
added = False
if tok.eos_token is None:
    tok.add_special_tokens({"eos_token": "</s>"})
    added = True
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
    added = True
if added:
    model.resize_token_embeddings(len(tok))

# -----------------------
# 2) Load raw text files (no EOS strings here)
# -----------------------
def load_txt_corpus(directory):
    texts = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            txt = f.read().strip()
            if txt:
                texts.append(txt)
    return texts

raw_texts = load_txt_corpus(CORPUS_DIR)
print(f"Loaded {len(raw_texts)} files from {CORPUS_DIR} ✅")

Loaded 1 files from /datasets/wtk_archive_with_stops ✅


# Tokenize with EOS appended (token id, not string)
We’ll build one long stream of ids and then pack into BLOCK_SIZE chunks.

In [5]:
from datasets import Dataset

def tokenize_append_eos(texts):
    # batch tokenize; append eos token string so tokenizer emits eos_token_id
    # Alternatively: add eos id manually after each example (equivalent).
    enc = tok(texts, add_special_tokens=False)
    input_ids = []
    for ids in enc["input_ids"]:
        input_ids.extend(ids)
        if tok.eos_token_id is not None:
            input_ids.append(tok.eos_token_id)
    return input_ids

flat_ids = tokenize_append_eos(raw_texts)

# -----------------------
# 4) Pack into fixed-length blocks (no cross-doc bleed because we injected EOS)
# -----------------------
def pack_ids_to_blocks(ids, block_size):
    blocks = []
    for i in range(0, len(ids) - block_size + 1, block_size):
        chunk = ids[i : i + block_size]
        blocks.append({"input_ids": chunk, "attention_mask": [1] * len(chunk)})
    return Dataset.from_list(blocks)

train_dataset = pack_ids_to_blocks(flat_ids, BLOCK_SIZE)
print(f"Prepared {len(train_dataset)} packed training chunks of {BLOCK_SIZE} tokens ✅")


Prepared 4883 packed training chunks of 1024 tokens ✅


# Train using SFTTrainer

In [None]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = peft_model,
    tokenizer = tok,
    train_dataset = train_dataset,
    max_seq_length=BLOCK_SIZE,
    args = {
        "output_dir": "lora-txt-training3",
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": 1,
        "logging_steps": 10,
        "save_strategy": "steps",
        "save_steps": 250,
        "save_total_limit": 3,
        "fp16": True,
        "remove_unused_columns": False,   # IMPORTANT: keep pretokenized columns
        "report_to": "none",
        "logging_steps": 10,
        "resume_from_checkpoint": True,   # auto-resume if checkpoints found
    },
)
trainer.train()
print("Training complete ✅")

trainer.model.save_pretrained("lora-txt-training2/")
tok.save_pretrained("lora-txt-training2/")

print("Training results saved ✅")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,883 | Num Epochs = 3 | Total steps = 1,833
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 3,932,160 of 6,914,297,856 (0.06% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.2796
2,2.2872
3,2.2947
4,2.3583
5,2.2028
6,2.2783
7,2.2287
8,2.2683
9,2.2207
10,2.2758


# Save training results
This may not be needed if the save training results is present in the training snippet.

In [6]:
trainer.model.save_pretrained("lora-txt-training/")
tok.save_pretrained("lora-txt-training/")

print("Training results saved ✅")

Training results saved ✅


# Push LoRa to Huggingface

In [2]:
from huggingface_hub import HfApi, upload_folder

repo_id = "peers-ai/deepseek-7b-my-lora1"
folder = "lora-txt-training2"  # contains adapter_config.json & adapter_model.bin

api = HfApi()
# create the repo if it doesn't exist
api.create_repo(repo_id, repo_type="model", private=True, exist_ok=True)

# upload all files in the folder
upload_folder(
    repo_id=repo_id,
    folder_path=folder,
    repo_type="model",
)
print(f"✅ Uploaded to https://huggingface.co/{repo_id}")


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...training2/adapter_model.safetensors:  90%|######### | 14.2MB / 15.7MB            

✅ Uploaded to https://huggingface.co/peers-ai/deepseek-7b-my-lora1
