In [None]:
!pip -q uninstall -y bitsandbytes


[0m

In [None]:
from datasets import load_dataset

# Read the generated JSONL corpus and carve out a seeded 95/5 train/eval split.
data_file = "/content/autotrain_text.jsonl"
ds = (
    load_dataset("json", data_files=data_file)["train"]
    .train_test_split(test_size=0.05, seed=42)
)
train_ds, eval_ds = ds["train"], ds["test"]

# Sanity check: split sizes plus the first 300 characters of one training sample.
print("Train:", len(train_ds), "Eval:", len(eval_ds))
print(train_ds[0]["text"][:300])


Generating train split: 0 examples [00:00, ? examples/s]

Train: 760 Eval: 40
### Instruction:
Explain LoRA (Low-Rank Adaptation) in 3 clear paragraphs.

### Response:
One of the main strengths of LoRA (Low-Rank Adaptation) is its ability to enhance generalization. By applying this concept correctly, models become more robust and better suited for real-world deployment.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint that the LoRA adapter is trained on top of.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Batching needs a pad token; fall back to EOS when the checkpoint defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release (it logs
    # "`torch_dtype` is deprecated! Use `dtype` instead!"), so use the new name.
    dtype=torch.float16,
    device_map="auto",
)
# Turn off the generation KV cache on the model config for the training run.
model.config.use_cache = False
print("✅ Model loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model loaded


In [None]:
from transformers import DataCollatorForLanguageModeling

from transformers import DataCollatorForSeq2Seq

max_length = 512


def tokenize_fn(batch):
    """Tokenize a batch of ``text`` rows and terminate every sequence with EOS.

    The training texts carry no end-of-sequence marker, and
    ``DataCollatorForLanguageModeling`` replaces every label equal to
    ``pad_token_id`` with -100, which hides EOS from the loss whenever the pad
    token is the EOS token (the fallback this notebook configures). As a result
    the fine-tuned model was never trained to stop, and the sample generations
    run until ``max_new_tokens``. Here EOS is appended explicitly and ``labels``
    are built up front so the EOS position is supervised.

    Args:
        batch: Mapping with a ``"text"`` list, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists; each
        row is at most ``max_length`` tokens long, EOS included.
    """
    # Reserve one slot so the appended EOS never pushes a row past max_length.
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length - 1,
        padding=False,
    )

    eos_id = tokenizer.eos_token_id
    input_ids, attention_mask = [], []
    for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"]):
        ids, mask = list(ids), list(mask)
        # Guard against a tokenizer that is already configured to append EOS.
        if not ids or ids[-1] != eos_id:
            ids.append(eos_id)
            mask.append(1)
        input_ids.append(ids)
        attention_mask.append(mask)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        # Causal-LM labels are a copy of the inputs (the model shifts them itself).
        "labels": [ids.copy() for ids in input_ids],
    }


train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
eval_tok = eval_ds.map(tokenize_fn, batched=True, remove_columns=eval_ds.column_names)

# Pads input_ids/attention_mask through the tokenizer and pads the precomputed
# labels with -100, so the real EOS label survives even though pad == EOS.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)
print("✅ Tokenization done")


Map:   0%|          | 0/760 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

✅ Tokenization done


In [None]:
from peft import LoraConfig, get_peft_model

# Rank-16 LoRA adapters (alpha 32, dropout 0.05) on the q/k/v/o projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to the PEFT-wrapped model and report the trainable-parameter share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("✅ LoRA attached")


trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079
✅ LoRA attached


In [None]:
from transformers import Trainer, TrainingArguments

# Checkpoints produced during training are written under this directory.
out_dir = "/content/tinyllama_lora_out"

args = TrainingArguments(
    output_dir=out_dir,
    report_to="none",
    # Optimisation: per-device batch of 1 with 8 gradient-accumulation steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    # Logging, evaluation and checkpoint cadence (in optimizer steps).
    logging_steps=10,
    eval_strategy="steps",  # used instead of `evaluation_strategy`
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
)

trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
)

trainer.train()
print("✅ Training finished")


The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss,Validation Loss
50,0.297,0.236547
100,0.1447,0.146174
150,0.1347,0.135155
200,0.1256,0.129473
250,0.1281,0.127168


✅ Training finished


In [None]:
# Write the PEFT-wrapped model and the tokenizer into the same directory.
adapter_dir = "/content/tinyllama_lora_adapter"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print("✅ Saved adapter to:", adapter_dir)


✅ Saved adapter to: /content/tinyllama_lora_adapter


In [None]:
from peft import PeftModel

# Reload a fresh copy of the base checkpoint and attach the adapter saved in
# `adapter_dir`, rather than reusing the in-memory training model.
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release
    # ("`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.float16,
    device_map="auto",
)
ft = PeftModel.from_pretrained(base, adapter_dir)
ft.eval()

# Same "### Instruction / ### Response" layout as the training texts.
prompt = "### Instruction:\nWhat is synthetic data?\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(ft.device)

with torch.no_grad():
    out = ft.generate(**inputs, max_new_tokens=120, do_sample=True, temperature=0.7, top_p=0.9)

# The decoded text includes the prompt, since out[0] holds prompt + continuation.
print(tokenizer.decode(out[0], skip_special_tokens=True))


### Instruction:
What is synthetic data?

### Response:
Synthetic data is a fundamental concept in artificial intelligence that plays a critical role in improving the performance and reliability of modern models. It helps systems understand complex patterns and generate meaningful outputs across various tasks. Without synthetic data, many modern applications would struggle to achieve high accuracy and reliability.

From a theoretical perspective, synthetic data represents a key advancement in the field of artificial intelligence. Researchers continue to explore its potential to improve scalability, interpretability, and efficiency.

In practice, synthetic data helps machines understand complex patterns and generate meaningful outputs across


In [None]:
# Second sample from the same `inputs`: larger token budget, lower temperature,
# a repetition penalty, and EOS passed as both the stop and the pad token id.
sampling_kwargs = dict(
    max_new_tokens=250,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
with torch.no_grad():
    out = ft.generate(**inputs, **sampling_kwargs)
print(tokenizer.decode(out[0], skip_special_tokens=True))


### Instruction:
What is synthetic data?

### Response:
In practical terms, synthetic data allows AI systems to operate more efficiently and accurately. It is widely applied in areas such as natural language processing, recommendation systems, and intelligent assistants. Without synthetic data, many modern apps would be difficult or impossible to understand.

A real-world example of synthetic data can be seen in applications like ChatGPT, autonomous systems, and advanced data analysis tools, where intelligent behavior depends heavily on this concept. To become a successful engineer, students and professionals should embrace this fundamental theory of information processing.

From a academic perspective, synthetic data represents a key advancement in the field of artificial intelligence. Researchers continue to explore its potential to improve scalability, interpretability, and efficiency. Future developments may also focus on generating genuine intelligent behaviors across large langua

In [None]:
import json
import random

# Destination of the generated corpus. This is the same path the dataset-loading
# cell reads through `data_file`, so this cell has to be run before that one.
output_path = "/content/autotrain_text.jsonl"

# Subjects substituted for the {topic} placeholder in the templates below.
topics = [
    "Generative Artificial Intelligence",
    "Large Language Models",
    "Transformers",
    "Self-Attention Mechanism",
    "Fine-tuning",
    "LoRA (Low-Rank Adaptation)",
    "Parameter Efficient Fine-Tuning",
    "Prompt Engineering",
    "Instruction Tuning",
    "Hallucinations in AI models",
    "Synthetic Data",
    "Machine Learning",
    "Deep Learning",
    "Neural Networks",
    "Tokenization",
    "Context Window",
    "Embeddings",
    "Vector Databases",
    "Retrieval-Augmented Generation (RAG)",
    "Overfitting",
    "Generalization",
    "Bias in AI systems",
    "Evaluation Metrics in NLP",
]

# Instruction phrasings; one is drawn at random per example and formatted with a topic.
advanced_instruction_templates = [
    "Explain {topic} in simple terms.",
    "Explain {topic} as if teaching a university student.",
    "Explain {topic} using 5 bullet points.",
    "Compare {topic} with traditional approaches.",
    "Why is {topic} important in modern AI systems?",
    "What are the advantages and disadvantages of {topic}?",
    "Explain {topic} with a real-world example.",
    "Describe the role of {topic} in large language models.",
    "Explain {topic} step by step.",
    "Explain {topic} and mention common mistakes beginners make.",
    "How does {topic} improve model performance?",
    "What would happen if {topic} is ignored in AI systems?",
    "Explain {topic} in 3 clear paragraphs.",
    "Summarize {topic} in exactly 3 sentences.",
    "Explain {topic} in an academic style.",
    "Explain {topic} for exam preparation purposes.",
    "Discuss the relationship between {topic} and model generalization.",
    "Analyze the impact of {topic} on AI reliability.",
    "Give a detailed explanation of {topic} with examples.",
    "Explain {topic} and compare it with a related concept.",
]

# Canned answers, likewise formatted with the topic.
# NOTE(review): the response is drawn independently of the instruction, so it does
# not follow format requests such as "5 bullet points" or "exactly 3 sentences".
response_templates = [
    "{topic} is a fundamental concept in artificial intelligence that plays a critical role in improving the performance and reliability of modern models. It helps systems understand complex patterns and generate meaningful outputs across various tasks.",

    "In practical terms, {topic} allows AI systems to operate more efficiently and accurately. It is widely applied in areas such as natural language processing, recommendation systems, and intelligent assistants.",

    "One of the main strengths of {topic} is its ability to enhance generalization. By applying this concept correctly, models become more robust and better suited for real-world deployment.",

    "From an academic perspective, {topic} represents a key advancement in the field of artificial intelligence. Researchers continue to explore its potential to improve scalability, interpretability, and efficiency.",

    "{topic} contributes significantly to model performance by optimizing how information is processed and learned. Without it, many modern AI systems would struggle to achieve high accuracy and reliability.",

    "A real-world example of {topic} can be seen in applications like ChatGPT, autonomous systems, and advanced data analysis tools, where intelligent behavior depends heavily on this concept.",

    "Understanding {topic} is essential for students and engineers working in AI, as it connects theoretical foundations with practical implementations in modern systems.",
]

def generate_advanced_dataset(
    n=800,
    seed=None,
    topic_pool=None,
    instruction_pool=None,
    response_pool=None,
):
    """Build ``n`` synthetic instruction/response training records.

    Each record pairs one randomly chosen instruction template with one
    randomly chosen response template, both formatted with the same randomly
    chosen topic, and wraps them in the ``### Instruction`` / ``### Response``
    prompt layout.

    Args:
        n: Number of records to generate.
        seed: When given, draws come from a private ``random.Random(seed)`` so
            the result is reproducible and the global RNG state is untouched.
            When ``None`` (default) the module-level ``random`` functions are
            used, exactly as before.
        topic_pool: Topics to sample from; defaults to the module-level ``topics``.
        instruction_pool: Instruction templates containing ``{topic}``; defaults
            to ``advanced_instruction_templates``.
        response_pool: Response templates containing ``{topic}``; defaults to
            ``response_templates``.

    Returns:
        list[dict]: ``n`` dicts of the form ``{"text": <prompt + response>}``.

    Raises:
        IndexError: If any pool is empty (raised by ``choice``).
    """
    rng = random if seed is None else random.Random(seed)
    # Resolve the defaults at call time so later edits to the module-level lists are honoured.
    topic_pool = topics if topic_pool is None else topic_pool
    instruction_pool = (
        advanced_instruction_templates if instruction_pool is None else instruction_pool
    )
    response_pool = response_templates if response_pool is None else response_pool

    dataset = []
    for _ in range(n):
        # Draw order (topic, instruction, response) is kept so a given RNG state
        # yields the same records as the previous implementation.
        topic = rng.choice(topic_pool)
        instruction = rng.choice(instruction_pool).format(topic=topic)
        response = rng.choice(response_pool).format(topic=topic)

        text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
        dataset.append({"text": text})
    return dataset

# Seed the module-level RNG that generate_advanced_dataset draws from, so the
# corpus (and therefore the fine-tuning run) can be regenerated exactly.
# 42 mirrors the seed used for the train/eval split.
random.seed(42)
data = generate_advanced_dataset(800)

# One JSON object per line (JSONL); ensure_ascii=False writes non-ASCII text unescaped.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in data)

print(f"✅ Generated {len(data)} high-quality examples")
print(f"📁 Saved to: {output_path}")


✅ Generated 800 high-quality examples
📁 Saved to: /content/autotrain_text.jsonl
