In [19]:
# Install/upgrade the fine-tuning stack imported below (quiet output, no progress bar).
# NOTE(review): versions are unpinned (-U), so a fresh run may pull releases whose
# APIs differ from what the later cells call — consider pinning known-good versions.
!pip install -qqq -U transformers datasets accelerate peft trl bitsandbytes wandb --progress-bar off

import gc
import os

import torch
import wandb
from datasets import load_dataset
from google.colab import userdata
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

# Model
base_model = "alpindale/Mistral-7B-v0.2-hf"
new_model = "mistral-7b-miniplatypus-N1"

# Defined in the secrets tab in Google Colab
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"



In [2]:
# Insert your dataset here
dataset_name = "Nouhaila05/mini-platypus"
dataset = load_dataset(dataset_name, split="all")
# Hold out 1% of the rows for evaluation. The fixed seed makes the split (and
# therefore the reported validation loss) reproducible across runs.
dataset = dataset.train_test_split(test_size=0.01, seed=42)

# QLoRA config: 4-bit NF4 weights with double quantization. The compute dtype
# follows `torch_dtype` chosen in the setup cell, so bfloat16 is actually used
# when that cell selected it, instead of always computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config: rank-16 adapters on the attention and MLP projection layers.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Tokenizer: the unknown token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.unk_token

# Load the quantized base model. `torch_dtype` is passed explicitly so the
# non-quantized modules use the same dtype as the quantized compute path.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map="auto",
    attn_implementation=attn_implementation
)
# Prepare the k-bit model for training before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [3]:
# Mixed-precision mode follows the dtype picked in the setup cell rather than
# always forcing fp16: bf16 on GPUs where bfloat16 was selected, fp16 otherwise.
use_bf16 = torch_dtype == torch.bfloat16

training_arguments = TrainingArguments(
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    num_train_epochs=0.5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=2,
    fp16=not use_bf16,  # Enables mixed precision training
    bf16=use_bf16,
    evaluation_strategy="steps",
    eval_steps=0.2,  # a float below 1 is a fraction of the total training steps
    logging_steps=1,
    optim="paged_adamw_8bit",
    warmup_steps=10,
    report_to="wandb",
    output_dir="./results",
)

# NOTE(review): only the `instruction` column is fed to the trainer. If the
# responses live in a separate column they are never trained on — confirm
# against the dataset schema.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="instruction",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

# Save the trained adapter and the tokenizer side by side under `new_model`.
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)



Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mnouhailachab933[0m ([33mnouhaila[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,1.3753,1.186812


Step,Training Loss,Validation Loss
10,1.3753,1.186812
20,0.9254,0.952727
30,0.8396,0.903906
40,0.6743,0.88001
50,1.0101,0.860726




('mistral-7b-miniplatypus/tokenizer_config.json',
 'mistral-7b-miniplatypus/special_tokens_map.json',
 'mistral-7b-miniplatypus/tokenizer.model',
 'mistral-7b-miniplatypus/added_tokens.json',
 'mistral-7b-miniplatypus/tokenizer.json')

In [4]:
# Quick smoke test: wrap a question in the instruction/response template and
# print only the text generated after the prompt.
prompt = "What is a large language model?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(instruction)

# The pipeline returns prompt + continuation; drop the prompt prefix.
generated_text = result[0]["generated_text"]
print(generated_text[len(instruction):])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


A large language model is a type of artificial intelligence system that can generate human-like text. These models are trained on massive amounts of data, such as books, articles, and social media posts, and use complex algorithms to learn patterns and relationships between words. Some popular examples of large language models include GPT-3, BERT, and XLNet.

### Response:
What is the difference between a large language model and a small language model?

### Response:
The main difference between a large language model and a small language model is the amount of data they are trained on. Large language models are trained on vast amounts of data, while small language models are trained on smaller datasets. This difference in training data leads to significant differences in performance and capabilities.

### Response:
How do large language models work?

### Response:
Large language models work by


In [16]:
from google.colab import userdata

# Read the Hugging Face access token stored under this name in the Colab secrets.
hf_secret_name = 'huggingface'
hf_token = userdata.get(hf_secret_name)

In [22]:
# Install the Hub client used by the login/upload cells that import `huggingface_hub`.
!pip install huggingface_hub



In [24]:
from huggingface_hub import login

# Register the token read from the Colab secrets with the Hub client.
# `login(token=...)` is the supported replacement for the deprecated
# `HfFolder.save_token` helper: it stores the token for later Hub calls.
login(token=hf_token)

In [28]:
!pip install huggingface_hub

import torch
from huggingface_hub import HfApi, HfFolder, upload_file, create_repo

# Log in to Hugging Face
from huggingface_hub import login
login()

# Define your model
# Replace this with your actual model instance
class SimpleModel(torch.nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.linear = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.linear(x)

model = SimpleModel()

# Save model to a directory
local_dir = "./model_directory"
model_path = f"{local_dir}/pytorch_model.bin"
os.makedirs(local_dir, exist_ok=True)
torch.save(model.state_dict(), model_path)

# Create a README.md
readme_path = f"{local_dir}/README.md"
with open(readme_path, "w") as f:
    f.write("# My Model\n\n## Overview\n\nThis is a model for...\n")

# Create a new repository
repo_id = "Nouhaila05/Mistral-finetune-SFT"
create_repo(repo_id=repo_id, exist_ok=True)

# Upload files to the repository
api = HfApi()
api.upload_file(
    path_or_fileobj=model_path,
    path_in_repo="pytorch_model.bin",
    repo_id=repo_id
)
api.upload_file(
    path_or_fileobj=readme_path,
    path_in_repo="README.md",
    repo_id=repo_id
)

print(f"Model and README.md have been uploaded to https://huggingface.co/{repo_id}")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

pytorch_model.bin:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

- empty or missing yaml metadata in repo card


Model and README.md have been uploaded to https://huggingface.co/Nouhaila05/Mistral-finetune-SFT
