<a href="https://colab.research.google.com/github/nguforche/LLaMPS/blob/main/unsloth_finetune_mistral_public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning Mistral with Unsloth

In [None]:
%%capture
import torch
# Pick the Unsloth extra that matches the GPU's CUDA compute capability.
# NOTE(review): both installs track the head of the Unsloth git repository and
# leave wandb unpinned, so re-running this cell later may pull different code.
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install wandb "unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install wandb "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

In [None]:
import wandb, os
# Authenticate with Weights & Biases (prompts for an API key when none is stored).
wandb.login()

# Project name exported through the WANDB_PROJECT environment variable;
# leave the string empty to skip setting it.
wandb_project = "your-wandb-project"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from unsloth import FastLanguageModel
import torch

# max_seq_length is reused later as the SFTTrainer sequence length.
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

# Load the base model and its tokenizer. `model` and `tokenizer` are used as
# globals by the data-preparation and training cells below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "mistralai/Mistral-7B-Instruct-v0.2",
    max_seq_length = max_seq_length,
    dtype = dtype,
    # 4-bit loading is switched off here.
    load_in_4bit = False,
    # NOTE(review): trust_remote_code lets code shipped with the model repo run
    # locally — confirm it is actually required for this checkpoint.
    trust_remote_code=True
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

## 3. Load Dataset

### Scraping PR data with file diffs



In [None]:
import csv
import os
import time

import requests

# Read the token from the environment so a real credential never has to be
# saved inside the notebook. The original "token" placeholder stays as the
# fallback, so the cell behaves as before when GITHUB_TOKEN is not set.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "token")
REPO_OWNER = 'octocat'
REPO_NAME = 'Hello-World'
API_URL = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}'
HEADERS = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}
# File the scraped PRs are written to (relative to the working directory).
TEXT_FILE = 'pr_data_100.txt'

TOTAL_PRS = 5  # Total number of PRs to fetch
PER_PAGE = 100  # Items per page; 100 is the maximum the GitHub REST API accepts

def get_closed_pull_requests(api_url, headers, page, per_page):
    """Fetch one page of closed pull requests, most recently updated first.

    :param api_url: Base repository API URL (".../repos/<owner>/<repo>").
    :param headers: HTTP headers sent with the request.
    :param page: 1-based page number to fetch.
    :param per_page: Number of pull requests requested per page.
    :return: The decoded JSON response on HTTP 200, otherwise an empty list.
    :raises requests.exceptions.RequestException: on network failure, including
        the timeout below.
    """
    prs_url = f'{api_url}/pulls?state=closed&sort=updated&direction=desc&per_page={per_page}&page={page}'
    print(f"Fetching PRs from: {prs_url}")
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(prs_url, headers=headers, timeout=30)
    if response.status_code == 200:
        print(f"Successfully fetched PRs for page {page}")
        return response.json()
    else:
        print(f'Failed to fetch PRs for page {page}: {response.status_code}')
        return []

def get_pr_comments(api_url, pr_number, headers):
    """Fetch every issue comment of a pull request, following pagination.

    The endpoint returns only the first page when no paging parameters are
    sent, which silently truncated long discussions. Pages of 100 are now
    requested until a short page marks the end.

    :param api_url: Base repository API URL (".../repos/<owner>/<repo>").
    :param pr_number: Number of the pull request.
    :param headers: HTTP headers sent with each request.
    :return: List of comment objects, or an empty list if any page request
        does not return HTTP 200.
    :raises requests.exceptions.RequestException: on network failure, including
        the timeout below.
    """
    comments_url = f'{api_url}/issues/{pr_number}/comments'
    print(f"Fetching comments for PR #{pr_number}")
    page_size = 100
    comments = []
    page = 1
    while True:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(
            comments_url,
            headers=headers,
            params={'per_page': page_size, 'page': page},
            timeout=30,
        )
        if response.status_code != 200:
            print(f'Failed to fetch comments for PR {pr_number}: {response.status_code}')
            return []
        batch = response.json()
        comments.extend(batch)
        if len(batch) < page_size:
            break
        page += 1
    print(f"Successfully fetched comments for PR #{pr_number}")
    return comments

def get_pr_files(api_url, pr_number, headers):
    """Fetch every changed-file record of a pull request, following pagination.

    The endpoint returns only the first page when no paging parameters are
    sent, so larger pull requests lost part of their diff. Pages of 100 are
    now requested until a short page marks the end.

    :param api_url: Base repository API URL (".../repos/<owner>/<repo>").
    :param pr_number: Number of the pull request.
    :param headers: HTTP headers sent with each request.
    :return: List of file objects, or an empty list if any page request does
        not return HTTP 200.
    :raises requests.exceptions.RequestException: on network failure, including
        the timeout below.
    """
    files_url = f'{api_url}/pulls/{pr_number}/files'
    print(f"Fetching files for PR #{pr_number}")
    page_size = 100
    files = []
    page = 1
    while True:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(
            files_url,
            headers=headers,
            params={'per_page': page_size, 'page': page},
            timeout=30,
        )
        if response.status_code != 200:
            print(f'Failed to fetch files for PR {pr_number}: {response.status_code}')
            return []
        batch = response.json()
        files.extend(batch)
        if len(batch) < page_size:
            break
        page += 1
    print(f"Successfully fetched files for PR #{pr_number}")
    return files

def get():
    """Scrape up to TOTAL_PRS closed pull requests and write them to TEXT_FILE.

    For each pull request the title, URL, description, timestamps, comments and
    per-file changes (skipping ``yarn.lock``) are written as one text record
    that starts with ``PR #<number>`` and ends with two blank lines. Paging
    stops when TOTAL_PRS records are written, a page comes back empty, or a
    page holds fewer than PER_PAGE pull requests.

    Reads the module-level API_URL, HEADERS, TEXT_FILE, TOTAL_PRS and PER_PAGE.
    """
    # Explicit UTF-8: PR titles, descriptions and diffs routinely contain
    # non-ASCII characters, and the platform default encoding may not cope.
    with open(TEXT_FILE, 'w', encoding='utf-8') as txt_file:
        pr_count = 0
        page = 1
        while pr_count < TOTAL_PRS:
            print(f"Processing page {page}")
            pull_requests = get_closed_pull_requests(API_URL, HEADERS, page, PER_PAGE)

            if not pull_requests:  # If no PRs are returned
                print(f"No more PRs found at page {page}. Stopping.")
                break

            print(f"Fetched {len(pull_requests)} PRs from page {page}.")
            for pr in pull_requests:
                if pr_count >= TOTAL_PRS:
                    print(f"Reached total PRs count: {TOTAL_PRS}")
                    break
                pr_number = pr['number']
                print(f"Processing PR #{pr_number}")

                pr_data = f'PR #{pr_number}: {pr["title"]}\n'
                pr_data += f'URL: {pr["html_url"]}\n'
                pr_data += f'Description: {pr["body"]}\n'
                pr_data += f'Created at: {pr["created_at"]}\n'
                pr_data += f'Merged at: {pr["merged_at"]}\n'

                comments = get_pr_comments(API_URL, pr_number, HEADERS)
                for comment in comments:
                    pr_data += f'Comment: {comment["body"]}\n'

                files = get_pr_files(API_URL, pr_number, HEADERS)

                for file in files:
                    # Lock-file diffs are excluded from the scraped text.
                    if file["filename"] != "yarn.lock":
                        pr_data += f'File: {file["filename"]}, Status: {file["status"]}\n'

                        # Not every file entry carries a "patch" key.
                        if "patch" in file:
                            pr_data += f'Changes: {file["patch"]}\n'  # The diff of the file

                pr_data += '\n\n'
                txt_file.write(pr_data)

                pr_count += 1

            if len(pull_requests) < PER_PAGE:  # Break if the last page has been reached
                print("Reached the last page of PRs.")
                break

            page += 1
            print(f"Moving to page {page}")

# Run the scrape; the collected PR records are written to TEXT_FILE.
get()

### Raw text data

In [None]:
from datasets import Dataset

def create_datasets(chunks, eval_split=0.1):
    """Split the chunks into disjoint training and evaluation datasets.

    The trailing ``eval_split`` fraction of ``chunks`` becomes the evaluation
    set and everything before it the training set. Previously the training
    set was the full list, so every evaluation example was also trained on.

    :param chunks: List of text chunks, in the order they should be split.
    :param eval_split: Fraction of the chunks (taken from the end) held out
        for evaluation.
    :return: Tuple ``(train_dataset, eval_dataset)``, each with a single
        ``text`` column.
    """
    total_chunks = len(chunks)
    eval_start = int(total_chunks * (1 - eval_split))

    # Slice both sides at the same index so the two sets never overlap.
    train_chunks = chunks[:eval_start]
    eval_chunks = chunks[eval_start:]

    train_dataset = Dataset.from_dict({'text': train_chunks})
    eval_dataset = Dataset.from_dict({'text': eval_chunks})

    return train_dataset, eval_dataset

#### Wiki

In [None]:
def read_and_chunk_file(file_path, chunk_marker='§'):
    """Read a UTF-8 text file and return the pieces that follow each marker.

    :param file_path: Path of the text file to read.
    :param chunk_marker: String that introduces every section.
    :return: List of section strings; whatever precedes the first marker is
        discarded.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        pieces = source.read().split(chunk_marker)

    # Index 0 is the text before the first marker, not a section.
    return pieces[1:]

# Split the wiki export into marker-delimited sections.
# NOTE(review): the path assumes wiki.txt was uploaded to /content — confirm.
file_path = '/content/wiki.txt'
wiki_chunks = read_and_chunk_file(file_path)

#### Code and PRs

In [None]:
def chunk_text_by_file_and_pr(filepath):
    """Split a combined code/PR dump into one chunk per file or PR.

    A new chunk starts at every line that begins with ``- File:`` or that,
    ignoring surrounding whitespace, begins with ``- Title:`` or ``PR #``.
    The file is read as UTF-8, and chunks made up only of whitespace (for
    example blank lines before the first marker) are dropped rather than
    emitted as empty training examples.

    :param filepath: Path of the text file to split.
    :return: List of chunk strings in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    lines = content.split('\n')

    chunks = []
    current_chunk = []

    # Helper function to add non-empty chunks
    def add_chunk(c_chunk):
        joined = '\n'.join(c_chunk)
        # A list such as [''] is truthy, so test the joined text instead.
        if joined.strip():
            chunks.append(joined)

    for line in lines:
        # Check for file or PR beginning
        if line.startswith('- File:') or line.strip().startswith(('- Title:', 'PR #')):
            # Flush whatever was collected so far, then start a new chunk
            add_chunk(current_chunk)
            current_chunk = [line]
        else:
            current_chunk.append(line)

    # Add the last chunk if it's not empty
    add_chunk(current_chunk)

    return chunks

# NOTE(review): the path assumes code-and-PRs.txt was uploaded to /content — confirm.
file_path = '/content/code-and-PRs.txt'

code_and_pr_chunks = chunk_text_by_file_and_pr(file_path)

#### PRs with file diffs

In [None]:
def chunk_text_by_pr_diff(filepath):
    """Split a scraped PR dump into one chunk per pull request.

    A new chunk starts at every line that begins with ``PR #``. The file is
    read as UTF-8, and chunks made up only of whitespace (for example blank
    lines before the first PR) are dropped rather than emitted as empty
    training examples.

    :param filepath: Path of the text file to split.
    :return: List of chunk strings in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    lines = content.split('\n')

    chunks = []  # This will hold the final chunks of text
    current_chunk = []  # Temporarily holds lines for the current chunk

    def add_chunk(c_chunk):
        joined = '\n'.join(c_chunk)
        # A list such as [''] is truthy, so test the joined text instead.
        if joined.strip():
            chunks.append(joined)

    for line in lines:
        # Check for PR beginning
        if line.startswith('PR #'):
            # Flush whatever was collected so far, then start a new chunk
            add_chunk(current_chunk)
            current_chunk = [line]
        else:
            current_chunk.append(line)

    # Add the last chunk if it's not empty
    add_chunk(current_chunk)

    return chunks

# NOTE(review): the scraping cell writes to 'pr_data_100.txt' in the working
# directory, while this reads '/content/pr_data_250.txt' — confirm this is the
# intended file.
file_path = '/content/pr_data_250.txt'

pr_diff_chunks = chunk_text_by_pr_diff(file_path)

#### Further chunk sequences that exceed 8192 tokens

In [None]:
def chunk_data_points(dataset, max_tokens=8192):
    """
    Chunk data points in the dataset into parts of up to max_tokens size.

    Data points within the limit are kept verbatim. Longer ones are tokenized
    with the module-level ``tokenizer``, cut into consecutive windows of
    ``max_tokens`` tokens, and each window is converted back to text.

    :param dataset: List of strings, each representing a data point.
    :param max_tokens: Maximum number of tokens allowed per chunk (must be >= 1).
    :return: A new list where each data point is <= max_tokens size.
    :raises ValueError: If max_tokens is smaller than 1; the previous loop never
        advanced in that case and ran forever.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    chunked_dataset = []
    for data_point in dataset:
        tokens = tokenizer.tokenize(data_point)
        if len(tokens) <= max_tokens:
            chunked_dataset.append(data_point)
            continue

        print("chunk exceeds token limit, chunking further...")
        # Stride over the token list; the final window may be shorter.
        for start_index in range(0, len(tokens), max_tokens):
            window = tokens[start_index:start_index + max_tokens]
            # Convert tokens back to text. Might need adjustment based on tokenizer specifics.
            chunked_dataset.append(tokenizer.convert_tokens_to_string(window))
    return chunked_dataset

def chunk_data_points_for_pr_diffs(dataset, max_tokens=8192):
    """
    Keep only the PR data points that fit within max_tokens; drop the rest.

    Oversized PRs are skipped rather than split, so the model is never asked to
    predict diffed code without the PR context that precedes it.

    :param dataset: List of strings, each representing a data point.
    :param max_tokens: Maximum number of tokens allowed per data point.
    :return: A new list containing, in order, the data points of <= max_tokens size.
    """
    return [
        data_point
        for data_point in dataset
        if len(tokenizer.tokenize(data_point)) <= max_tokens
    ]

# Wiki and code/PR chunks are split when too long; PR-diff chunks that are too
# long are dropped instead.
chunks_1 = chunk_data_points(wiki_chunks + code_and_pr_chunks)
chunks_2 = chunk_data_points_for_pr_diffs(pr_diff_chunks)
chunks = chunks_1 + chunks_2

# One {"text": ...} record per chunk, the shape the trainer's text field expects.
final_chunks = []

for chunk in chunks:
  # Add in the beginning and end of sentence tokens to each text sequence
  # NOTE(review): if the trainer tokenizes with special tokens enabled, a second
  # BOS token may be prepended to these texts — confirm.
  text = tokenizer.bos_token + chunk + tokenizer.eos_token

  final_chunks.append({"text": text})

chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...
chunk exceeds token limit, chunking further...


Token indices sequence length is longer than the specified maximum sequence length for this model (32918 > 32768). Running this sequence through the model will result in indexing errors


What a sample of our raw text data looks like before being tokenized and packing is applied:

In [None]:
# Display one record; the fixed index raises IndexError with fewer than 501 chunks.
final_chunks[500]

{'text': "<s>- File: Lines.tsx\n  Content:\nimport type { FC } from 'react';\n\ninterface LinesProps {\n\tn: number;\n\tmargin?: string;\n}\n\nexport const Lines: FC<LinesProps> = (props) => {\n\tconst { n, margin } = props;\n\tconst thickness = 1;\n\tconst distance = 4;\n\tconst height = n * distance;\n\treturn (\n\t\t<>\n\t\t\t<hr\n\t\t\t\tcss={{\n\t\t\t\t\tbackgroundImage: `repeating-linear-gradient(to bottom, #dcdcdc, #dcdcdc ${thickness}px, transparent ${thickness}px, transparent ${distance}px)`,\n\t\t\t\t\tbackgroundRepeat: 'repeat',\n\t\t\t\t\tbackgroundPosition: 'top',\n\t\t\t\t\theight: `${height}px`,\n\t\t\t\t\tborder: 0,\n\t\t\t\t\tmargin: margin ? margin : '12px auto 6px',\n\t\t\t\t}}\n\t\t\t/>\n\t\t</>\n\t);\n};\n\n</s>"}

### Labelled conversational data

#### QA data

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import Dataset, load_dataset
import pandas as pd
import json

In [None]:
# Each line of the JSONL file is expected to hold a {"messages": [...]} object
# that is rendered to a single training string with the model's chat template.
chat_data_path = '/content/QA_data.jsonl'
qa_texts = []
with open(chat_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        # JSONL files often end with (or contain) blank lines; json.loads
        # would raise on them, so skip them instead of aborting the load.
        if not line.strip():
            continue
        chat = json.loads(line)
        text = tokenizer.apply_chat_template(
            chat["messages"], tokenize=False, add_generation_prompt=False
        )
        qa_texts.append(text)

#### Code

In [None]:
# [file_name, file_content] pairs parsed from the code dump, and the chat
# formatted training strings built from them.
files = []
code_texts = []

# Read everything up front so the file handle is not held open while tokenizing.
with open("/content/code.txt", 'r', encoding='utf-8') as file:
    content = file.read()

# Splitting the content into sections for each file
sections = content.split("- File: ")[1:]  # Skip the first split before the first file name

for section in sections:
    # Splitting each section into file name and content
    parts = section.split("Content:\n", 1)
    file_name = parts[0].strip()
    file_content = parts[1].strip() if len(parts) > 1 else ""

    # Drop any files larger than n tokens
    if len(tokenizer.tokenize(file_content)) > 8192:
        continue

    files.append([file_name, file_content])

# Turn every kept file into a one-turn "what does this file look like" conversation.
for file_name, file_content in files:
    qa = [{'role': 'user', 'content': f"What does the {file_name} file look like in `your-repo`"}, {'role': 'assistant', 'content': f"As of DATE, {file_name} looks like this: \n {file_content}"}]

    text = tokenizer.apply_chat_template(qa, tokenize=False, add_generation_prompt=False)
    code_texts.append(text)

#### Wiki

In [None]:
wiki_texts = []

def read_and_chunk_file(file_path, chunk_marker='§'):
    """Turn a marker-delimited wiki file into chat-formatted QA training texts.

    Every non-blank section after the first marker is treated as a question
    (its first line) and an answer (the remaining lines, possibly empty). Each
    pair is rendered with the tokenizer's chat template and appended to the
    module-level ``wiki_texts`` list; nothing is returned.

    Note that this redefinition replaces the earlier ``read_and_chunk_file``
    from the raw-text section for every cell run after this one.

    :param file_path: Path of the UTF-8 text file to read.
    :param chunk_marker: String that introduces every section.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_sections = handle.read().split(chunk_marker)[1:]

    for raw in raw_sections:
        if not raw.strip():
            continue

        # First line is the question; everything after the first newline (if
        # there is one) is the answer.
        heading, _, body = raw.partition('\n')
        conversation = [
            {'role': 'user', 'content': heading.strip()},
            {'role': 'assistant', 'content': body.strip()},
        ]

        rendered = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        wiki_texts.append(rendered)

# Fills the module-level wiki_texts list as a side effect; the call returns nothing.
file_path = '/content/wiki-QA.txt'
read_and_chunk_file(file_path)

In [None]:
import random

# Fixed seed so the shuffle and the train/eval partition are identical on every
# run; 3407 is the seed already used for the LoRA init and TrainingArguments.
SPLIT_SEED = 3407

texts = qa_texts + code_texts + wiki_texts
random.Random(SPLIT_SEED).shuffle(texts)

# Convert the preprocessed texts into a pandas DataFrame
pandas_dataset = pd.DataFrame(texts, columns=["text"])

# Convert the pandas DataFrame into a HuggingFace Dataset
dataset = Dataset.from_pandas(pandas_dataset)

from datasets import DatasetDict

# 80/20 split of the conversational data; only this data is used for evaluation.
# NOTE: the variable `datasets` shares its name with the `datasets` package.
datasets = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset_qa = datasets['train']
eval_dataset = datasets['test']

**Note:** Once you have found the optimal hyperparameters for your model, it can be beneficial to re-run the training using ALL of the available data, including the 20% of the dataset you would normally hold out for evaluation. This lets the final model take advantage of every example we have.

In [None]:
# Raw-text records plus the training split of the conversational data.
# The result is a plain Python list of {"text": ...} dicts, not a Dataset object.
train_dataset = final_chunks + train_dataset_qa.to_list()

# To include ALL data:
# train_dataset = final_chunks + dataset.to_list()

In [None]:
# Number of training records (raw-text chunks + conversational examples).
len(train_dataset)

1837

In [None]:
# Display one chat-formatted training example.
train_dataset_qa[3]

{'text': "<s>[INST] Where can the end-to-end tests be found in The Guardian's manage-frontend? [/INST]The end-to-end tests live in `cypress/tests/e2e`.</s>"}

## 4. Set up LoRA and training environment

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
# Re-running this cell wraps the already-wrapped `model` again, so reload the
# base model first if you need to change these settings.
model = FastLanguageModel.get_peft_model(
    model,
    # NOTE: 768 is far above the range suggested below.
    r = 768, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 1024,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
)

Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Names used for the local output directory, the Google Drive folder and the run name.
project = "Mistral-Instruct-16bit-8K-Sweep1"
run_name = "run_1_SS" # Defining a separate run name in case you want to start another one resuming from a checkpoint
project_and_run_name = project + "-" + run_name
# Checkpoints are written here, relative to the working directory.
output_dir = "./" + project_and_run_name

### Callback for uploading checkpoints to Google Drive

**Note:** Checkpoints, especially those with a rank as high as 768, take up a lot of storage (around 10GB each). There's [a recent offer](https://blog.google/products/google-one/google-one-gemini-ai-gmail-docs-sheets/) from Google for their Google One AI Premium subscription where you get two months free of 2TB storage in Google Drive, amongst other things like Gemini Ultra access.

In [None]:
from google.colab import drive
# Mount Google Drive so checkpoints can be copied to /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
from pathlib import Path
from transformers import TrainerCallback

class UploadCheckpointCallback(TrainerCallback):
    """Trainer callback that moves each newly saved checkpoint to Google Drive.

    After every save, the most recently modified directory under ``output_dir``
    whose name contains ``checkpoint`` is copied into ``google_drive_dir``
    (replacing any directory of the same name there) and then deleted locally
    so the local disk does not fill up.
    """

    def __init__(self, output_dir, google_drive_dir):
        """Remember the local checkpoint directory and the Drive destination."""
        super().__init__()
        self.output_dir = output_dir
        self.google_drive_dir = google_drive_dir

    def on_save(self, args, state, control, **kwargs):
        """Copy the newest local checkpoint to Drive, then remove the local copy."""
        checkpoint_paths = [
            os.path.join(self.output_dir, entry)
            for entry in os.listdir(self.output_dir)
            if os.path.isdir(os.path.join(self.output_dir, entry)) and 'checkpoint' in entry
        ]
        if not checkpoint_paths:
            return

        # Newest by modification time, i.e. the checkpoint that was just written.
        latest_checkpoint_dir = max(checkpoint_paths, key=os.path.getmtime)
        destination_path = os.path.join(self.google_drive_dir, os.path.basename(latest_checkpoint_dir))

        # copytree refuses to write into an existing directory, so clear it first.
        if os.path.exists(destination_path):
            shutil.rmtree(destination_path)
        shutil.copytree(latest_checkpoint_dir, destination_path)
        print(f'Uploaded {latest_checkpoint_dir} to Google Drive: {destination_path}')

        # Free local disk space now that the checkpoint lives on Drive.
        shutil.rmtree(latest_checkpoint_dir)
        print(f'Deleted local checkpoint directory: {latest_checkpoint_dir}')

# Drive folder for this project/run; each checkpoint is copied into a sub-folder of it.
google_drive_dir = '/content/drive/My Drive/your_drive_folder/' + project + "/" +run_name + "/"

upload_checkpoint_callback = UploadCheckpointCallback(output_dir, google_drive_dir)

#### Copy a checkpoint from Google Drive if you are resuming from a checkpoint

In [None]:
# Only run this when resuming: copies a saved checkpoint from Drive to
# /content/latest_checkpoint/. The source path is hard-coded to checkpoint-1120.
!cp -r "/content/drive/My Drive/your_drive_folder/Mistral-Instruct-16bit-8K-Sweep1/run_1_SS/checkpoint-1120/" "/content/latest_checkpoint/"

## 5. Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from datetime import datetime

# Same string as project_and_run_name built earlier; used as the run-name prefix.
wandbname = project + "-" + run_name

trainer = SFTTrainer(
    model = model,
    # Moves every saved checkpoint to Google Drive and deletes the local copy.
    callbacks=[upload_checkpoint_callback],
    train_dataset = train_dataset,
    # Evaluation uses only the held-out split of the conversational data.
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    max_seq_length = max_seq_length,
    packing = True,
    dataset_text_field="text",
    args = TrainingArguments(
        per_device_eval_batch_size = 1,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_ratio = 0,
        max_grad_norm = 1.0,
        num_train_epochs = 5,
        learning_rate = 2e-5,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        evaluation_strategy="epoch",
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "cosine",
        # One checkpoint per epoch; each save triggers the upload callback.
        save_strategy="epoch",
        seed = 3407,
        output_dir = output_dir,
        # report_to="wandb",
        # A timestamp suffix keeps the names of repeated runs distinct.
        run_name=f"{wandbname}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
)

trainer.train()

# If resuming from a checkpoint (path must match the directory the checkpoint
# was copied to in the "resuming from a checkpoint" cell):
# trainer.train(resume_from_checkpoint="/content/latest_checkpoint/")

## 6. Try the Trained Model!


### With chat template

The `apply_chat_template` function inserts the special `[INST]` tokens into your prompt.

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebinds the global `tokenizer` to one using the "mistral" chat template with
# ShareGPT-style keys ("from"/"value") instead of "role"/"content".
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "mistral", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Messages use the ShareGPT keys configured in the mapping above.
messages = [
    {"from": "human", "value": """Explain the theory of relativity to me in detail"""},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Only input_ids are passed, so generate() warns that no attention mask or
# pad token id was set (see the cell output).
outputs = model.generate(input_ids = inputs, max_new_tokens = 8192, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["<s> [INST] Explain the theory of relativity to me in detail [/INST] The Theory of Relativity is a fundamental concept in physics that was developed by Albert Einstein between 1905 and 1915. It consists of two parts: the Special Theory of Relativity (STR) and the General Theory of Relativity (GTR).\n\n1. Special Theory of Relativity (STR, 1905):\n\nSTR is based on two postulates:\n\na) The laws of physics are the same in all inertial frames of reference. This means that there is no preferred inertial frame of reference in the universe.\n\nb) The speed of light in a vacuum is the same for all observers, regardless of their motion or the motion of the source of light. This speed is approximately 299,792,458 meters per second (m/s) or 186,282 miles per second (mi/s).\n\nFrom these postulates, several consequences follow:\n\n- Time dilation: Moving clocks run slower than stationary clocks. This means that a clock moving relative to an observer will appear to tick more slowly than a clock 

### Without chat template

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
# Streamer that yields decoded text pieces while generation runs in a worker thread.
text_streamer = TextIteratorStreamer(tokenizer)
import textwrap
max_print_width = 100

inputs = tokenizer(
[
    """Explain the theory of relativity to me"""
]*1, return_tensors = "pt").to("cuda")

generation_kwargs = dict(
    inputs,
    streamer = text_streamer,
    max_new_tokens = 8192,
    use_cache = True,
)
# generate() blocks until finished, so it runs in a thread while this cell
# consumes the streamer and prints.
thread = Thread(target = model.generate, kwargs = generation_kwargs)
thread.start()

# Soft-wrap the streamed text at roughly max_print_width characters per line.
length = 0
for j, new_text in enumerate(text_streamer):
    if j == 0:
        wrapped_text = textwrap.wrap(new_text, width = max_print_width)
        # textwrap.wrap returns [] for empty/whitespace-only text; indexing
        # [-1] would raise IndexError in that case.
        length = len(wrapped_text[-1]) if wrapped_text else 0
        wrapped_text = "\n".join(wrapped_text)
        print(wrapped_text, end = "")
    else:
        length += len(new_text)
        if length >= max_print_width:
            length = 0
            print()
        print(new_text, end = "")

# The streamer is exhausted once generation ends; join so the worker thread is
# not left dangling when the cell returns.
thread.join()

## 7. Save the model and upload to Google Drive

In [None]:
# Merge the LoRA weights into the model and save it, with the tokenizer, in
# 16-bit under ./your-model.
model.save_pretrained_merged("your-model", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 55.8 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 38.34it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [None]:
# Copy the merged model directory to Google Drive (requires the Drive mount from earlier).
!cp -r "/content/your-model/" "/content/drive/My Drive/your-model/"